1 /* regcomp.c 2 */ 3 4 /* 5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee 6 * 7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"] 8 */ 9 10 /* This file contains functions for compiling a regular expression. See 11 * also regexec.c which funnily enough, contains functions for executing 12 * a regular expression. 13 * 14 * This file is also copied at build time to ext/re/re_comp.c, where 15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT. 16 * This causes the main functions to be compiled under new names and with 17 * debugging support added, which makes "use re 'debug'" work. 18 */ 19 20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not be 21 * confused with the original package (see point 3 below). Thanks, Henry! 22 */ 23 24 /* Additional note: this code is very heavily munged from Henry's version 25 * in places. In some spots I've traded clarity for efficiency, so don't 26 * blame Henry for some of the lack of readability. 27 */ 28 29 /* The names of the functions have been changed from regcomp and 30 * regexec to pregcomp and pregexec in order to avoid conflicts 31 * with the POSIX routines of the same names. 32 */ 33 34 /* 35 * pregcomp and pregexec -- regsub and regerror are not used in perl 36 * 37 * Copyright (c) 1986 by University of Toronto. 38 * Written by Henry Spencer. Not derived from licensed software. 39 * 40 * Permission is granted to anyone to use this software for any 41 * purpose on any computer system, and to redistribute it freely, 42 * subject to the following restrictions: 43 * 44 * 1. The author is not responsible for the consequences of use of 45 * this software, no matter how awful, even if they arise 46 * from defects in it. 47 * 48 * 2. The origin of this software must not be misrepresented, either 49 * by explicit claim or by omission. 50 * 51 * 3. Altered versions must be plainly marked as such, and must not 52 * be misrepresented as being the original software. 53 * 54 * 55 **** Alterations to Henry's code are... 56 **** 57 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 58 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 59 **** by Larry Wall and others 60 **** 61 **** You may distribute under the terms of either the GNU General Public 62 **** License or the Artistic License, as specified in the README file. 63 64 * 65 * Beware that some of this code is subtly aware of the way operator 66 * precedence is structured in regular expressions. Serious changes in 67 * regular-expression syntax might require a total rethink. 68 */ 69 70 /* Note on debug output: 71 * 72 * This is set up so that -Dr turns on debugging like all other flags that are 73 * enabled by -DDEBUGGING. -Drv gives more verbose output. This applies to 74 * all regular expressions encountered in a program, and gives a huge amount of 75 * output for all but the shortest programs. 76 * 77 * The ability to output pattern debugging information lexically, and with much 78 * finer grained control was added, with 'use re qw(Debug ....);' available even 79 * in non-DEBUGGING builds. This is accomplished by copying the contents of 80 * regcomp.c to ext/re/re_comp.c, and regexec.c is copied to ext/re/re_exec.c. 81 * Those files are compiled and linked into the perl executable, and they are 82 * compiled essentially as if DEBUGGING were enabled, and controlled by calls 83 * to re.pm. 84 * 85 * That would normally mean linking errors when two functions of the same name 86 * are attempted to be placed into the same executable. That is solved in one 87 * of four ways: 88 * 1) Static functions aren't known outside the file they are in, so for the 89 * many functions of that type in this file, it just isn't a problem. 90 * 2) Most externally known functions are enclosed in 91 * #ifndef PERL_IN_XSUB_RE 92 * ... 93 * #endif 94 * blocks, so there is only one definition for them in the whole 95 * executable, the one in regcomp.c (or regexec.c). The implication of 96 * that is any debugging info that comes from them is controlled only by 97 * -Dr. Further, any static function they call will also be the version 98 * in regcomp.c (or regexec.c), so its debugging will also be by -Dr. 99 * 3) About a dozen external functions are re-#defined in ext/re/re_top.h, to 100 * have different names, so that what gets loaded in the executable is 101 * 'Perl_foo' from regcomp.c (and regexec.c), and the identical function 102 * from re_comp.c (and re_exec.c), but with the name 'my_foo' Debugging 103 * in the 'Perl_foo' versions is controlled by -Dr, but the 'my_foo' 104 * versions and their callees are under control of re.pm. The catch is 105 * that references to all these go through the regexp_engine structure, 106 * which is initialized in regcomp.h to the Perl_foo versions, and 107 * substituted out in lexical scopes where 'use re' is in effect to the 108 * 'my_foo' ones. That structure is public API, so it would be a hard 109 * sell to add any additional members. 110 * 4) For functions in regcomp.c and re_comp.c that are called only from, 111 * respectively, regexec.c and re_exec.c, they can have two different 112 * names, depending on #ifdef'ing PERL_IN_XSUB_RE, in both regexec.c and 113 * embed.fnc. 114 * 115 * The bottom line is that if you add code to one of the public functions 116 * listed in ext/re/re_top.h, debugging automagically works. But if you write 117 * a new function that needs to do debugging or there is a chain of calls from 118 * it that need to do debugging, all functions in the chain should use options 119 * 2) or 4) above. 120 * 121 * A function may have to be split so that debugging stuff is static, but it 122 * calls out to some other function that only gets compiled in regcomp.c to 123 * access data that we don't want to duplicate. 124 */ 125 126 #ifdef PERL_EXT_RE_BUILD 127 #include "re_top.h" 128 #endif 129 130 #include "EXTERN.h" 131 #define PERL_IN_REGEX_ENGINE 132 #define PERL_IN_REGCOMP_ANY 133 #define PERL_IN_REGCOMP_C 134 #include "perl.h" 135 136 #ifdef PERL_IN_XSUB_RE 137 # include "re_comp.h" 138 EXTERN_C const struct regexp_engine my_reg_engine; 139 EXTERN_C const struct regexp_engine wild_reg_engine; 140 #else 141 # include "regcomp.h" 142 #endif 143 144 #include "invlist_inline.h" 145 #include "unicode_constants.h" 146 #include "regcomp_internal.h" 147 148 /* ========================================================= 149 * BEGIN edit_distance stuff. 150 * 151 * This calculates how many single character changes of any type are needed to 152 * transform a string into another one. It is taken from version 3.1 of 153 * 154 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS 155 */ 156 157 /* Our unsorted dictionary linked list. */ 158 /* Note we use UVs, not chars. */ 159 160 struct dictionary{ 161 UV key; 162 UV value; 163 struct dictionary* next; 164 }; 165 typedef struct dictionary item; 166 167 168 PERL_STATIC_INLINE item* 169 push(UV key, item* curr) 170 { 171 item* head; 172 Newx(head, 1, item); 173 head->key = key; 174 head->value = 0; 175 head->next = curr; 176 return head; 177 } 178 179 180 PERL_STATIC_INLINE item* 181 find(item* head, UV key) 182 { 183 item* iterator = head; 184 while (iterator){ 185 if (iterator->key == key){ 186 return iterator; 187 } 188 iterator = iterator->next; 189 } 190 191 return NULL; 192 } 193 194 PERL_STATIC_INLINE item* 195 uniquePush(item* head, UV key) 196 { 197 item* iterator = head; 198 199 while (iterator){ 200 if (iterator->key == key) { 201 return head; 202 } 203 iterator = iterator->next; 204 } 205 206 return push(key, head); 207 } 208 209 PERL_STATIC_INLINE void 210 dict_free(item* head) 211 { 212 item* iterator = head; 213 214 while (iterator) { 215 item* temp = iterator; 216 iterator = iterator->next; 217 Safefree(temp); 218 } 219 220 head = NULL; 221 } 222 223 /* End of Dictionary Stuff */ 224 225 /* All calculations/work are done here */ 226 STATIC int 227 S_edit_distance(const UV* src, 228 const UV* tgt, 229 const STRLEN x, /* length of src[] */ 230 const STRLEN y, /* length of tgt[] */ 231 const SSize_t maxDistance 232 ) 233 { 234 item *head = NULL; 235 UV swapCount, swapScore, targetCharCount, i, j; 236 UV *scores; 237 UV score_ceil = x + y; 238 239 PERL_ARGS_ASSERT_EDIT_DISTANCE; 240 241 /* initialize matrix start values */ 242 Newx(scores, ( (x + 2) * (y + 2)), UV); 243 scores[0] = score_ceil; 244 scores[1 * (y + 2) + 0] = score_ceil; 245 scores[0 * (y + 2) + 1] = score_ceil; 246 scores[1 * (y + 2) + 1] = 0; 247 head = uniquePush(uniquePush(head, src[0]), tgt[0]); 248 249 /* work loops */ 250 /* i = src index */ 251 /* j = tgt index */ 252 for (i=1;i<=x;i++) { 253 if (i < x) 254 head = uniquePush(head, src[i]); 255 scores[(i+1) * (y + 2) + 1] = i; 256 scores[(i+1) * (y + 2) + 0] = score_ceil; 257 swapCount = 0; 258 259 for (j=1;j<=y;j++) { 260 if (i == 1) { 261 if(j < y) 262 head = uniquePush(head, tgt[j]); 263 scores[1 * (y + 2) + (j + 1)] = j; 264 scores[0 * (y + 2) + (j + 1)] = score_ceil; 265 } 266 267 targetCharCount = find(head, tgt[j-1])->value; 268 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount; 269 270 if (src[i-1] != tgt[j-1]){ 271 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1)); 272 } 273 else { 274 swapCount = j; 275 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore); 276 } 277 } 278 279 find(head, src[i-1])->value = i; 280 } 281 282 { 283 IV score = scores[(x+1) * (y + 2) + (y + 1)]; 284 dict_free(head); 285 Safefree(scores); 286 return (maxDistance != 0 && maxDistance < score)?(-1):score; 287 } 288 } 289 290 /* END of edit_distance() stuff 291 * ========================================================= */ 292 293 #ifdef PERL_RE_BUILD_AUX 294 /* add a data member to the struct reg_data attached to this regex, it should 295 * always return a non-zero return. the 's' argument is the type of the items 296 * being added and the n is the number of items. The length of 's' should match 297 * the number of items. */ 298 U32 299 Perl_reg_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n) 300 { 301 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 1; 302 303 PERL_ARGS_ASSERT_REG_ADD_DATA; 304 305 /* in the below expression we have (count + n - 1), the minus one is there 306 * because the struct that we allocate already contains a slot for 1 data 307 * item, so we do not need to allocate it the first time. IOW, the 308 * sizeof(*RExC_rxi->data) already accounts for one of the elements we need 309 * to allocate. See struct reg_data in regcomp.h 310 */ 311 Renewc(RExC_rxi->data, 312 sizeof(*RExC_rxi->data) + (sizeof(void*) * (count + n - 1)), 313 char, struct reg_data); 314 /* however in the data->what expression we use (count + n) and do not 315 * subtract one from the result because the data structure contains a 316 * pointer to an array, and does not allocate the first element as part of 317 * the data struct. */ 318 if (count > 1) 319 Renew(RExC_rxi->data->what, (count + n), U8); 320 else { 321 /* when count == 1 it means we have not initialized anything. 322 * we always fill the 0 slot of the data array with a '%' entry, which 323 * means "zero" (all the other types are letters) which exists purely 324 * so the return from reg_add_data is ALWAYS true, so we can tell it apart 325 * from a "no value" idx=0 in places where we would return an index 326 * into reg_add_data. This is particularly important with the new "single 327 * pass, usually, but not always" strategy that we use, where the code 328 * will use a 0 to represent "not able to compute this yet". 329 */ 330 Newx(RExC_rxi->data->what, n+1, U8); 331 /* fill in the placeholder slot of 0 with a what of '%', we use 332 * this because it sorta looks like a zero (0/0) and it is not a letter 333 * like any of the other "whats", this type should never be created 334 * any other way but here. '%' happens to also not appear in this 335 * file for any other reason (at the time of writing this comment)*/ 336 RExC_rxi->data->what[0]= '%'; 337 RExC_rxi->data->data[0]= NULL; 338 } 339 RExC_rxi->data->count = count + n; 340 Copy(s, RExC_rxi->data->what + count, n, U8); 341 assert(count>0); 342 return count; 343 } 344 #endif /* PERL_RE_BUILD_AUX */ 345 346 /*XXX: todo make this not included in a non debugging perl, but appears to be 347 * used anyway there, in 'use re' */ 348 #ifndef PERL_IN_XSUB_RE 349 void 350 Perl_reginitcolors(pTHX) 351 { 352 const char * const s = PerlEnv_getenv("PERL_RE_COLORS"); 353 if (s) { 354 char *t = savepv(s); 355 int i = 0; 356 PL_colors[0] = t; 357 while (++i < 6) { 358 t = strchr(t, '\t'); 359 if (t) { 360 *t = '\0'; 361 PL_colors[i] = ++t; 362 } 363 else 364 PL_colors[i] = t = (char *)""; 365 } 366 } else { 367 int i = 0; 368 while (i < 6) 369 PL_colors[i++] = (char *)""; 370 } 371 PL_colorset = 1; 372 } 373 #endif 374 375 376 #ifdef TRIE_STUDY_OPT 377 /* search for "restudy" in this file for a detailed explanation */ 378 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \ 379 STMT_START { \ 380 if ( \ 381 (data.flags & SCF_TRIE_RESTUDY) \ 382 && ! restudied++ \ 383 ) { \ 384 dOsomething; \ 385 goto reStudy; \ 386 } \ 387 } STMT_END 388 #else 389 #define CHECK_RESTUDY_GOTO_butfirst 390 #endif 391 392 #ifndef PERL_IN_XSUB_RE 393 394 /* return the currently in-scope regex engine (or the default if none) */ 395 regexp_engine const * 396 Perl_current_re_engine(pTHX) 397 { 398 if (IN_PERL_COMPILETIME) { 399 HV * const table = GvHV(PL_hintgv); 400 SV **ptr; 401 402 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) 403 return &PL_core_reg_engine; 404 ptr = hv_fetchs(table, "regcomp", FALSE); 405 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr))) 406 return &PL_core_reg_engine; 407 return INT2PTR(regexp_engine*, SvIV(*ptr)); 408 } 409 else { 410 SV *ptr; 411 if (!PL_curcop->cop_hints_hash) 412 return &PL_core_reg_engine; 413 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0); 414 if ( !(ptr && SvIOK(ptr) && SvIV(ptr))) 415 return &PL_core_reg_engine; 416 return INT2PTR(regexp_engine*, SvIV(ptr)); 417 } 418 } 419 420 421 /* 422 * pregcomp - compile a regular expression into internal code 423 * 424 * Decides which engine's compiler to call based on the hint currently in 425 * scope 426 */ 427 428 REGEXP * 429 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags) 430 { 431 regexp_engine const *eng = current_re_engine(); 432 DECLARE_AND_GET_RE_DEBUG_FLAGS; 433 434 PERL_ARGS_ASSERT_PREGCOMP; 435 436 /* Dispatch a request to compile a regexp to correct regexp engine. */ 437 DEBUG_COMPILE_r({ 438 Perl_re_printf( aTHX_ "Using engine %" UVxf "\n", 439 PTR2UV(eng)); 440 }); 441 return CALLREGCOMP_ENG(eng, pattern, flags); 442 } 443 #endif 444 445 /* 446 =for apidoc re_compile 447 448 Compile the regular expression pattern C<pattern>, returning a pointer to the 449 compiled object for later matching with the internal regex engine. 450 451 This function is typically used by a custom regexp engine C<.comp()> function 452 to hand off to the core regexp engine those patterns it doesn't want to handle 453 itself (typically passing through the same flags it was called with). In 454 almost all other cases, a regexp should be compiled by calling L</C<pregcomp>> 455 to compile using the currently active regexp engine. 456 457 If C<pattern> is already a C<REGEXP>, this function does nothing but return a 458 pointer to the input. Otherwise the PV is extracted and treated like a string 459 representing a pattern. See L<perlre>. 460 461 The possible flags for C<rx_flags> are documented in L<perlreapi>. Their names 462 all begin with C<RXf_>. 463 464 =cut 465 466 * public entry point for the perl core's own regex compiling code. 467 * It's actually a wrapper for Perl_re_op_compile that only takes an SV 468 * pattern rather than a list of OPs, and uses the internal engine rather 469 * than the current one */ 470 471 REGEXP * 472 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags) 473 { 474 SV *pat = pattern; /* defeat constness! */ 475 476 PERL_ARGS_ASSERT_RE_COMPILE; 477 478 return Perl_re_op_compile(aTHX_ &pat, 1, NULL, 479 #ifdef PERL_IN_XSUB_RE 480 &my_reg_engine, 481 #else 482 &PL_core_reg_engine, 483 #endif 484 NULL, NULL, rx_flags, 0); 485 } 486 487 static void 488 S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs) 489 { 490 int n; 491 492 if (--cbs->refcnt > 0) 493 return; 494 for (n = 0; n < cbs->count; n++) { 495 REGEXP *rx = cbs->cb[n].src_regex; 496 if (rx) { 497 cbs->cb[n].src_regex = NULL; 498 SvREFCNT_dec_NN(rx); 499 } 500 } 501 Safefree(cbs->cb); 502 Safefree(cbs); 503 } 504 505 506 static struct reg_code_blocks * 507 S_alloc_code_blocks(pTHX_ int ncode) 508 { 509 struct reg_code_blocks *cbs; 510 Newx(cbs, 1, struct reg_code_blocks); 511 cbs->count = ncode; 512 cbs->refcnt = 1; 513 SAVEDESTRUCTOR_X(S_free_codeblocks, cbs); 514 if (ncode) 515 Newx(cbs->cb, ncode, struct reg_code_block); 516 else 517 cbs->cb = NULL; 518 return cbs; 519 } 520 521 522 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code 523 * blocks, recalculate the indices. Update pat_p and plen_p in-place to 524 * point to the realloced string and length. 525 * 526 * This is essentially a copy of Perl_bytes_to_utf8() with the code index 527 * stuff added */ 528 529 static void 530 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state, 531 char **pat_p, STRLEN *plen_p, int num_code_blocks) 532 { 533 U8 *const src = (U8*)*pat_p; 534 U8 *dst, *d; 535 int n=0; 536 STRLEN s = 0; 537 bool do_end = 0; 538 DECLARE_AND_GET_RE_DEBUG_FLAGS; 539 540 DEBUG_PARSE_r(Perl_re_printf( aTHX_ 541 "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); 542 543 /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */ 544 Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8); 545 d = dst; 546 547 while (s < *plen_p) { 548 append_utf8_from_native_byte(src[s], &d); 549 550 if (n < num_code_blocks) { 551 assert(pRExC_state->code_blocks); 552 if (!do_end && pRExC_state->code_blocks->cb[n].start == s) { 553 pRExC_state->code_blocks->cb[n].start = d - dst - 1; 554 assert(*(d - 1) == '('); 555 do_end = 1; 556 } 557 else if (do_end && pRExC_state->code_blocks->cb[n].end == s) { 558 pRExC_state->code_blocks->cb[n].end = d - dst - 1; 559 assert(*(d - 1) == ')'); 560 do_end = 0; 561 n++; 562 } 563 } 564 s++; 565 } 566 *d = '\0'; 567 *plen_p = d - dst; 568 *pat_p = (char*) dst; 569 SAVEFREEPV(*pat_p); 570 RExC_orig_utf8 = RExC_utf8 = 1; 571 } 572 573 574 575 /* S_concat_pat(): concatenate a list of args to the pattern string pat, 576 * while recording any code block indices, and handling overloading, 577 * nested qr// objects etc. If pat is null, it will allocate a new 578 * string, or just return the first arg, if there's only one. 579 * 580 * Returns the malloced/updated pat. 581 * patternp and pat_count is the array of SVs to be concatted; 582 * oplist is the optional list of ops that generated the SVs; 583 * recompile_p is a pointer to a boolean that will be set if 584 * the regex will need to be recompiled. 585 * delim, if non-null is an SV that will be inserted between each element 586 */ 587 588 static SV* 589 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, 590 SV *pat, SV ** const patternp, int pat_count, 591 OP *oplist, bool *recompile_p, SV *delim) 592 { 593 SV **svp; 594 int n = 0; 595 bool use_delim = FALSE; 596 bool alloced = FALSE; 597 598 /* if we know we have at least two args, create an empty string, 599 * then concatenate args to that. For no args, return an empty string */ 600 if (!pat && pat_count != 1) { 601 pat = newSVpvs(""); 602 SAVEFREESV(pat); 603 alloced = TRUE; 604 } 605 606 for (svp = patternp; svp < patternp + pat_count; svp++) { 607 SV *sv; 608 SV *rx = NULL; 609 STRLEN orig_patlen = 0; 610 bool code = 0; 611 SV *msv = use_delim ? delim : *svp; 612 if (!msv) msv = &PL_sv_undef; 613 614 /* if we've got a delimiter, we go round the loop twice for each 615 * svp slot (except the last), using the delimiter the second 616 * time round */ 617 if (use_delim) { 618 svp--; 619 use_delim = FALSE; 620 } 621 else if (delim) 622 use_delim = TRUE; 623 624 if (SvTYPE(msv) == SVt_PVAV) { 625 /* we've encountered an interpolated array within 626 * the pattern, e.g. /...@a..../. Expand the list of elements, 627 * then recursively append elements. 628 * The code in this block is based on S_pushav() */ 629 630 AV *const av = (AV*)msv; 631 const SSize_t maxarg = AvFILL(av) + 1; 632 SV **array; 633 634 if (oplist) { 635 assert(oplist->op_type == OP_PADAV 636 || oplist->op_type == OP_RV2AV); 637 oplist = OpSIBLING(oplist); 638 } 639 640 if (SvRMAGICAL(av)) { 641 SSize_t i; 642 643 Newx(array, maxarg, SV*); 644 SAVEFREEPV(array); 645 for (i=0; i < maxarg; i++) { 646 SV ** const svp = av_fetch(av, i, FALSE); 647 array[i] = svp ? *svp : &PL_sv_undef; 648 } 649 } 650 else 651 array = AvARRAY(av); 652 653 if (maxarg > 0) { 654 pat = S_concat_pat(aTHX_ pRExC_state, pat, 655 array, maxarg, NULL, recompile_p, 656 /* $" */ 657 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV)))); 658 } 659 else if (!pat) { 660 pat = newSVpvs_flags("", SVs_TEMP); 661 } 662 663 continue; 664 } 665 666 667 /* we make the assumption here that each op in the list of 668 * op_siblings maps to one SV pushed onto the stack, 669 * except for code blocks, with have both an OP_NULL and 670 * an OP_CONST. 671 * This allows us to match up the list of SVs against the 672 * list of OPs to find the next code block. 673 * 674 * Note that PUSHMARK PADSV PADSV .. 675 * is optimised to 676 * PADRANGE PADSV PADSV .. 677 * so the alignment still works. */ 678 679 if (oplist) { 680 if (oplist->op_type == OP_NULL 681 && (oplist->op_flags & OPf_SPECIAL)) 682 { 683 assert(n < pRExC_state->code_blocks->count); 684 pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0; 685 pRExC_state->code_blocks->cb[n].block = oplist; 686 pRExC_state->code_blocks->cb[n].src_regex = NULL; 687 n++; 688 code = 1; 689 oplist = OpSIBLING(oplist); /* skip CONST */ 690 assert(oplist); 691 } 692 oplist = OpSIBLING(oplist); 693 } 694 695 /* apply magic and QR overloading to arg */ 696 697 SvGETMAGIC(msv); 698 if (SvROK(msv) && SvAMAGIC(msv)) { 699 SV *sv = AMG_CALLunary(msv, regexp_amg); 700 if (sv) { 701 if (SvROK(sv)) 702 sv = SvRV(sv); 703 if (SvTYPE(sv) != SVt_REGEXP) 704 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP"); 705 msv = sv; 706 } 707 } 708 709 /* try concatenation overload ... */ 710 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) && 711 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign))) 712 { 713 sv_setsv(pat, sv); 714 /* overloading involved: all bets are off over literal 715 * code. Pretend we haven't seen it */ 716 if (n) 717 pRExC_state->code_blocks->count -= n; 718 n = 0; 719 } 720 else { 721 /* ... or failing that, try "" overload */ 722 while (SvAMAGIC(msv) 723 && (sv = AMG_CALLunary(msv, string_amg)) 724 && sv != msv 725 && !( SvROK(msv) 726 && SvROK(sv) 727 && SvRV(msv) == SvRV(sv)) 728 ) { 729 msv = sv; 730 SvGETMAGIC(msv); 731 } 732 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP) 733 msv = SvRV(msv); 734 735 if (pat) { 736 /* this is a partially unrolled 737 * sv_catsv_nomg(pat, msv); 738 * that allows us to adjust code block indices if 739 * needed */ 740 STRLEN dlen; 741 char *dst = SvPV_force_nomg(pat, dlen); 742 orig_patlen = dlen; 743 if (SvUTF8(msv) && !SvUTF8(pat)) { 744 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n); 745 sv_setpvn(pat, dst, dlen); 746 SvUTF8_on(pat); 747 } 748 sv_catsv_nomg(pat, msv); 749 rx = msv; 750 } 751 else { 752 /* We have only one SV to process, but we need to verify 753 * it is properly null terminated or we will fail asserts 754 * later. In theory we probably shouldn't get such SV's, 755 * but if we do we should handle it gracefully. */ 756 if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) || SvIsCOW_shared_hash(msv) ) { 757 /* not a string, or a string with a trailing null */ 758 pat = msv; 759 } else { 760 /* a string with no trailing null, we need to copy it 761 * so it has a trailing null */ 762 pat = sv_2mortal(newSVsv(msv)); 763 } 764 } 765 766 if (code) 767 pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1; 768 } 769 770 /* extract any code blocks within any embedded qr//'s */ 771 if (rx && SvTYPE(rx) == SVt_REGEXP 772 && RX_ENGINE((REGEXP*)rx)->op_comp) 773 { 774 775 RXi_GET_DECL(ReANY((REGEXP *)rx), ri); 776 if (ri->code_blocks && ri->code_blocks->count) { 777 int i; 778 /* the presence of an embedded qr// with code means 779 * we should always recompile: the text of the 780 * qr// may not have changed, but it may be a 781 * different closure than last time */ 782 *recompile_p = 1; 783 if (pRExC_state->code_blocks) { 784 int new_count = pRExC_state->code_blocks->count 785 + ri->code_blocks->count; 786 Renew(pRExC_state->code_blocks->cb, 787 new_count, struct reg_code_block); 788 pRExC_state->code_blocks->count = new_count; 789 } 790 else 791 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ 792 ri->code_blocks->count); 793 794 for (i=0; i < ri->code_blocks->count; i++) { 795 struct reg_code_block *src, *dst; 796 STRLEN offset = orig_patlen 797 + ReANY((REGEXP *)rx)->pre_prefix; 798 assert(n < pRExC_state->code_blocks->count); 799 src = &ri->code_blocks->cb[i]; 800 dst = &pRExC_state->code_blocks->cb[n]; 801 dst->start = src->start + offset; 802 dst->end = src->end + offset; 803 dst->block = src->block; 804 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*) 805 src->src_regex 806 ? src->src_regex 807 : (REGEXP*)rx); 808 n++; 809 } 810 } 811 } 812 } 813 /* avoid calling magic multiple times on a single element e.g. =~ $qr */ 814 if (alloced) 815 SvSETMAGIC(pat); 816 817 return pat; 818 } 819 820 821 822 /* see if there are any run-time code blocks in the pattern. 823 * False positives are allowed */ 824 825 static bool 826 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, 827 char *pat, STRLEN plen) 828 { 829 int n = 0; 830 STRLEN s; 831 832 PERL_UNUSED_CONTEXT; 833 834 for (s = 0; s < plen; s++) { 835 if ( pRExC_state->code_blocks 836 && n < pRExC_state->code_blocks->count 837 && s == pRExC_state->code_blocks->cb[n].start) 838 { 839 s = pRExC_state->code_blocks->cb[n].end; 840 n++; 841 continue; 842 } 843 /* TODO ideally should handle [..], (#..), /#.../x to reduce false 844 * positives here */ 845 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' && 846 (pat[s+2] == '{' 847 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{')) 848 ) 849 return 1; 850 } 851 return 0; 852 } 853 854 /* Handle run-time code blocks. We will already have compiled any direct 855 * or indirect literal code blocks. Now, take the pattern 'pat' and make a 856 * copy of it, but with any literal code blocks blanked out and 857 * appropriate chars escaped; then feed it into 858 * 859 * eval "qr'modified_pattern'" 860 * 861 * For example, 862 * 863 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno 864 * 865 * becomes 866 * 867 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno' 868 * 869 * After eval_sv()-ing that, grab any new code blocks from the returned qr 870 * and merge them with any code blocks of the original regexp. 871 * 872 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge; 873 * instead, just save the qr and return FALSE; this tells our caller that 874 * the original pattern needs upgrading to utf8. 875 */ 876 877 static bool 878 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, 879 char *pat, STRLEN plen) 880 { 881 SV *qr; 882 883 DECLARE_AND_GET_RE_DEBUG_FLAGS; 884 885 if (pRExC_state->runtime_code_qr) { 886 /* this is the second time we've been called; this should 887 * only happen if the main pattern got upgraded to utf8 888 * during compilation; re-use the qr we compiled first time 889 * round (which should be utf8 too) 890 */ 891 qr = pRExC_state->runtime_code_qr; 892 pRExC_state->runtime_code_qr = NULL; 893 assert(RExC_utf8 && SvUTF8(qr)); 894 } 895 else { 896 int n = 0; 897 STRLEN s; 898 char *p, *newpat; 899 int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */ 900 SV *sv, *qr_ref; 901 dSP; 902 903 /* determine how many extra chars we need for ' and \ escaping */ 904 for (s = 0; s < plen; s++) { 905 if (pat[s] == '\'' || pat[s] == '\\') 906 newlen++; 907 } 908 909 Newx(newpat, newlen, char); 910 p = newpat; 911 *p++ = 'q'; *p++ = 'r'; *p++ = '\''; 912 913 for (s = 0; s < plen; s++) { 914 if ( pRExC_state->code_blocks 915 && n < pRExC_state->code_blocks->count 916 && s == pRExC_state->code_blocks->cb[n].start) 917 { 918 /* blank out literal code block so that they aren't 919 * recompiled: eg change from/to: 920 * /(?{xyz})/ 921 * /(?=====)/ 922 * and 923 * /(??{xyz})/ 924 * /(?======)/ 925 * and 926 * /(?(?{xyz}))/ 927 * /(?(?=====))/ 928 */ 929 assert(pat[s] == '('); 930 assert(pat[s+1] == '?'); 931 *p++ = '('; 932 *p++ = '?'; 933 s += 2; 934 while (s < pRExC_state->code_blocks->cb[n].end) { 935 *p++ = '='; 936 s++; 937 } 938 *p++ = ')'; 939 n++; 940 continue; 941 } 942 if (pat[s] == '\'' || pat[s] == '\\') 943 *p++ = '\\'; 944 *p++ = pat[s]; 945 } 946 *p++ = '\''; 947 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) { 948 *p++ = 'x'; 949 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) { 950 *p++ = 'x'; 951 } 952 } 953 *p++ = '\0'; 954 DEBUG_COMPILE_r({ 955 Perl_re_printf( aTHX_ 956 "%sre-parsing pattern for runtime code:%s %s\n", 957 PL_colors[4], PL_colors[5], newpat); 958 }); 959 960 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0); 961 Safefree(newpat); 962 963 ENTER; 964 SAVETMPS; 965 save_re_context(); 966 PUSHSTACKi(PERLSI_REQUIRE); 967 /* G_RE_REPARSING causes the toker to collapse \\ into \ when 968 * parsing qr''; normally only q'' does this. It also alters 969 * hints handling */ 970 eval_sv(sv, G_SCALAR|G_RE_REPARSING); 971 SvREFCNT_dec_NN(sv); 972 SPAGAIN; 973 qr_ref = POPs; 974 PUTBACK; 975 { 976 SV * const errsv = ERRSV; 977 if (SvTRUE_NN(errsv)) 978 /* use croak_sv ? */ 979 Perl_croak_nocontext("%" SVf, SVfARG(errsv)); 980 } 981 assert(SvROK(qr_ref)); 982 qr = SvRV(qr_ref); 983 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp); 984 /* the leaving below frees the tmp qr_ref. 985 * Give qr a life of its own */ 986 SvREFCNT_inc(qr); 987 POPSTACK; 988 FREETMPS; 989 LEAVE; 990 991 } 992 993 if (!RExC_utf8 && SvUTF8(qr)) { 994 /* first time through; the pattern got upgraded; save the 995 * qr for the next time through */ 996 assert(!pRExC_state->runtime_code_qr); 997 pRExC_state->runtime_code_qr = qr; 998 return 0; 999 } 1000 1001 1002 /* extract any code blocks within the returned qr// */ 1003 1004 1005 /* merge the main (r1) and run-time (r2) code blocks into one */ 1006 { 1007 RXi_GET_DECL(ReANY((REGEXP *)qr), r2); 1008 struct reg_code_block *new_block, *dst; 1009 RExC_state_t * const r1 = pRExC_state; /* convenient alias */ 1010 int i1 = 0, i2 = 0; 1011 int r1c, r2c; 1012 1013 if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */ 1014 { 1015 SvREFCNT_dec_NN(qr); 1016 return 1; 1017 } 1018 1019 if (!r1->code_blocks) 1020 r1->code_blocks = S_alloc_code_blocks(aTHX_ 0); 1021 1022 r1c = r1->code_blocks->count; 1023 r2c = r2->code_blocks->count; 1024 1025 Newx(new_block, r1c + r2c, struct reg_code_block); 1026 1027 dst = new_block; 1028 1029 while (i1 < r1c || i2 < r2c) { 1030 struct reg_code_block *src; 1031 bool is_qr = 0; 1032 1033 if (i1 == r1c) { 1034 src = &r2->code_blocks->cb[i2++]; 1035 is_qr = 1; 1036 } 1037 else if (i2 == r2c) 1038 src = &r1->code_blocks->cb[i1++]; 1039 else if ( r1->code_blocks->cb[i1].start 1040 < r2->code_blocks->cb[i2].start) 1041 { 1042 src = &r1->code_blocks->cb[i1++]; 1043 assert(src->end < r2->code_blocks->cb[i2].start); 1044 } 1045 else { 1046 assert( r1->code_blocks->cb[i1].start 1047 > r2->code_blocks->cb[i2].start); 1048 src = &r2->code_blocks->cb[i2++]; 1049 is_qr = 1; 1050 assert(src->end < r1->code_blocks->cb[i1].start); 1051 } 1052 1053 assert(pat[src->start] == '('); 1054 assert(pat[src->end] == ')'); 1055 dst->start = src->start; 1056 dst->end = src->end; 1057 dst->block = src->block; 1058 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr) 1059 : src->src_regex; 1060 dst++; 1061 } 1062 r1->code_blocks->count += r2c; 1063 Safefree(r1->code_blocks->cb); 1064 r1->code_blocks->cb = new_block; 1065 } 1066 1067 SvREFCNT_dec_NN(qr); 1068 return 1; 1069 } 1070 1071 1072 STATIC bool 1073 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, 1074 struct reg_substr_datum *rsd, 1075 struct scan_data_substrs *sub, 1076 STRLEN longest_length) 1077 { 1078 /* This is the common code for setting up the floating and fixed length 1079 * string data extracted from Perl_re_op_compile() below. Returns a boolean 1080 * as to whether succeeded or not */ 1081 1082 I32 t; 1083 SSize_t ml; 1084 bool eol = cBOOL(sub->flags & SF_BEFORE_EOL); 1085 bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL); 1086 1087 if (! (longest_length 1088 || (eol /* Can't have SEOL and MULTI */ 1089 && (! meol || (RExC_flags & RXf_PMf_MULTILINE))) 1090 ) 1091 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */ 1092 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN)) 1093 { 1094 return FALSE; 1095 } 1096 1097 /* copy the information about the longest from the reg_scan_data 1098 over to the program. */ 1099 if (SvUTF8(sub->str)) { 1100 rsd->substr = NULL; 1101 rsd->utf8_substr = sub->str; 1102 } else { 1103 rsd->substr = sub->str; 1104 rsd->utf8_substr = NULL; 1105 } 1106 /* end_shift is how many chars that must be matched that 1107 follow this item. We calculate it ahead of time as once the 1108 lookbehind offset is added in we lose the ability to correctly 1109 calculate it.*/ 1110 ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length; 1111 rsd->end_shift = ml - sub->min_offset 1112 - longest_length 1113 /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL 1114 * intead? - DAPM 1115 + (SvTAIL(sub->str) != 0) 1116 */ 1117 + sub->lookbehind; 1118 1119 t = (eol/* Can't have SEOL and MULTI */ 1120 && (! meol || (RExC_flags & RXf_PMf_MULTILINE))); 1121 fbm_compile(sub->str, t ? FBMcf_TAIL : 0); 1122 1123 return TRUE; 1124 } 1125 1126 STATIC void 1127 S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx) 1128 { 1129 /* Calculates and sets in the compiled pattern 'Rx' the string to compile, 1130 * properly wrapped with the right modifiers */ 1131 1132 bool has_p = ((RExC_rx->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); 1133 bool has_charset = RExC_utf8 || (get_regex_charset(RExC_rx->extflags) 1134 != REGEX_DEPENDS_CHARSET); 1135 1136 /* The caret is output if there are any defaults: if not all the STD 1137 * flags are set, or if no character set specifier is needed */ 1138 bool has_default = 1139 (((RExC_rx->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD) 1140 || ! has_charset); 1141 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN) 1142 == REG_RUN_ON_COMMENT_SEEN); 1143 U8 reganch = (U8)((RExC_rx->extflags & RXf_PMf_STD_PMMOD) 1144 >> RXf_PMf_STD_PMMOD_SHIFT); 1145 const char *fptr = STD_PAT_MODS; /*"msixxn"*/ 1146 char *p; 1147 STRLEN pat_len = RExC_precomp_end - RExC_precomp; 1148 1149 /* We output all the necessary flags; we never output a minus, as all 1150 * those are defaults, so are 1151 * covered by the caret */ 1152 const STRLEN wraplen = pat_len + has_p + has_runon 1153 + has_default /* If needs a caret */ 1154 + PL_bitcount[reganch] /* 1 char for each set standard flag */ 1155 1156 /* If needs a character set specifier */ 1157 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0) 1158 + (sizeof("(?:)") - 1); 1159 1160 PERL_ARGS_ASSERT_SET_REGEX_PV; 1161 1162 /* make sure PL_bitcount bounds not exceeded */ 1163 STATIC_ASSERT_STMT(sizeof(STD_PAT_MODS) <= 8); 1164 1165 p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */ 1166 SvPOK_on(Rx); 1167 if (RExC_utf8) 1168 SvFLAGS(Rx) |= SVf_UTF8; 1169 *p++='('; *p++='?'; 1170 1171 /* If a default, cover it using the caret */ 1172 if (has_default) { 1173 *p++= DEFAULT_PAT_MOD; 1174 } 1175 if (has_charset) { 1176 STRLEN len; 1177 const char* name; 1178 1179 name = get_regex_charset_name(RExC_rx->extflags, &len); 1180 if (strEQ(name, DEPENDS_PAT_MODS)) { /* /d under UTF-8 => /u */ 1181 assert(RExC_utf8); 1182 name = UNICODE_PAT_MODS; 1183 len = sizeof(UNICODE_PAT_MODS) - 1; 1184 } 1185 Copy(name, p, len, char); 1186 p += len; 1187 } 1188 if (has_p) 1189 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/ 1190 { 1191 char ch; 1192 while((ch = *fptr++)) { 1193 if(reganch & 1) 1194 *p++ = ch; 1195 reganch >>= 1; 1196 } 1197 } 1198 1199 *p++ = ':'; 1200 Copy(RExC_precomp, p, pat_len, char); 1201 assert ((RX_WRAPPED(Rx) - p) < 16); 1202 RExC_rx->pre_prefix = p - RX_WRAPPED(Rx); 1203 p += pat_len; 1204 1205 /* Adding a trailing \n causes this to compile properly: 1206 my $R = qr / A B C # D E/x; /($R)/ 1207 Otherwise the parens are considered part of the comment */ 1208 if (has_runon) 1209 *p++ = '\n'; 1210 *p++ = ')'; 1211 *p = 0; 1212 SvCUR_set(Rx, p - RX_WRAPPED(Rx)); 1213 } 1214 1215 STATIC void 1216 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) 1217 { 1218 /* The inversion list in the SSC is marked mortal; now we need a more 1219 * permanent copy, which is stored the same way that is done in a regular 1220 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit 1221 * map */ 1222 1223 SV* invlist = invlist_clone(ssc->invlist, NULL); 1224 1225 PERL_ARGS_ASSERT_SSC_FINALIZE; 1226 1227 assert(is_ANYOF_SYNTHETIC(ssc)); 1228 1229 /* The code in this file assumes that all but these flags aren't relevant 1230 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared 1231 * by the time we reach here */ 1232 assert(! (ANYOF_FLAGS(ssc) 1233 & ~( ANYOF_COMMON_FLAGS 1234 |ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared 1235 |ANYOF_HAS_EXTRA_RUNTIME_MATCHES))); 1236 1237 populate_anyof_bitmap_from_invlist( (regnode *) ssc, &invlist); 1238 1239 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL); 1240 SvREFCNT_dec(invlist); 1241 1242 /* Make sure is clone-safe */ 1243 ssc->invlist = NULL; 1244 1245 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { 1246 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL; 1247 OP(ssc) = ANYOFPOSIXL; 1248 } 1249 else if (RExC_contains_locale) { 1250 OP(ssc) = ANYOFL; 1251 } 1252 1253 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale); 1254 } 1255 1256 STATIC bool 1257 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc) 1258 { 1259 /* The synthetic start class is used to hopefully quickly winnow down 1260 * places where a pattern could start a match in the target string. If it 1261 * doesn't really narrow things down that much, there isn't much point to 1262 * having the overhead of using it. This function uses some very crude 1263 * heuristics to decide if to use the ssc or not. 1264 * 1265 * It returns TRUE if 'ssc' rules out more than half what it considers to 1266 * be the "likely" possible matches, but of course it doesn't know what the 1267 * actual things being matched are going to be; these are only guesses 1268 * 1269 * For /l matches, it assumes that the only likely matches are going to be 1270 * in the 0-255 range, uniformly distributed, so half of that is 127 1271 * For /a and /d matches, it assumes that the likely matches will be just 1272 * the ASCII range, so half of that is 63 1273 * For /u and there isn't anything matching above the Latin1 range, it 1274 * assumes that that is the only range likely to be matched, and uses 1275 * half that as the cut-off: 127. If anything matches above Latin1, 1276 * it assumes that all of Unicode could match (uniformly), except for 1277 * non-Unicode code points and things in the General Category "Other" 1278 * (unassigned, private use, surrogates, controls and formats). This 1279 * is a much large number. */ 1280 1281 U32 count = 0; /* Running total of number of code points matched by 1282 'ssc' */ 1283 UV start, end; /* Start and end points of current range in inversion 1284 XXX outdated. UTF-8 locales are common, what about invert? list */ 1285 const U32 max_code_points = (LOC) 1286 ? 256 1287 : (( ! UNI_SEMANTICS 1288 || invlist_highest(ssc->invlist) < 256) 1289 ? 128 1290 : NON_OTHER_COUNT); 1291 const U32 max_match = max_code_points / 2; 1292 1293 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT; 1294 1295 invlist_iterinit(ssc->invlist); 1296 while (invlist_iternext(ssc->invlist, &start, &end)) { 1297 if (start >= max_code_points) { 1298 break; 1299 } 1300 end = MIN(end, max_code_points - 1); 1301 count += end - start + 1; 1302 if (count >= max_match) { 1303 invlist_iterfinish(ssc->invlist); 1304 return FALSE; 1305 } 1306 } 1307 1308 return TRUE; 1309 } 1310 1311 static void 1312 release_RExC_state(pTHX_ void *vstate) { 1313 RExC_state_t *pRExC_state = (RExC_state_t *)vstate; 1314 1315 /* Any or all of these might be NULL. 1316 1317 There's no point in setting them to NULL after the free, since 1318 pRExC_state is about to be released. 1319 */ 1320 SvREFCNT_dec(RExC_rx_sv); 1321 Safefree(RExC_open_parens); 1322 Safefree(RExC_close_parens); 1323 Safefree(RExC_logical_to_parno); 1324 Safefree(RExC_parno_to_logical); 1325 1326 Safefree(pRExC_state); 1327 } 1328 1329 /* 1330 * Perl_re_op_compile - the perl internal RE engine's function to compile a 1331 * regular expression into internal code. 1332 * The pattern may be passed either as: 1333 * a list of SVs (patternp plus pat_count) 1334 * a list of OPs (expr) 1335 * If both are passed, the SV list is used, but the OP list indicates 1336 * which SVs are actually pre-compiled code blocks 1337 * 1338 * The SVs in the list have magic and qr overloading applied to them (and 1339 * the list may be modified in-place with replacement SVs in the latter 1340 * case). 1341 * 1342 * If the pattern hasn't changed from old_re, then old_re will be 1343 * returned. 1344 * 1345 * eng is the current engine. If that engine has an op_comp method, then 1346 * handle directly (i.e. we assume that op_comp was us); otherwise, just 1347 * do the initial concatenation of arguments and pass on to the external 1348 * engine. 1349 * 1350 * If is_bare_re is not null, set it to a boolean indicating whether the 1351 * arg list reduced (after overloading) to a single bare regex which has 1352 * been returned (i.e. /$qr/). 1353 * 1354 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details. 1355 * 1356 * pm_flags contains the PMf_* flags, typically based on those from the 1357 * pm_flags field of the related PMOP. Currently we're only interested in 1358 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL, PMf_WILDCARD. 1359 * 1360 * For many years this code had an initial sizing pass that calculated 1361 * (sometimes incorrectly, leading to security holes) the size needed for the 1362 * compiled pattern. That was changed by commit 1363 * 7c932d07cab18751bfc7515b4320436273a459e2 in 5.29, which reallocs the size, a 1364 * node at a time, as parsing goes along. Patches welcome to fix any obsolete 1365 * references to this sizing pass. 1366 * 1367 * Now, an initial crude guess as to the size needed is made, based on the 1368 * length of the pattern. Patches welcome to improve that guess. That amount 1369 * of space is malloc'd and then immediately freed, and then clawed back node 1370 * by node. This design is to minimize, to the extent possible, memory churn 1371 * when doing the reallocs. 1372 * 1373 * A separate parentheses counting pass may be needed in some cases. 1374 * (Previously the sizing pass did this.) Patches welcome to reduce the number 1375 * of these cases. 1376 * 1377 * The existence of a sizing pass necessitated design decisions that are no 1378 * longer needed. There are potential areas of simplification. 1379 * 1380 * Beware that the optimization-preparation code in here knows about some 1381 * of the structure of the compiled regexp. [I'll say.] 1382 */ 1383 1384 REGEXP * 1385 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, 1386 OP *expr, const regexp_engine* eng, REGEXP *old_re, 1387 bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags) 1388 { 1389 REGEXP *Rx; /* Capital 'R' means points to a REGEXP */ 1390 STRLEN plen; 1391 char *exp; 1392 regnode *scan; 1393 I32 flags; 1394 SSize_t minlen = 0; 1395 U32 rx_flags; 1396 SV *pat; 1397 SV** new_patternp = patternp; 1398 1399 /* these are all flags - maybe they should be turned 1400 * into a single int with different bit masks */ 1401 I32 sawlookahead = 0; 1402 I32 sawplus = 0; 1403 I32 sawopen = 0; 1404 I32 sawminmod = 0; 1405 1406 regex_charset initial_charset = get_regex_charset(orig_rx_flags); 1407 bool recompile = 0; 1408 bool runtime_code = 0; 1409 scan_data_t data; 1410 1411 #ifdef TRIE_STUDY_OPT 1412 /* search for "restudy" in this file for a detailed explanation */ 1413 int restudied = 0; 1414 RExC_state_t copyRExC_state; 1415 #endif 1416 DECLARE_AND_GET_RE_DEBUG_FLAGS; 1417 1418 PERL_ARGS_ASSERT_RE_OP_COMPILE; 1419 1420 DEBUG_r(if (!PL_colorset) reginitcolors()); 1421 1422 RExC_state_t *pRExC_state = NULL; 1423 /* Ensure that all members of the pRExC_state is initialized to 0 1424 * at the start of regex compilation. Historically we have had issues 1425 * with people remembering to zero specific members or zeroing them 1426 * too late, etc. Doing it in one place is saner and avoid oversight 1427 * or error. */ 1428 Newxz(pRExC_state, 1, RExC_state_t); 1429 1430 SAVEDESTRUCTOR_X(release_RExC_state, pRExC_state); 1431 1432 DEBUG_r({ 1433 /* and then initialize RExC_mysv1 and RExC_mysv2 early so if 1434 * something calls regprop we don't have issues. These variables 1435 * not being set up properly motivated the use of Newxz() to initalize 1436 * the pRExC_state structure, as there were codepaths under -Uusedl 1437 * that left these unitialized, and non-null as well. */ 1438 RExC_mysv1 = sv_newmortal(); 1439 RExC_mysv2 = sv_newmortal(); 1440 }); 1441 1442 if (is_bare_re) 1443 *is_bare_re = FALSE; 1444 1445 if (expr && (expr->op_type == OP_LIST || 1446 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) { 1447 /* allocate code_blocks if needed */ 1448 OP *o; 1449 int ncode = 0; 1450 1451 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) 1452 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) 1453 ncode++; /* count of DO blocks */ 1454 1455 if (ncode) 1456 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode); 1457 } 1458 1459 if (!pat_count) { 1460 /* compile-time pattern with just OP_CONSTs and DO blocks */ 1461 1462 int n; 1463 OP *o; 1464 1465 /* find how many CONSTs there are */ 1466 assert(expr); 1467 n = 0; 1468 if (expr->op_type == OP_CONST) 1469 n = 1; 1470 else 1471 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) { 1472 if (o->op_type == OP_CONST) 1473 n++; 1474 } 1475 1476 /* fake up an SV array */ 1477 1478 assert(!new_patternp); 1479 Newx(new_patternp, n, SV*); 1480 SAVEFREEPV(new_patternp); 1481 pat_count = n; 1482 1483 n = 0; 1484 if (expr->op_type == OP_CONST) 1485 new_patternp[n] = cSVOPx_sv(expr); 1486 else 1487 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) { 1488 if (o->op_type == OP_CONST) 1489 new_patternp[n++] = cSVOPo_sv; 1490 } 1491 1492 } 1493 1494 DEBUG_PARSE_r(Perl_re_printf( aTHX_ 1495 "Assembling pattern from %d elements%s\n", pat_count, 1496 orig_rx_flags & RXf_SPLIT ? " for split" : "")); 1497 1498 /* set expr to the first arg op */ 1499 1500 if (pRExC_state->code_blocks && pRExC_state->code_blocks->count 1501 && expr->op_type != OP_CONST) 1502 { 1503 expr = cLISTOPx(expr)->op_first; 1504 assert( expr->op_type == OP_PUSHMARK 1505 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK) 1506 || expr->op_type == OP_PADRANGE); 1507 expr = OpSIBLING(expr); 1508 } 1509 1510 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count, 1511 expr, &recompile, NULL); 1512 1513 /* handle bare (possibly after overloading) regex: foo =~ $re */ 1514 { 1515 SV *re = pat; 1516 if (SvROK(re)) 1517 re = SvRV(re); 1518 if (SvTYPE(re) == SVt_REGEXP) { 1519 if (is_bare_re) 1520 *is_bare_re = TRUE; 1521 SvREFCNT_inc(re); 1522 DEBUG_PARSE_r(Perl_re_printf( aTHX_ 1523 "Precompiled pattern%s\n", 1524 orig_rx_flags & RXf_SPLIT ? " for split" : "")); 1525 1526 return (REGEXP*)re; 1527 } 1528 } 1529 1530 exp = SvPV_nomg(pat, plen); 1531 1532 if (!eng->op_comp) { 1533 if ((SvUTF8(pat) && IN_BYTES) 1534 || SvGMAGICAL(pat) || SvAMAGIC(pat)) 1535 { 1536 /* make a temporary copy; either to convert to bytes, 1537 * or to avoid repeating get-magic / overloaded stringify */ 1538 pat = newSVpvn_flags(exp, plen, SVs_TEMP | 1539 (IN_BYTES ? 0 : SvUTF8(pat))); 1540 } 1541 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags); 1542 } 1543 1544 /* ignore the utf8ness if the pattern is 0 length */ 1545 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat); 1546 RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT); 1547 1548 1549 DEBUG_COMPILE_r({ 1550 RE_PV_QUOTED_DECL(s, RExC_utf8, RExC_mysv, exp, plen, PL_dump_re_max_len); 1551 Perl_re_printf( aTHX_ "%sCompiling REx%s %s\n", 1552 PL_colors[4], PL_colors[5], s); 1553 }); 1554 1555 /* we jump here if we have to recompile, e.g., from upgrading the pattern 1556 * to utf8 */ 1557 1558 if ((pm_flags & PMf_USE_RE_EVAL) 1559 /* this second condition covers the non-regex literal case, 1560 * i.e. $foo =~ '(?{})'. */ 1561 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL)) 1562 ) 1563 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen); 1564 1565 redo_parse: 1566 /* return old regex if pattern hasn't changed */ 1567 /* XXX: note in the below we have to check the flags as well as the 1568 * pattern. 1569 * 1570 * Things get a touch tricky as we have to compare the utf8 flag 1571 * independently from the compile flags. 1572 * 1573 * ALSO NOTE: After this point we may need to zero members of pRExC_state 1574 * explicitly. Prior to this point they should all be zeroed as part of 1575 * a struct wide Zero instruction. 1576 */ 1577 1578 if ( old_re 1579 && !recompile 1580 && cBOOL(RX_UTF8(old_re)) == cBOOL(RExC_utf8) 1581 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) ) 1582 && RX_PRELEN(old_re) == plen 1583 && memEQ(RX_PRECOMP(old_re), exp, plen) 1584 && !runtime_code /* with runtime code, always recompile */ ) 1585 { 1586 DEBUG_COMPILE_r({ 1587 RE_PV_QUOTED_DECL(s, RExC_utf8, RExC_mysv, exp, plen, PL_dump_re_max_len); 1588 Perl_re_printf( aTHX_ "%sSkipping recompilation of unchanged REx%s %s\n", 1589 PL_colors[4], PL_colors[5], s); 1590 }); 1591 return old_re; 1592 } 1593 1594 /* Allocate the pattern's SV */ 1595 RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP); 1596 RExC_rx = ReANY(Rx); 1597 if ( RExC_rx == NULL ) 1598 FAIL("Regexp out of space"); 1599 1600 rx_flags = orig_rx_flags; 1601 if (rx_flags & RXf_SPLIT) 1602 rx_flags &= ~(RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE); 1603 1604 if ( toUSE_UNI_CHARSET_NOT_DEPENDS 1605 && initial_charset == REGEX_DEPENDS_CHARSET) 1606 { 1607 1608 /* Set to use unicode semantics if the pattern is in utf8 and has the 1609 * 'depends' charset specified, as it means unicode when utf8 */ 1610 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET); 1611 RExC_uni_semantics = 1; 1612 } 1613 1614 RExC_pm_flags = pm_flags; 1615 1616 if (runtime_code) { 1617 assert(TAINTING_get || !TAINT_get); 1618 if (TAINT_get) 1619 Perl_croak(aTHX_ "Eval-group in insecure regular expression"); 1620 1621 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) { 1622 /* whoops, we have a non-utf8 pattern, whilst run-time code 1623 * got compiled as utf8. Try again with a utf8 pattern */ 1624 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen, 1625 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0); 1626 goto redo_parse; 1627 } 1628 } 1629 assert(!pRExC_state->runtime_code_qr); 1630 1631 RExC_sawback = 0; 1632 1633 RExC_seen = 0; 1634 RExC_maxlen = 0; 1635 RExC_in_lookaround = 0; 1636 RExC_seen_zerolen = *exp == '^' ? -1 : 0; 1637 RExC_recode_x_to_native = 0; 1638 RExC_in_multi_char_class = 0; 1639 1640 RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp; 1641 RExC_precomp_end = RExC_end = exp + plen; 1642 RExC_nestroot = 0; 1643 RExC_whilem_seen = 0; 1644 RExC_end_op = NULL; 1645 RExC_recurse = NULL; 1646 RExC_study_chunk_recursed = NULL; 1647 RExC_study_chunk_recursed_bytes= 0; 1648 RExC_recurse_count = 0; 1649 RExC_sets_depth = 0; 1650 pRExC_state->code_index = 0; 1651 1652 /* Initialize the string in the compiled pattern. This is so that there is 1653 * something to output if necessary */ 1654 set_regex_pv(pRExC_state, Rx); 1655 1656 DEBUG_PARSE_r({ 1657 Perl_re_printf( aTHX_ 1658 "Starting parse and generation\n"); 1659 RExC_lastnum=0; 1660 RExC_lastparse=NULL; 1661 }); 1662 1663 /* Allocate space and zero-initialize. Note, the two step process 1664 of zeroing when in debug mode, thus anything assigned has to 1665 happen after that */ 1666 if (! RExC_size) { 1667 1668 /* On the first pass of the parse, we guess how big this will be. Then 1669 * we grow in one operation to that amount and then give it back. As 1670 * we go along, we re-allocate what we need. 1671 * 1672 * XXX Currently the guess is essentially that the pattern will be an 1673 * EXACT node with one byte input, one byte output. This is crude, and 1674 * better heuristics are welcome. 1675 * 1676 * On any subsequent passes, we guess what we actually computed in the 1677 * latest earlier pass. Such a pass probably didn't complete so is 1678 * missing stuff. We could improve those guesses by knowing where the 1679 * parse stopped, and use the length so far plus apply the above 1680 * assumption to what's left. */ 1681 RExC_size = STR_SZ(RExC_end - RExC_start); 1682 } 1683 1684 Newxc(RExC_rxi, sizeof(regexp_internal) + RExC_size, char, regexp_internal); 1685 if ( RExC_rxi == NULL ) 1686 FAIL("Regexp out of space"); 1687 1688 Zero(RExC_rxi, sizeof(regexp_internal) + RExC_size, char); 1689 RXi_SET( RExC_rx, RExC_rxi ); 1690 1691 /* We start from 0 (over from 0 in the case this is a reparse. The first 1692 * node parsed will give back any excess memory we have allocated so far). 1693 * */ 1694 RExC_size = 0; 1695 1696 /* non-zero initialization begins here */ 1697 RExC_rx->engine= eng; 1698 RExC_rx->extflags = rx_flags; 1699 RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK; 1700 1701 if (pm_flags & PMf_IS_QR) { 1702 RExC_rxi->code_blocks = pRExC_state->code_blocks; 1703 if (RExC_rxi->code_blocks) { 1704 RExC_rxi->code_blocks->refcnt++; 1705 } 1706 } 1707 1708 RExC_rx->intflags = 0; 1709 1710 RExC_flags = rx_flags; /* don't let top level (?i) bleed */ 1711 RExC_parse_set(exp); 1712 1713 /* This NUL is guaranteed because the pattern comes from an SV*, and the sv 1714 * code makes sure the final byte is an uncounted NUL. But should this 1715 * ever not be the case, lots of things could read beyond the end of the 1716 * buffer: loops like 1717 * while(isFOO(*RExC_parse)) RExC_parse_inc_by(1); 1718 * strchr(RExC_parse, "foo"); 1719 * etc. So it is worth noting. */ 1720 assert(*RExC_end == '\0'); 1721 1722 RExC_naughty = 0; 1723 RExC_npar = 1; 1724 RExC_logical_npar = 1; 1725 RExC_parens_buf_size = 0; 1726 RExC_emit_start = RExC_rxi->program; 1727 pRExC_state->code_index = 0; 1728 1729 *((char*) RExC_emit_start) = (char) REG_MAGIC; 1730 RExC_emit = NODE_STEP_REGNODE; 1731 1732 /* Do the parse */ 1733 if (reg(pRExC_state, 0, &flags, 1)) { 1734 1735 /* Success!, But we may need to redo the parse knowing how many parens 1736 * there actually are */ 1737 if (IN_PARENS_PASS) { 1738 flags |= RESTART_PARSE; 1739 } 1740 1741 /* We have that number in RExC_npar */ 1742 RExC_total_parens = RExC_npar; 1743 RExC_logical_total_parens = RExC_logical_npar; 1744 } 1745 else if (! MUST_RESTART(flags)) { 1746 ReREFCNT_dec(Rx); 1747 Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile, flags=%#" UVxf, (UV) flags); 1748 } 1749 1750 /* Here, we either have success, or we have to redo the parse for some reason */ 1751 if (MUST_RESTART(flags)) { 1752 1753 /* It's possible to write a regexp in ascii that represents Unicode 1754 codepoints outside of the byte range, such as via \x{100}. If we 1755 detect such a sequence we have to convert the entire pattern to utf8 1756 and then recompile, as our sizing calculation will have been based 1757 on 1 byte == 1 character, but we will need to use utf8 to encode 1758 at least some part of the pattern, and therefore must convert the whole 1759 thing. 1760 -- dmq */ 1761 if (flags & NEED_UTF8) { 1762 1763 /* We have stored the offset of the final warning output so far. 1764 * That must be adjusted. Any variant characters between the start 1765 * of the pattern and this warning count for 2 bytes in the final, 1766 * so just add them again */ 1767 if (UNLIKELY(RExC_latest_warn_offset > 0)) { 1768 RExC_latest_warn_offset += 1769 variant_under_utf8_count((U8 *) exp, (U8 *) exp 1770 + RExC_latest_warn_offset); 1771 } 1772 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen, 1773 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0); 1774 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n")); 1775 } 1776 else { 1777 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n")); 1778 } 1779 1780 if (ALL_PARENS_COUNTED) { 1781 /* Make enough room for all the known parens, and zero it */ 1782 Renew(RExC_open_parens, RExC_total_parens, regnode_offset); 1783 Zero(RExC_open_parens, RExC_total_parens, regnode_offset); 1784 RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */ 1785 1786 Renew(RExC_close_parens, RExC_total_parens, regnode_offset); 1787 Zero(RExC_close_parens, RExC_total_parens, regnode_offset); 1788 /* we do NOT reinitialize RExC_logical_to_parno and 1789 * RExC_parno_to_logical here. We need their data on the second 1790 * pass */ 1791 } 1792 else { /* Parse did not complete. Reinitialize the parentheses 1793 structures */ 1794 RExC_total_parens = 0; 1795 if (RExC_open_parens) { 1796 Safefree(RExC_open_parens); 1797 RExC_open_parens = NULL; 1798 } 1799 if (RExC_close_parens) { 1800 Safefree(RExC_close_parens); 1801 RExC_close_parens = NULL; 1802 } 1803 if (RExC_logical_to_parno) { 1804 Safefree(RExC_logical_to_parno); 1805 RExC_logical_to_parno = NULL; 1806 } 1807 if (RExC_parno_to_logical) { 1808 Safefree(RExC_parno_to_logical); 1809 RExC_parno_to_logical = NULL; 1810 } 1811 } 1812 1813 /* Clean up what we did in this parse */ 1814 SvREFCNT_dec_NN(RExC_rx_sv); 1815 RExC_rx_sv = NULL; 1816 1817 goto redo_parse; 1818 } 1819 1820 /* Here, we have successfully parsed and generated the pattern's program 1821 * for the regex engine. We are ready to finish things up and look for 1822 * optimizations. */ 1823 1824 /* Update the string to compile, with correct modifiers, etc */ 1825 set_regex_pv(pRExC_state, Rx); 1826 1827 RExC_rx->nparens = RExC_total_parens - 1; 1828 RExC_rx->logical_nparens = RExC_logical_total_parens - 1; 1829 1830 /* Uses the upper 4 bits of the FLAGS field, so keep within that size */ 1831 if (RExC_whilem_seen > 15) 1832 RExC_whilem_seen = 15; 1833 1834 DEBUG_PARSE_r({ 1835 Perl_re_printf( aTHX_ 1836 "Required size %" IVdf " nodes\n", (IV)RExC_size); 1837 RExC_lastnum=0; 1838 RExC_lastparse=NULL; 1839 }); 1840 1841 SetProgLen(RExC_rxi,RExC_size); 1842 1843 DEBUG_DUMP_PRE_OPTIMIZE_r({ 1844 SV * const sv = sv_newmortal(); /* can this use RExC_mysv? */ 1845 RXi_GET_DECL(RExC_rx, ri); 1846 DEBUG_RExC_seen(); 1847 Perl_re_printf( aTHX_ "Program before optimization:\n"); 1848 1849 (void)dumpuntil(RExC_rx, ri->program, ri->program + 1, NULL, NULL, 1850 sv, 0, 0); 1851 }); 1852 1853 DEBUG_OPTIMISE_r( 1854 Perl_re_printf( aTHX_ "Starting post parse optimization\n"); 1855 ); 1856 1857 /* XXXX To minimize changes to RE engine we always allocate 1858 3-units-long substrs field. */ 1859 Newx(RExC_rx->substrs, 1, struct reg_substr_data); 1860 if (RExC_recurse_count) { 1861 Newx(RExC_recurse, RExC_recurse_count, regnode *); 1862 SAVEFREEPV(RExC_recurse); 1863 } 1864 1865 if (RExC_seen & REG_RECURSE_SEEN) { 1866 /* Note, RExC_total_parens is 1 + the number of parens in a pattern. 1867 * So its 1 if there are no parens. */ 1868 RExC_study_chunk_recursed_bytes= (RExC_total_parens >> 3) + 1869 ((RExC_total_parens & 0x07) != 0); 1870 Newx(RExC_study_chunk_recursed, 1871 RExC_study_chunk_recursed_bytes * RExC_total_parens, U8); 1872 SAVEFREEPV(RExC_study_chunk_recursed); 1873 } 1874 1875 reStudy: 1876 RExC_rx->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0; 1877 DEBUG_r( 1878 RExC_study_chunk_recursed_count= 0; 1879 ); 1880 Zero(RExC_rx->substrs, 1, struct reg_substr_data); 1881 if (RExC_study_chunk_recursed) { 1882 Zero(RExC_study_chunk_recursed, 1883 RExC_study_chunk_recursed_bytes * RExC_total_parens, U8); 1884 } 1885 1886 1887 #ifdef TRIE_STUDY_OPT 1888 /* search for "restudy" in this file for a detailed explanation */ 1889 if (!restudied) { 1890 StructCopy(&zero_scan_data, &data, scan_data_t); 1891 copyRExC_state = *pRExC_state; 1892 } else { 1893 U32 seen=RExC_seen; 1894 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n")); 1895 1896 *pRExC_state = copyRExC_state; 1897 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN) 1898 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN; 1899 else 1900 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN; 1901 StructCopy(&zero_scan_data, &data, scan_data_t); 1902 } 1903 #else 1904 StructCopy(&zero_scan_data, &data, scan_data_t); 1905 #endif 1906 1907 /* Dig out information for optimizations. */ 1908 RExC_rx->extflags = RExC_flags; /* was pm_op */ 1909 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */ 1910 1911 if (UTF) 1912 SvUTF8_on(Rx); /* Unicode in it? */ 1913 RExC_rxi->regstclass = NULL; 1914 if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */ 1915 RExC_rx->intflags |= PREGf_NAUGHTY; 1916 scan = RExC_rxi->program + 1; /* First BRANCH. */ 1917 1918 /* testing for BRANCH here tells us whether there is "must appear" 1919 data in the pattern. If there is then we can use it for optimisations */ 1920 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice. 1921 */ 1922 SSize_t fake_deltap; 1923 STRLEN longest_length[2]; 1924 regnode_ssc ch_class; /* pointed to by data */ 1925 int stclass_flag; 1926 SSize_t last_close = 0; /* pointed to by data */ 1927 regnode *first= scan; 1928 regnode *first_next= regnext(first); 1929 regnode *last_close_op= NULL; 1930 int i; 1931 1932 /* 1933 * Skip introductions and multiplicators >= 1 1934 * so that we can extract the 'meat' of the pattern that must 1935 * match in the large if() sequence following. 1936 * NOTE that EXACT is NOT covered here, as it is normally 1937 * picked up by the optimiser separately. 1938 * 1939 * This is unfortunate as the optimiser isnt handling lookahead 1940 * properly currently. 1941 * 1942 */ 1943 while (1) 1944 { 1945 if (OP(first) == OPEN) 1946 sawopen = 1; 1947 else 1948 if (OP(first) == IFMATCH && !FLAGS(first)) 1949 /* for now we can't handle lookbehind IFMATCH */ 1950 sawlookahead = 1; 1951 else 1952 if (OP(first) == PLUS) 1953 sawplus = 1; 1954 else 1955 if (OP(first) == MINMOD) 1956 sawminmod = 1; 1957 else 1958 if (!( 1959 /* An OR of *one* alternative - should not happen now. */ 1960 (OP(first) == BRANCH && OP(first_next) != BRANCH) || 1961 /* An {n,m} with n>0 */ 1962 (REGNODE_TYPE(OP(first)) == CURLY && ARG1i(first) > 0) || 1963 (OP(first) == NOTHING && REGNODE_TYPE(OP(first_next)) != END) 1964 )){ 1965 break; 1966 } 1967 1968 first = REGNODE_AFTER(first); 1969 first_next= regnext(first); 1970 } 1971 1972 /* Starting-point info. */ 1973 again: 1974 DEBUG_PEEP("first:", first, 0, 0); 1975 /* Ignore EXACT as we deal with it later. */ 1976 if (REGNODE_TYPE(OP(first)) == EXACT) { 1977 if (! isEXACTFish(OP(first))) { 1978 NOOP; /* Empty, get anchored substr later. */ 1979 } 1980 else 1981 RExC_rxi->regstclass = first; 1982 } 1983 #ifdef TRIE_STCLASS 1984 else if (REGNODE_TYPE(OP(first)) == TRIE && 1985 ((reg_trie_data *)RExC_rxi->data->data[ ARG1u(first) ])->minlen>0) 1986 { 1987 /* this can happen only on restudy 1988 * Search for "restudy" in this file to find 1989 * a comment with details. */ 1990 RExC_rxi->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0); 1991 } 1992 #endif 1993 else if (REGNODE_SIMPLE(OP(first))) 1994 RExC_rxi->regstclass = first; 1995 else if (REGNODE_TYPE(OP(first)) == BOUND || 1996 REGNODE_TYPE(OP(first)) == NBOUND) 1997 RExC_rxi->regstclass = first; 1998 else if (REGNODE_TYPE(OP(first)) == BOL) { 1999 RExC_rx->intflags |= (OP(first) == MBOL 2000 ? PREGf_ANCH_MBOL 2001 : PREGf_ANCH_SBOL); 2002 first = REGNODE_AFTER(first); 2003 goto again; 2004 } 2005 else if (OP(first) == GPOS) { 2006 RExC_rx->intflags |= PREGf_ANCH_GPOS; 2007 first = REGNODE_AFTER_type(first,tregnode_GPOS); 2008 goto again; 2009 } 2010 else if ((!sawopen || !RExC_sawback) && 2011 !sawlookahead && 2012 (OP(first) == STAR && 2013 REGNODE_TYPE(OP(REGNODE_AFTER(first))) == REG_ANY) && 2014 !(RExC_rx->intflags & PREGf_ANCH) && !(RExC_seen & REG_PESSIMIZE_SEEN)) 2015 { 2016 /* turn .* into ^.* with an implied $*=1 */ 2017 const int type = 2018 (OP(REGNODE_AFTER(first)) == REG_ANY) 2019 ? PREGf_ANCH_MBOL 2020 : PREGf_ANCH_SBOL; 2021 RExC_rx->intflags |= (type | PREGf_IMPLICIT); 2022 first = REGNODE_AFTER(first); 2023 goto again; 2024 } 2025 if (sawplus && !sawminmod && !sawlookahead 2026 && (!sawopen || !RExC_sawback) 2027 && !(RExC_seen & REG_PESSIMIZE_SEEN)) /* May examine pos and $& */ 2028 /* x+ must match at the 1st pos of run of x's */ 2029 RExC_rx->intflags |= PREGf_SKIP; 2030 2031 /* Scan is after the zeroth branch, first is atomic matcher. */ 2032 #ifdef TRIE_STUDY_OPT 2033 /* search for "restudy" in this file for a detailed explanation */ 2034 DEBUG_PARSE_r( 2035 if (!restudied) 2036 Perl_re_printf( aTHX_ "first at %" IVdf "\n", 2037 (IV)(first - scan + 1)) 2038 ); 2039 #else 2040 DEBUG_PARSE_r( 2041 Perl_re_printf( aTHX_ "first at %" IVdf "\n", 2042 (IV)(first - scan + 1)) 2043 ); 2044 #endif 2045 2046 2047 /* 2048 * If there's something expensive in the r.e., find the 2049 * longest literal string that must appear and make it the 2050 * regmust. Resolve ties in favor of later strings, since 2051 * the regstart check works with the beginning of the r.e. 2052 * and avoiding duplication strengthens checking. Not a 2053 * strong reason, but sufficient in the absence of others. 2054 * [Now we resolve ties in favor of the earlier string if 2055 * it happens that c_offset_min has been invalidated, since the 2056 * earlier string may buy us something the later one won't.] 2057 */ 2058 2059 data.substrs[0].str = newSVpvs(""); 2060 data.substrs[1].str = newSVpvs(""); 2061 data.last_found = newSVpvs(""); 2062 data.cur_is_floating = 0; /* initially any found substring is fixed */ 2063 ENTER_with_name("study_chunk"); 2064 SAVEFREESV(data.substrs[0].str); 2065 SAVEFREESV(data.substrs[1].str); 2066 SAVEFREESV(data.last_found); 2067 first = scan; 2068 if (!RExC_rxi->regstclass) { 2069 ssc_init(pRExC_state, &ch_class); 2070 data.start_class = &ch_class; 2071 stclass_flag = SCF_DO_STCLASS_AND; 2072 } else /* XXXX Check for BOUND? */ 2073 stclass_flag = 0; 2074 data.last_closep = &last_close; 2075 data.last_close_opp = &last_close_op; 2076 2077 DEBUG_RExC_seen(); 2078 /* 2079 * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/ 2080 * (NO top level branches) 2081 */ 2082 minlen = study_chunk(pRExC_state, &first, &minlen, &fake_deltap, 2083 scan + RExC_size, /* Up to end */ 2084 &data, -1, 0, NULL, 2085 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag 2086 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0), 2087 0, TRUE); 2088 /* search for "restudy" in this file for a detailed explanation 2089 * of 'restudied' and SCF_TRIE_DOING_RESTUDY */ 2090 2091 2092 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk")); 2093 2094 2095 if ( RExC_total_parens == 1 && !data.cur_is_floating 2096 && data.last_start_min == 0 && data.last_end > 0 2097 && !RExC_seen_zerolen 2098 && !(RExC_seen & REG_VERBARG_SEEN) 2099 && !(RExC_seen & REG_GPOS_SEEN) 2100 ){ 2101 RExC_rx->extflags |= RXf_CHECK_ALL; 2102 } 2103 scan_commit(pRExC_state, &data,&minlen, 0); 2104 2105 2106 /* XXX this is done in reverse order because that's the way the 2107 * code was before it was parameterised. Don't know whether it 2108 * actually needs doing in reverse order. DAPM */ 2109 for (i = 1; i >= 0; i--) { 2110 longest_length[i] = CHR_SVLEN(data.substrs[i].str); 2111 2112 if ( !( i 2113 && SvCUR(data.substrs[0].str) /* ok to leave SvCUR */ 2114 && data.substrs[0].min_offset 2115 == data.substrs[1].min_offset 2116 && SvCUR(data.substrs[0].str) 2117 == SvCUR(data.substrs[1].str) 2118 ) 2119 && S_setup_longest (aTHX_ pRExC_state, 2120 &(RExC_rx->substrs->data[i]), 2121 &(data.substrs[i]), 2122 longest_length[i])) 2123 { 2124 RExC_rx->substrs->data[i].min_offset = 2125 data.substrs[i].min_offset - data.substrs[i].lookbehind; 2126 2127 RExC_rx->substrs->data[i].max_offset = data.substrs[i].max_offset; 2128 /* Don't offset infinity */ 2129 if (data.substrs[i].max_offset < OPTIMIZE_INFTY) 2130 RExC_rx->substrs->data[i].max_offset -= data.substrs[i].lookbehind; 2131 SvREFCNT_inc_simple_void_NN(data.substrs[i].str); 2132 } 2133 else { 2134 RExC_rx->substrs->data[i].substr = NULL; 2135 RExC_rx->substrs->data[i].utf8_substr = NULL; 2136 longest_length[i] = 0; 2137 } 2138 } 2139 2140 LEAVE_with_name("study_chunk"); 2141 2142 if (RExC_rxi->regstclass 2143 && (OP(RExC_rxi->regstclass) == REG_ANY || OP(RExC_rxi->regstclass) == SANY)) 2144 RExC_rxi->regstclass = NULL; 2145 2146 if ((!(RExC_rx->substrs->data[0].substr || RExC_rx->substrs->data[0].utf8_substr) 2147 || RExC_rx->substrs->data[0].min_offset) 2148 && stclass_flag 2149 && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING) 2150 && is_ssc_worth_it(pRExC_state, data.start_class)) 2151 { 2152 const U32 n = reg_add_data(pRExC_state, STR_WITH_LEN("f")); 2153 2154 ssc_finalize(pRExC_state, data.start_class); 2155 2156 Newx(RExC_rxi->data->data[n], 1, regnode_ssc); 2157 StructCopy(data.start_class, 2158 (regnode_ssc*)RExC_rxi->data->data[n], 2159 regnode_ssc); 2160 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n]; 2161 RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ 2162 DEBUG_COMPILE_r({ SV *sv = sv_newmortal(); 2163 regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state); 2164 Perl_re_printf( aTHX_ 2165 "synthetic stclass \"%s\".\n", 2166 SvPVX_const(sv));}); 2167 data.start_class = NULL; 2168 } 2169 2170 /* A temporary algorithm prefers floated substr to fixed one of 2171 * same length to dig more info. */ 2172 i = (longest_length[0] <= longest_length[1]); 2173 RExC_rx->substrs->check_ix = i; 2174 RExC_rx->check_end_shift = RExC_rx->substrs->data[i].end_shift; 2175 RExC_rx->check_substr = RExC_rx->substrs->data[i].substr; 2176 RExC_rx->check_utf8 = RExC_rx->substrs->data[i].utf8_substr; 2177 RExC_rx->check_offset_min = RExC_rx->substrs->data[i].min_offset; 2178 RExC_rx->check_offset_max = RExC_rx->substrs->data[i].max_offset; 2179 if (!i && (RExC_rx->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))) 2180 RExC_rx->intflags |= PREGf_NOSCAN; 2181 2182 if ((RExC_rx->check_substr || RExC_rx->check_utf8) ) { 2183 RExC_rx->extflags |= RXf_USE_INTUIT; 2184 if (SvTAIL(RExC_rx->check_substr ? RExC_rx->check_substr : RExC_rx->check_utf8)) 2185 RExC_rx->extflags |= RXf_INTUIT_TAIL; 2186 } 2187 2188 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere) 2189 if ( (STRLEN)minlen < longest_length[1] ) 2190 minlen= longest_length[1]; 2191 if ( (STRLEN)minlen < longest_length[0] ) 2192 minlen= longest_length[0]; 2193 */ 2194 } 2195 else { 2196 /* Several toplevels. Best we can is to set minlen. */ 2197 SSize_t fake_deltap; 2198 regnode_ssc ch_class; 2199 SSize_t last_close = 0; 2200 regnode *last_close_op = NULL; 2201 2202 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "\nMulti Top Level\n")); 2203 2204 scan = RExC_rxi->program + 1; 2205 ssc_init(pRExC_state, &ch_class); 2206 data.start_class = &ch_class; 2207 data.last_closep = &last_close; 2208 data.last_close_opp = &last_close_op; 2209 2210 DEBUG_RExC_seen(); 2211 /* 2212 * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../ 2213 * (patterns WITH top level branches) 2214 */ 2215 minlen = study_chunk(pRExC_state, 2216 &scan, &minlen, &fake_deltap, scan + RExC_size, &data, -1, 0, NULL, 2217 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied 2218 ? SCF_TRIE_DOING_RESTUDY 2219 : 0), 2220 0, TRUE); 2221 /* search for "restudy" in this file for a detailed explanation 2222 * of 'restudied' and SCF_TRIE_DOING_RESTUDY */ 2223 2224 CHECK_RESTUDY_GOTO_butfirst(NOOP); 2225 2226 RExC_rx->check_substr = NULL; 2227 RExC_rx->check_utf8 = NULL; 2228 RExC_rx->substrs->data[0].substr = NULL; 2229 RExC_rx->substrs->data[0].utf8_substr = NULL; 2230 RExC_rx->substrs->data[1].substr = NULL; 2231 RExC_rx->substrs->data[1].utf8_substr = NULL; 2232 2233 if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING) 2234 && is_ssc_worth_it(pRExC_state, data.start_class)) 2235 { 2236 const U32 n = reg_add_data(pRExC_state, STR_WITH_LEN("f")); 2237 2238 ssc_finalize(pRExC_state, data.start_class); 2239 2240 Newx(RExC_rxi->data->data[n], 1, regnode_ssc); 2241 StructCopy(data.start_class, 2242 (regnode_ssc*)RExC_rxi->data->data[n], 2243 regnode_ssc); 2244 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n]; 2245 RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ 2246 DEBUG_COMPILE_r({ SV* sv = sv_newmortal(); 2247 regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state); 2248 Perl_re_printf( aTHX_ 2249 "synthetic stclass \"%s\".\n", 2250 SvPVX_const(sv));}); 2251 data.start_class = NULL; 2252 } 2253 } 2254 2255 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) { 2256 RExC_rx->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN; 2257 RExC_rx->maxlen = REG_INFTY; 2258 } 2259 else { 2260 RExC_rx->maxlen = RExC_maxlen; 2261 } 2262 2263 /* Guard against an embedded (?=) or (?<=) with a longer minlen than 2264 the "real" pattern. */ 2265 DEBUG_OPTIMISE_r({ 2266 Perl_re_printf( aTHX_ "minlen: %" IVdf " RExC_rx->minlen:%" IVdf " maxlen:%" IVdf "\n", 2267 (IV)minlen, (IV)RExC_rx->minlen, (IV)RExC_maxlen); 2268 }); 2269 RExC_rx->minlenret = minlen; 2270 if (RExC_rx->minlen < minlen) 2271 RExC_rx->minlen = minlen; 2272 2273 if (RExC_seen & REG_RECURSE_SEEN ) { 2274 RExC_rx->intflags |= PREGf_RECURSE_SEEN; 2275 Newx(RExC_rx->recurse_locinput, RExC_rx->nparens + 1, char *); 2276 } 2277 if (RExC_seen & REG_GPOS_SEEN) 2278 RExC_rx->intflags |= PREGf_GPOS_SEEN; 2279 2280 if (RExC_seen & REG_PESSIMIZE_SEEN) 2281 RExC_rx->intflags |= PREGf_PESSIMIZE_SEEN; 2282 2283 if (RExC_seen & REG_LOOKBEHIND_SEEN) 2284 RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the 2285 lookbehind */ 2286 if (pRExC_state->code_blocks) 2287 RExC_rx->extflags |= RXf_EVAL_SEEN; 2288 2289 if (RExC_seen & REG_VERBARG_SEEN) { 2290 RExC_rx->intflags |= PREGf_VERBARG_SEEN; 2291 RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */ 2292 } 2293 2294 if (RExC_seen & REG_CUTGROUP_SEEN) 2295 RExC_rx->intflags |= PREGf_CUTGROUP_SEEN; 2296 2297 if (pm_flags & PMf_USE_RE_EVAL) 2298 RExC_rx->intflags |= PREGf_USE_RE_EVAL; 2299 2300 if (RExC_paren_names) 2301 RXp_PAREN_NAMES(RExC_rx) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names)); 2302 else 2303 RXp_PAREN_NAMES(RExC_rx) = NULL; 2304 2305 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED 2306 * so it can be used in pp.c */ 2307 if (RExC_rx->intflags & PREGf_ANCH) 2308 RExC_rx->extflags |= RXf_IS_ANCHORED; 2309 2310 2311 { 2312 /* this is used to identify "special" patterns that might result 2313 * in Perl NOT calling the regex engine and instead doing the match "itself", 2314 * particularly special cases in split//. By having the regex compiler 2315 * do this pattern matching at a regop level (instead of by inspecting the pattern) 2316 * we avoid weird issues with equivalent patterns resulting in different behavior, 2317 * AND we allow non Perl engines to get the same optimizations by the setting the 2318 * flags appropriately - Yves */ 2319 regnode *first = RExC_rxi->program + 1; 2320 U8 fop = OP(first); 2321 regnode *next = NULL; 2322 U8 nop = 0; 2323 if (fop == NOTHING || fop == MBOL || fop == SBOL || fop == PLUS) { 2324 next = REGNODE_AFTER(first); 2325 nop = OP(next); 2326 } 2327 /* It's safe to read through *next only if OP(first) is a regop of 2328 * the right type (not EXACT, for example). 2329 */ 2330 if (REGNODE_TYPE(fop) == NOTHING && nop == END) 2331 RExC_rx->extflags |= RXf_NULL; 2332 else if ((fop == MBOL || (fop == SBOL && !FLAGS(first))) && nop == END) 2333 /* when fop is SBOL first->flags will be true only when it was 2334 * produced by parsing /\A/, and not when parsing /^/. This is 2335 * very important for the split code as there we want to 2336 * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m. 2337 * See rt #122761 for more details. -- Yves */ 2338 RExC_rx->extflags |= RXf_START_ONLY; 2339 else if (fop == PLUS 2340 && REGNODE_TYPE(nop) == POSIXD && FLAGS(next) == CC_SPACE_ 2341 && OP(regnext(first)) == END) 2342 RExC_rx->extflags |= RXf_WHITE; 2343 else if ( RExC_rx->extflags & RXf_SPLIT 2344 && (REGNODE_TYPE(fop) == EXACT && ! isEXACTFish(fop)) 2345 && STR_LEN(first) == 1 2346 && *(STRING(first)) == ' ' 2347 && OP(regnext(first)) == END ) 2348 RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE); 2349 2350 } 2351 2352 if (RExC_contains_locale) { 2353 RXp_EXTFLAGS(RExC_rx) |= RXf_TAINTED; 2354 } 2355 2356 #ifdef DEBUGGING 2357 if (RExC_paren_names) { 2358 RExC_rxi->name_list_idx = reg_add_data( pRExC_state, STR_WITH_LEN("a")); 2359 RExC_rxi->data->data[RExC_rxi->name_list_idx] 2360 = (void*)SvREFCNT_inc(RExC_paren_name_list); 2361 } else 2362 #endif 2363 RExC_rxi->name_list_idx = 0; 2364 2365 while ( RExC_recurse_count > 0 ) { 2366 const regnode *scan = RExC_recurse[ --RExC_recurse_count ]; 2367 /* 2368 * This data structure is set up in study_chunk() and is used 2369 * to calculate the distance between a GOSUB regopcode and 2370 * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's) 2371 * it refers to. 2372 * 2373 * If for some reason someone writes code that optimises 2374 * away a GOSUB opcode then the assert should be changed to 2375 * an if(scan) to guard the ARG2i_SET() - Yves 2376 * 2377 */ 2378 assert(scan && OP(scan) == GOSUB); 2379 ARG2i_SET( scan, RExC_open_parens[ARG1u(scan)] - REGNODE_OFFSET(scan)); 2380 } 2381 if (RExC_logical_total_parens != RExC_total_parens) { 2382 Newxz(RExC_parno_to_logical_next, RExC_total_parens, I32); 2383 /* we rebuild this below */ 2384 Zero(RExC_logical_to_parno, RExC_total_parens, I32); 2385 for( int parno = RExC_total_parens-1 ; parno > 0 ; parno-- ) { 2386 int logical_parno= RExC_parno_to_logical[parno]; 2387 assert(logical_parno); 2388 RExC_parno_to_logical_next[parno]= RExC_logical_to_parno[logical_parno]; 2389 RExC_logical_to_parno[logical_parno] = parno; 2390 } 2391 RExC_rx->logical_to_parno = RExC_logical_to_parno; 2392 RExC_rx->parno_to_logical = RExC_parno_to_logical; 2393 RExC_rx->parno_to_logical_next = RExC_parno_to_logical_next; 2394 RExC_logical_to_parno = NULL; 2395 RExC_parno_to_logical = NULL; 2396 RExC_parno_to_logical_next = NULL; 2397 } else { 2398 RExC_rx->logical_to_parno = NULL; 2399 RExC_rx->parno_to_logical = NULL; 2400 RExC_rx->parno_to_logical_next = NULL; 2401 } 2402 2403 Newxz(RXp_OFFSp(RExC_rx), RExC_total_parens, regexp_paren_pair); 2404 /* assume we don't need to swap parens around before we match */ 2405 DEBUG_TEST_r({ 2406 Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n", 2407 (unsigned long)RExC_study_chunk_recursed_count); 2408 }); 2409 DEBUG_DUMP_r({ 2410 DEBUG_RExC_seen(); 2411 Perl_re_printf( aTHX_ "Final program:\n"); 2412 regdump(RExC_rx); 2413 }); 2414 2415 /* we're returning ownership of the SV to the caller, ensure the cleanup 2416 * doesn't release it 2417 */ 2418 RExC_rx_sv = NULL; 2419 2420 #ifdef USE_ITHREADS 2421 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated 2422 * by setting the regexp SV to readonly-only instead. If the 2423 * pattern's been recompiled, the USEDness should remain. */ 2424 if (old_re && SvREADONLY(old_re)) 2425 SvREADONLY_on(Rx); 2426 #endif 2427 return Rx; 2428 } 2429 2430 2431 2432 SV* 2433 Perl_reg_qr_package(pTHX_ REGEXP * const rx) 2434 { 2435 PERL_ARGS_ASSERT_REG_QR_PACKAGE; 2436 PERL_UNUSED_ARG(rx); 2437 if (0) 2438 return NULL; 2439 else 2440 return newSVpvs("Regexp"); 2441 } 2442 2443 /* Scans the name of a named buffer from the pattern. 2444 * If flags is REG_RSN_RETURN_NULL returns null. 2445 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name 2446 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding 2447 * to the parsed name as looked up in the RExC_paren_names hash. 2448 * If there is an error throws a vFAIL().. type exception. 2449 */ 2450 2451 #define REG_RSN_RETURN_NULL 0 2452 #define REG_RSN_RETURN_NAME 1 2453 #define REG_RSN_RETURN_DATA 2 2454 2455 STATIC SV* 2456 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) 2457 { 2458 char *name_start = RExC_parse; 2459 SV* sv_name; 2460 2461 PERL_ARGS_ASSERT_REG_SCAN_NAME; 2462 2463 assert (RExC_parse <= RExC_end); 2464 if (RExC_parse == RExC_end) NOOP; 2465 else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) { 2466 /* Note that the code here assumes well-formed UTF-8. Skip IDFIRST by 2467 * using do...while */ 2468 if (UTF) 2469 do { 2470 RExC_parse_inc_utf8(); 2471 } while ( RExC_parse < RExC_end 2472 && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end)); 2473 else 2474 do { 2475 RExC_parse_inc_by(1); 2476 } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse)); 2477 } else { 2478 RExC_parse_inc_by(1); /* so the <- from the vFAIL is after the offending 2479 character */ 2480 vFAIL("Group name must start with a non-digit word character"); 2481 } 2482 sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start), 2483 SVs_TEMP | (UTF ? SVf_UTF8 : 0)); 2484 if ( flags == REG_RSN_RETURN_NAME) 2485 return sv_name; 2486 else if (flags==REG_RSN_RETURN_DATA) { 2487 HE *he_str = NULL; 2488 SV *sv_dat = NULL; 2489 if ( ! sv_name ) /* should not happen*/ 2490 Perl_croak(aTHX_ "panic: no svname in reg_scan_name"); 2491 if (RExC_paren_names) 2492 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 ); 2493 if ( he_str ) 2494 sv_dat = HeVAL(he_str); 2495 if ( ! sv_dat ) { /* Didn't find group */ 2496 2497 /* It might be a forward reference; we can't fail until we 2498 * know, by completing the parse to get all the groups, and 2499 * then reparsing */ 2500 if (ALL_PARENS_COUNTED) { 2501 vFAIL("Reference to nonexistent named group"); 2502 } 2503 else { 2504 REQUIRE_PARENS_PASS; 2505 } 2506 } 2507 return sv_dat; 2508 } 2509 2510 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name", 2511 (unsigned long) flags); 2512 } 2513 2514 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \ 2515 if (RExC_lastparse!=RExC_parse) { \ 2516 Perl_re_printf( aTHX_ "%s", \ 2517 Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \ 2518 RExC_end - RExC_parse, 16, \ 2519 "", "", \ 2520 PERL_PV_ESCAPE_UNI_DETECT | \ 2521 PERL_PV_PRETTY_ELLIPSES | \ 2522 PERL_PV_PRETTY_LTGT | \ 2523 PERL_PV_ESCAPE_RE | \ 2524 PERL_PV_PRETTY_EXACTSIZE \ 2525 ) \ 2526 ); \ 2527 } else \ 2528 Perl_re_printf( aTHX_ "%16s",""); \ 2529 \ 2530 if (RExC_lastnum!=RExC_emit) \ 2531 Perl_re_printf( aTHX_ "|%4zu", RExC_emit); \ 2532 else \ 2533 Perl_re_printf( aTHX_ "|%4s",""); \ 2534 Perl_re_printf( aTHX_ "|%*s%-4s", \ 2535 (int)((depth*2)), "", \ 2536 (funcname) \ 2537 ); \ 2538 RExC_lastnum=RExC_emit; \ 2539 RExC_lastparse=RExC_parse; \ 2540 }) 2541 2542 2543 2544 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \ 2545 DEBUG_PARSE_MSG((funcname)); \ 2546 Perl_re_printf( aTHX_ "%4s","\n"); \ 2547 }) 2548 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({\ 2549 DEBUG_PARSE_MSG((funcname)); \ 2550 Perl_re_printf( aTHX_ fmt "\n",args); \ 2551 }) 2552 2553 2554 STATIC void 2555 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) 2556 { 2557 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)' 2558 * constructs, and updates RExC_flags with them. On input, RExC_parse 2559 * should point to the first flag; it is updated on output to point to the 2560 * final ')' or ':'. There needs to be at least one flag, or this will 2561 * abort */ 2562 2563 /* for (?g), (?gc), and (?o) warnings; warning 2564 about (?c) will warn about (?g) -- japhy */ 2565 2566 #define WASTED_O 0x01 2567 #define WASTED_G 0x02 2568 #define WASTED_C 0x04 2569 #define WASTED_GC (WASTED_G|WASTED_C) 2570 I32 wastedflags = 0x00; 2571 U32 posflags = 0, negflags = 0; 2572 U32 *flagsp = &posflags; 2573 char has_charset_modifier = '\0'; 2574 regex_charset cs; 2575 bool has_use_defaults = FALSE; 2576 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */ 2577 int x_mod_count = 0; 2578 2579 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS; 2580 2581 /* '^' as an initial flag sets certain defaults */ 2582 if (UCHARAT(RExC_parse) == '^') { 2583 RExC_parse_inc_by(1); 2584 has_use_defaults = TRUE; 2585 STD_PMMOD_FLAGS_CLEAR(&RExC_flags); 2586 cs = (toUSE_UNI_CHARSET_NOT_DEPENDS) 2587 ? REGEX_UNICODE_CHARSET 2588 : REGEX_DEPENDS_CHARSET; 2589 set_regex_charset(&RExC_flags, cs); 2590 } 2591 else { 2592 cs = get_regex_charset(RExC_flags); 2593 if ( cs == REGEX_DEPENDS_CHARSET 2594 && (toUSE_UNI_CHARSET_NOT_DEPENDS)) 2595 { 2596 cs = REGEX_UNICODE_CHARSET; 2597 } 2598 } 2599 2600 while (RExC_parse < RExC_end) { 2601 /* && memCHRs("iogcmsx", *RExC_parse) */ 2602 /* (?g), (?gc) and (?o) are useless here 2603 and must be globally applied -- japhy */ 2604 if ((RExC_pm_flags & PMf_WILDCARD)) { 2605 if (flagsp == & negflags) { 2606 if (*RExC_parse == 'm') { 2607 RExC_parse_inc_by(1); 2608 /* diag_listed_as: Use of %s is not allowed in Unicode 2609 property wildcard subpatterns in regex; marked by <-- 2610 HERE in m/%s/ */ 2611 vFAIL("Use of modifier '-m' is not allowed in Unicode" 2612 " property wildcard subpatterns"); 2613 } 2614 } 2615 else { 2616 if (*RExC_parse == 's') { 2617 goto modifier_illegal_in_wildcard; 2618 } 2619 } 2620 } 2621 2622 switch (*RExC_parse) { 2623 2624 /* Code for the imsxn flags */ 2625 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count); 2626 2627 case LOCALE_PAT_MOD: 2628 if (has_charset_modifier) { 2629 goto excess_modifier; 2630 } 2631 else if (flagsp == &negflags) { 2632 goto neg_modifier; 2633 } 2634 cs = REGEX_LOCALE_CHARSET; 2635 has_charset_modifier = LOCALE_PAT_MOD; 2636 break; 2637 case UNICODE_PAT_MOD: 2638 if (has_charset_modifier) { 2639 goto excess_modifier; 2640 } 2641 else if (flagsp == &negflags) { 2642 goto neg_modifier; 2643 } 2644 cs = REGEX_UNICODE_CHARSET; 2645 has_charset_modifier = UNICODE_PAT_MOD; 2646 break; 2647 case ASCII_RESTRICT_PAT_MOD: 2648 if (flagsp == &negflags) { 2649 goto neg_modifier; 2650 } 2651 if (has_charset_modifier) { 2652 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) { 2653 goto excess_modifier; 2654 } 2655 /* Doubled modifier implies more restricted */ 2656 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET; 2657 } 2658 else { 2659 cs = REGEX_ASCII_RESTRICTED_CHARSET; 2660 } 2661 has_charset_modifier = ASCII_RESTRICT_PAT_MOD; 2662 break; 2663 case DEPENDS_PAT_MOD: 2664 if (has_use_defaults) { 2665 goto fail_modifiers; 2666 } 2667 else if (flagsp == &negflags) { 2668 goto neg_modifier; 2669 } 2670 else if (has_charset_modifier) { 2671 goto excess_modifier; 2672 } 2673 2674 /* The dual charset means unicode semantics if the 2675 * pattern (or target, not known until runtime) are 2676 * utf8, or something in the pattern indicates unicode 2677 * semantics */ 2678 cs = (toUSE_UNI_CHARSET_NOT_DEPENDS) 2679 ? REGEX_UNICODE_CHARSET 2680 : REGEX_DEPENDS_CHARSET; 2681 has_charset_modifier = DEPENDS_PAT_MOD; 2682 break; 2683 excess_modifier: 2684 RExC_parse_inc_by(1); 2685 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) { 2686 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD); 2687 } 2688 else if (has_charset_modifier == *(RExC_parse - 1)) { 2689 vFAIL2("Regexp modifier \"%c\" may not appear twice", 2690 *(RExC_parse - 1)); 2691 } 2692 else { 2693 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1)); 2694 } 2695 NOT_REACHED; /*NOTREACHED*/ 2696 neg_modifier: 2697 RExC_parse_inc_by(1); 2698 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", 2699 *(RExC_parse - 1)); 2700 NOT_REACHED; /*NOTREACHED*/ 2701 case GLOBAL_PAT_MOD: /* 'g' */ 2702 if (RExC_pm_flags & PMf_WILDCARD) { 2703 goto modifier_illegal_in_wildcard; 2704 } 2705 /*FALLTHROUGH*/ 2706 case ONCE_PAT_MOD: /* 'o' */ 2707 if (ckWARN(WARN_REGEXP)) { 2708 const I32 wflagbit = *RExC_parse == 'o' 2709 ? WASTED_O 2710 : WASTED_G; 2711 if (! (wastedflags & wflagbit) ) { 2712 wastedflags |= wflagbit; 2713 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */ 2714 vWARN5( 2715 RExC_parse + 1, 2716 "Useless (%s%c) - %suse /%c modifier", 2717 flagsp == &negflags ? "?-" : "?", 2718 *RExC_parse, 2719 flagsp == &negflags ? "don't " : "", 2720 *RExC_parse 2721 ); 2722 } 2723 } 2724 break; 2725 2726 case CONTINUE_PAT_MOD: /* 'c' */ 2727 if (RExC_pm_flags & PMf_WILDCARD) { 2728 goto modifier_illegal_in_wildcard; 2729 } 2730 if (ckWARN(WARN_REGEXP)) { 2731 if (! (wastedflags & WASTED_C) ) { 2732 wastedflags |= WASTED_GC; 2733 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */ 2734 vWARN3( 2735 RExC_parse + 1, 2736 "Useless (%sc) - %suse /gc modifier", 2737 flagsp == &negflags ? "?-" : "?", 2738 flagsp == &negflags ? "don't " : "" 2739 ); 2740 } 2741 } 2742 break; 2743 case KEEPCOPY_PAT_MOD: /* 'p' */ 2744 if (RExC_pm_flags & PMf_WILDCARD) { 2745 goto modifier_illegal_in_wildcard; 2746 } 2747 if (flagsp == &negflags) { 2748 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)"); 2749 } else { 2750 *flagsp |= RXf_PMf_KEEPCOPY; 2751 } 2752 break; 2753 case '-': 2754 /* A flag is a default iff it is following a minus, so 2755 * if there is a minus, it means will be trying to 2756 * re-specify a default which is an error */ 2757 if (has_use_defaults || flagsp == &negflags) { 2758 goto fail_modifiers; 2759 } 2760 flagsp = &negflags; 2761 wastedflags = 0; /* reset so (?g-c) warns twice */ 2762 x_mod_count = 0; 2763 break; 2764 case ':': 2765 case ')': 2766 2767 if ( (RExC_pm_flags & PMf_WILDCARD) 2768 && cs != REGEX_ASCII_MORE_RESTRICTED_CHARSET) 2769 { 2770 RExC_parse_inc_by(1); 2771 /* diag_listed_as: Use of %s is not allowed in Unicode 2772 property wildcard subpatterns in regex; marked by <-- 2773 HERE in m/%s/ */ 2774 vFAIL2("Use of modifier '%c' is not allowed in Unicode" 2775 " property wildcard subpatterns", 2776 has_charset_modifier); 2777 } 2778 2779 if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) { 2780 negflags |= RXf_PMf_EXTENDED_MORE; 2781 } 2782 RExC_flags |= posflags; 2783 2784 if (negflags & RXf_PMf_EXTENDED) { 2785 negflags |= RXf_PMf_EXTENDED_MORE; 2786 } 2787 RExC_flags &= ~negflags; 2788 set_regex_charset(&RExC_flags, cs); 2789 2790 return; 2791 default: 2792 fail_modifiers: 2793 RExC_parse_inc_if_char(); 2794 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ 2795 vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized", 2796 UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); 2797 NOT_REACHED; /*NOTREACHED*/ 2798 } 2799 2800 RExC_parse_inc(); 2801 } 2802 2803 vFAIL("Sequence (?... not terminated"); 2804 2805 modifier_illegal_in_wildcard: 2806 RExC_parse_inc_by(1); 2807 /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard 2808 subpatterns in regex; marked by <-- HERE in m/%s/ */ 2809 vFAIL2("Use of modifier '%c' is not allowed in Unicode property wildcard" 2810 " subpatterns", *(RExC_parse - 1)); 2811 } 2812 2813 /* 2814 - reg - regular expression, i.e. main body or parenthesized thing 2815 * 2816 * Caller must absorb opening parenthesis. 2817 * 2818 * Combining parenthesis handling with the base level of regular expression 2819 * is a trifle forced, but the need to tie the tails of the branches to what 2820 * follows makes it hard to avoid. 2821 */ 2822 2823 STATIC regnode_offset 2824 S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state, 2825 I32 *flagp, 2826 char * backref_parse_start, 2827 char ch 2828 ) 2829 { 2830 regnode_offset ret; 2831 char* name_start = RExC_parse; 2832 U32 num = 0; 2833 SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA); 2834 DECLARE_AND_GET_RE_DEBUG_FLAGS; 2835 2836 PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF; 2837 2838 if (RExC_parse != name_start && ch == '}') { 2839 while (isBLANK(*RExC_parse)) { 2840 RExC_parse_inc_by(1); 2841 } 2842 } 2843 if (RExC_parse == name_start || *RExC_parse != ch) { 2844 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ 2845 vFAIL2("Sequence %.3s... not terminated", backref_parse_start); 2846 } 2847 2848 if (sv_dat) { 2849 num = reg_add_data( pRExC_state, STR_WITH_LEN("S")); 2850 RExC_rxi->data->data[num]=(void*)sv_dat; 2851 SvREFCNT_inc_simple_void_NN(sv_dat); 2852 } 2853 RExC_sawback = 1; 2854 ret = reg2node(pRExC_state, 2855 ((! FOLD) 2856 ? REFN 2857 : (ASCII_FOLD_RESTRICTED) 2858 ? REFFAN 2859 : (AT_LEAST_UNI_SEMANTICS) 2860 ? REFFUN 2861 : (LOC) 2862 ? REFFLN 2863 : REFFN), 2864 num, RExC_nestroot); 2865 if (RExC_nestroot && num >= (U32)RExC_nestroot) 2866 FLAGS(REGNODE_p(ret)) = VOLATILE_REF; 2867 *flagp |= HASWIDTH; 2868 2869 nextchar(pRExC_state); 2870 return ret; 2871 } 2872 2873 /* reg_la_NOTHING() 2874 * 2875 * Maybe parse a parenthesized lookaround construct that is equivalent to a 2876 * NOTHING regop when the construct is empty. 2877 * 2878 * Calls skip_to_be_ignored_text() before checking if the construct is empty. 2879 * 2880 * Checks for unterminated constructs and throws a "not terminated" error 2881 * with the appropriate type if necessary 2882 * 2883 * Assuming it does not throw an exception increments RExC_seen_zerolen. 2884 * 2885 * If the construct is empty generates a NOTHING op and returns its 2886 * regnode_offset, which the caller would then return to its caller. 2887 * 2888 * If the construct is not empty increments RExC_in_lookaround, and turns 2889 * on any flags provided in RExC_seen, and then returns 0 to signify 2890 * that parsing should continue. 2891 * 2892 * PS: I would have called this reg_parse_lookaround_NOTHING() but then 2893 * any use of it would have had to be broken onto multiple lines, hence 2894 * the abbreviation. 2895 */ 2896 STATIC regnode_offset 2897 S_reg_la_NOTHING(pTHX_ RExC_state_t *pRExC_state, U32 flags, 2898 const char *type) 2899 { 2900 2901 PERL_ARGS_ASSERT_REG_LA_NOTHING; 2902 2903 /* false below so we do not force /x */ 2904 skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE); 2905 2906 if (RExC_parse >= RExC_end) 2907 vFAIL2("Sequence (%s... not terminated", type); 2908 2909 /* Always increment as NOTHING regops are zerolen */ 2910 RExC_seen_zerolen++; 2911 2912 if (*RExC_parse == ')') { 2913 regnode_offset ret= reg_node(pRExC_state, NOTHING); 2914 nextchar(pRExC_state); 2915 return ret; 2916 } 2917 2918 RExC_seen |= flags; 2919 RExC_in_lookaround++; 2920 return 0; /* keep parsing! */ 2921 } 2922 2923 /* reg_la_OPFAIL() 2924 * 2925 * Maybe parse a parenthesized lookaround construct that is equivalent to a 2926 * OPFAIL regop when the construct is empty. 2927 * 2928 * Calls skip_to_be_ignored_text() before checking if the construct is empty. 2929 * 2930 * Checks for unterminated constructs and throws a "not terminated" error 2931 * if necessary. 2932 * 2933 * If the construct is empty generates an OPFAIL op and returns its 2934 * regnode_offset which the caller should then return to its caller. 2935 * 2936 * If the construct is not empty increments RExC_in_lookaround, and also 2937 * increments RExC_seen_zerolen, and turns on the flags provided in 2938 * RExC_seen, and then returns 0 to signify that parsing should continue. 2939 * 2940 * PS: I would have called this reg_parse_lookaround_OPFAIL() but then 2941 * any use of it would have had to be broken onto multiple lines, hence 2942 * the abbreviation. 2943 */ 2944 2945 STATIC regnode_offset 2946 S_reg_la_OPFAIL(pTHX_ RExC_state_t *pRExC_state, U32 flags, 2947 const char *type) 2948 { 2949 2950 PERL_ARGS_ASSERT_REG_LA_OPFAIL; 2951 2952 /* FALSE so we don't force to /x below */; 2953 skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE); 2954 2955 if (RExC_parse >= RExC_end) 2956 vFAIL2("Sequence (%s... not terminated", type); 2957 2958 if (*RExC_parse == ')') { 2959 regnode_offset ret= reg1node(pRExC_state, OPFAIL, 0); 2960 nextchar(pRExC_state); 2961 return ret; /* return produced regop */ 2962 } 2963 2964 /* only increment zerolen *after* we check if we produce an OPFAIL 2965 * as an OPFAIL does not match a zero length construct, as it 2966 * does not match ever. */ 2967 RExC_seen_zerolen++; 2968 RExC_seen |= flags; 2969 RExC_in_lookaround++; 2970 return 0; /* keep parsing! */ 2971 } 2972 2973 /* Below are the main parsing routines. 2974 * 2975 * S_reg() parses a whole pattern or subpattern. It itself handles things 2976 * like the 'xyz' in '(?xyz:...)', and calls S_regbranch for each 2977 * alternation '|' in the '...' pattern. 2978 * S_regbranch() effectively implements the concatenation operator, handling 2979 * one alternative of '|', repeatedly calling S_regpiece on each 2980 * segment of the input. 2981 * S_regpiece() calls S_regatom to handle the next atomic chunk of the input, 2982 * and then adds any quantifier for that chunk. 2983 * S_regatom() parses the next chunk of the input, returning when it 2984 * determines it has found a complete atomic chunk. The chunk may 2985 * be a nested subpattern, in which case S_reg is called 2986 * recursively 2987 * 2988 * The functions generate regnodes as they go along, appending each to the 2989 * pattern data structure so far. They return the offset of the current final 2990 * node into that structure, or 0 on failure. 2991 * 2992 * There are three parameters common to all of them: 2993 * pRExC_state is a structure with much information about the current 2994 * state of the parse. It's easy to add new elements to 2995 * convey new information, but beware that an error return may 2996 * require clearing the element. 2997 * flagp is a pointer to bit flags set in a lower level to pass up 2998 * to higher levels information, such as the cause of a 2999 * failure, or some characteristic about the generated node 3000 * depth is roughly the recursion depth, mostly unused except for 3001 * pretty printing debugging info. 3002 * 3003 * There are ancillary functions that these may farm work out to, using the 3004 * same parameters. 3005 * 3006 * The protocol for handling flags is that each function will, before 3007 * returning, add into *flagp the flags it needs to pass up. Each function has 3008 * a second flags variable, typically named 'flags', which it sets and clears 3009 * at will. Flag bits in it are used in that function, and it calls the next 3010 * layer down with its 'flagp' parameter set to '&flags'. Thus, upon return, 3011 * 'flags' will contain whatever it had before the call, plus whatever that 3012 * function passed up. If it wants to pass any of these up to its caller, it 3013 * has to add them to its *flagp. This means that it takes extra steps to keep 3014 * passing a flag upwards, and otherwise the flag bit is cleared for higher 3015 * functions. 3016 */ 3017 3018 /* On success, returns the offset at which any next node should be placed into 3019 * the regex engine program being compiled. 3020 * 3021 * Returns 0 otherwise, with *flagp set to indicate why: 3022 * TRYAGAIN at the end of (?) that only sets flags. 3023 * RESTART_PARSE if the parse needs to be restarted, or'd with 3024 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8. 3025 * Otherwise would only return 0 if regbranch() returns 0, which cannot 3026 * happen. */ 3027 STATIC regnode_offset 3028 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) 3029 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter. 3030 * 2 is like 1, but indicates that nextchar() has been called to advance 3031 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and 3032 * this flag alerts us to the need to check for that */ 3033 { 3034 regnode_offset ret = 0; /* Will be the head of the group. */ 3035 regnode_offset br; 3036 regnode_offset lastbr; 3037 regnode_offset ender = 0; 3038 I32 logical_parno = 0; 3039 I32 parno = 0; 3040 I32 flags; 3041 U32 oregflags = RExC_flags; 3042 bool have_branch = 0; 3043 bool is_open = 0; 3044 I32 freeze_paren = 0; 3045 I32 after_freeze = 0; 3046 I32 num; /* numeric backreferences */ 3047 SV * max_open; /* Max number of unclosed parens */ 3048 I32 was_in_lookaround = RExC_in_lookaround; 3049 I32 fake_eval = 0; /* matches paren */ 3050 3051 /* The difference between the following variables can be seen with * 3052 * the broken pattern /(?:foo/ where segment_parse_start will point * 3053 * at the 'f', and reg_parse_start will point at the '(' */ 3054 3055 /* the following is used for unmatched '(' errors */ 3056 char * const reg_parse_start = RExC_parse; 3057 3058 /* the following is used to track where various segments of 3059 * the pattern that we parse out started. */ 3060 char * segment_parse_start = RExC_parse; 3061 3062 DECLARE_AND_GET_RE_DEBUG_FLAGS; 3063 3064 PERL_ARGS_ASSERT_REG; 3065 DEBUG_PARSE("reg "); 3066 3067 max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD); 3068 assert(max_open); 3069 if (!SvIOK(max_open)) { 3070 sv_setiv(max_open, RE_COMPILE_RECURSION_INIT); 3071 } 3072 if (depth > 4 * (UV) SvIV(max_open)) { /* We increase depth by 4 for each 3073 open paren */ 3074 vFAIL("Too many nested open parens"); 3075 } 3076 3077 *flagp = 0; /* Initialize. */ 3078 3079 /* Having this true makes it feasible to have a lot fewer tests for the 3080 * parse pointer being in scope. For example, we can write 3081 * while(isFOO(*RExC_parse)) RExC_parse_inc_by(1); 3082 * instead of 3083 * while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse_inc_by(1); 3084 */ 3085 assert(*RExC_end == '\0'); 3086 3087 /* Make an OPEN node, if parenthesized. */ 3088 if (paren) { 3089 3090 /* Under /x, space and comments can be gobbled up between the '(' and 3091 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such 3092 * intervening space, as the sequence is a token, and a token should be 3093 * indivisible */ 3094 bool has_intervening_patws = (paren == 2) 3095 && *(RExC_parse - 1) != '('; 3096 3097 if (RExC_parse >= RExC_end) { 3098 vFAIL("Unmatched ("); 3099 } 3100 3101 if (paren == 'r') { /* Atomic script run */ 3102 paren = '>'; 3103 goto parse_rest; 3104 } 3105 else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */ 3106 if (RExC_parse[1] == '{') { /* (*{ ... }) optimistic EVAL */ 3107 fake_eval = '{'; 3108 goto handle_qmark; 3109 } 3110 3111 char *start_verb = RExC_parse + 1; 3112 STRLEN verb_len; 3113 char *start_arg = NULL; 3114 unsigned char op = 0; 3115 int arg_required = 0; 3116 int internal_argval = -1; /* if > -1 no argument allowed */ 3117 bool has_upper = FALSE; 3118 U32 seen_flag_set = 0; /* RExC_seen flags we must set */ 3119 3120 if (has_intervening_patws) { 3121 RExC_parse_inc_by(1); /* past the '*' */ 3122 3123 /* For strict backwards compatibility, don't change the message 3124 * now that we also have lowercase operands */ 3125 if (isUPPER(*RExC_parse)) { 3126 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent"); 3127 } 3128 else { 3129 vFAIL("In '(*...)', the '(' and '*' must be adjacent"); 3130 } 3131 } 3132 while (RExC_parse < RExC_end && *RExC_parse != ')' ) { 3133 if ( *RExC_parse == ':' ) { 3134 start_arg = RExC_parse + 1; 3135 break; 3136 } 3137 else if (! UTF) { 3138 if (isUPPER(*RExC_parse)) { 3139 has_upper = TRUE; 3140 } 3141 RExC_parse_inc_by(1); 3142 } 3143 else { 3144 RExC_parse_inc_utf8(); 3145 } 3146 } 3147 verb_len = RExC_parse - start_verb; 3148 if ( start_arg ) { 3149 if (RExC_parse >= RExC_end) { 3150 goto unterminated_verb_pattern; 3151 } 3152 3153 RExC_parse_inc(); 3154 while ( RExC_parse < RExC_end && *RExC_parse != ')' ) { 3155 RExC_parse_inc(); 3156 } 3157 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) { 3158 unterminated_verb_pattern: 3159 if (has_upper) { 3160 vFAIL("Unterminated verb pattern argument"); 3161 } 3162 else { 3163 vFAIL("Unterminated '(*...' argument"); 3164 } 3165 } 3166 } else { 3167 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) { 3168 if (has_upper) { 3169 vFAIL("Unterminated verb pattern"); 3170 } 3171 else { 3172 vFAIL("Unterminated '(*...' construct"); 3173 } 3174 } 3175 } 3176 3177 /* Here, we know that RExC_parse < RExC_end */ 3178 3179 switch ( *start_verb ) { 3180 case 'A': /* (*ACCEPT) */ 3181 if ( memEQs(start_verb, verb_len,"ACCEPT") ) { 3182 op = ACCEPT; 3183 internal_argval = RExC_nestroot; 3184 } 3185 break; 3186 case 'C': /* (*COMMIT) */ 3187 if ( memEQs(start_verb, verb_len,"COMMIT") ) 3188 op = COMMIT; 3189 break; 3190 case 'F': /* (*FAIL) */ 3191 if ( verb_len==1 || memEQs(start_verb, verb_len,"FAIL") ) { 3192 op = OPFAIL; 3193 } 3194 break; 3195 case ':': /* (*:NAME) */ 3196 case 'M': /* (*MARK:NAME) */ 3197 if ( verb_len==0 || memEQs(start_verb, verb_len,"MARK") ) { 3198 op = MARKPOINT; 3199 arg_required = 1; 3200 } 3201 break; 3202 case 'P': /* (*PRUNE) */ 3203 if ( memEQs(start_verb, verb_len,"PRUNE") ) 3204 op = PRUNE; 3205 break; 3206 case 'S': /* (*SKIP) */ 3207 if ( memEQs(start_verb, verb_len,"SKIP") ) 3208 op = SKIP; 3209 break; 3210 case 'T': /* (*THEN) */ 3211 /* [19:06] <TimToady> :: is then */ 3212 if ( memEQs(start_verb, verb_len,"THEN") ) { 3213 op = CUTGROUP; 3214 RExC_seen |= REG_CUTGROUP_SEEN; 3215 } 3216 break; 3217 case 'a': 3218 if ( memEQs(start_verb, verb_len, "asr") 3219 || memEQs(start_verb, verb_len, "atomic_script_run")) 3220 { 3221 paren = 'r'; /* Mnemonic: recursed run */ 3222 goto script_run; 3223 } 3224 else if (memEQs(start_verb, verb_len, "atomic")) { 3225 paren = 't'; /* AtOMIC */ 3226 goto alpha_assertions; 3227 } 3228 break; 3229 case 'p': 3230 if ( memEQs(start_verb, verb_len, "plb") 3231 || memEQs(start_verb, verb_len, "positive_lookbehind")) 3232 { 3233 paren = 'b'; 3234 goto lookbehind_alpha_assertions; 3235 } 3236 else if ( memEQs(start_verb, verb_len, "pla") 3237 || memEQs(start_verb, verb_len, "positive_lookahead")) 3238 { 3239 paren = 'a'; 3240 goto alpha_assertions; 3241 } 3242 break; 3243 case 'n': 3244 if ( memEQs(start_verb, verb_len, "nlb") 3245 || memEQs(start_verb, verb_len, "negative_lookbehind")) 3246 { 3247 paren = 'B'; 3248 goto lookbehind_alpha_assertions; 3249 } 3250 else if ( memEQs(start_verb, verb_len, "nla") 3251 || memEQs(start_verb, verb_len, "negative_lookahead")) 3252 { 3253 paren = 'A'; 3254 goto alpha_assertions; 3255 } 3256 break; 3257 case 's': 3258 if ( memEQs(start_verb, verb_len, "sr") 3259 || memEQs(start_verb, verb_len, "script_run")) 3260 { 3261 regnode_offset atomic; 3262 3263 paren = 's'; 3264 3265 script_run: 3266 3267 /* This indicates Unicode rules. */ 3268 REQUIRE_UNI_RULES(flagp, 0); 3269 3270 if (! start_arg) { 3271 goto no_colon; 3272 } 3273 3274 RExC_parse_set(start_arg); 3275 3276 if (RExC_in_script_run) { 3277 3278 /* Nested script runs are treated as no-ops, because 3279 * if the nested one fails, the outer one must as 3280 * well. It could fail sooner, and avoid (??{} with 3281 * side effects, but that is explicitly documented as 3282 * undefined behavior. */ 3283 3284 ret = 0; 3285 3286 if (paren == 's') { 3287 paren = ':'; 3288 goto parse_rest; 3289 } 3290 3291 /* But, the atomic part of a nested atomic script run 3292 * isn't a no-op, but can be treated just like a '(?>' 3293 * */ 3294 paren = '>'; 3295 goto parse_rest; 3296 } 3297 3298 if (paren == 's') { 3299 /* Here, we're starting a new regular script run */ 3300 ret = reg_node(pRExC_state, SROPEN); 3301 RExC_in_script_run = 1; 3302 is_open = 1; 3303 goto parse_rest; 3304 } 3305 3306 /* Here, we are starting an atomic script run. This is 3307 * handled by recursing to deal with the atomic portion 3308 * separately, enclosed in SROPEN ... SRCLOSE nodes */ 3309 3310 ret = reg_node(pRExC_state, SROPEN); 3311 3312 RExC_in_script_run = 1; 3313 3314 atomic = reg(pRExC_state, 'r', &flags, depth); 3315 if (flags & (RESTART_PARSE|NEED_UTF8)) { 3316 *flagp = flags & (RESTART_PARSE|NEED_UTF8); 3317 return 0; 3318 } 3319 3320 if (! REGTAIL(pRExC_state, ret, atomic)) { 3321 REQUIRE_BRANCHJ(flagp, 0); 3322 } 3323 3324 if (! REGTAIL(pRExC_state, atomic, reg_node(pRExC_state, 3325 SRCLOSE))) 3326 { 3327 REQUIRE_BRANCHJ(flagp, 0); 3328 } 3329 3330 RExC_in_script_run = 0; 3331 return ret; 3332 } 3333 3334 break; 3335 3336 lookbehind_alpha_assertions: 3337 seen_flag_set = REG_LOOKBEHIND_SEEN; 3338 /*FALLTHROUGH*/ 3339 3340 alpha_assertions: 3341 3342 if ( !start_arg ) { 3343 goto no_colon; 3344 } 3345 3346 if ( RExC_parse == start_arg ) { 3347 if ( paren == 'A' || paren == 'B' ) { 3348 /* An empty negative lookaround assertion is failure. 3349 * See also: S_reg_la_OPFAIL() */ 3350 3351 /* Note: OPFAIL is *not* zerolen. */ 3352 ret = reg1node(pRExC_state, OPFAIL, 0); 3353 nextchar(pRExC_state); 3354 return ret; 3355 } 3356 else 3357 if ( paren == 'a' || paren == 'b' ) { 3358 /* An empty positive lookaround assertion is success. 3359 * See also: S_reg_la_NOTHING() */ 3360 3361 /* Note: NOTHING is zerolen, so increment here */ 3362 RExC_seen_zerolen++; 3363 ret = reg_node(pRExC_state, NOTHING); 3364 nextchar(pRExC_state); 3365 return ret; 3366 } 3367 } 3368 3369 RExC_seen_zerolen++; 3370 RExC_in_lookaround++; 3371 RExC_seen |= seen_flag_set; 3372 3373 RExC_parse_set(start_arg); 3374 goto parse_rest; 3375 3376 no_colon: 3377 vFAIL2utf8f( "'(*%" UTF8f "' requires a terminating ':'", 3378 UTF8fARG(UTF, verb_len, start_verb)); 3379 NOT_REACHED; /*NOTREACHED*/ 3380 3381 } /* End of switch */ 3382 if ( ! op ) { 3383 RExC_parse_inc_safe(); 3384 if (has_upper || verb_len == 0) { 3385 vFAIL2utf8f( "Unknown verb pattern '%" UTF8f "'", 3386 UTF8fARG(UTF, verb_len, start_verb)); 3387 } 3388 else { 3389 vFAIL2utf8f( "Unknown '(*...)' construct '%" UTF8f "'", 3390 UTF8fARG(UTF, verb_len, start_verb)); 3391 } 3392 } 3393 if ( RExC_parse == start_arg ) { 3394 start_arg = NULL; 3395 } 3396 if ( arg_required && !start_arg ) { 3397 vFAIL3( "Verb pattern '%.*s' has a mandatory argument", 3398 (int) verb_len, start_verb); 3399 } 3400 if (internal_argval == -1) { 3401 ret = reg1node(pRExC_state, op, 0); 3402 } else { 3403 ret = reg2node(pRExC_state, op, 0, internal_argval); 3404 } 3405 RExC_seen |= REG_VERBARG_SEEN; 3406 if (start_arg) { 3407 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg); 3408 ARG1u(REGNODE_p(ret)) = reg_add_data( pRExC_state, 3409 STR_WITH_LEN("S")); 3410 RExC_rxi->data->data[ARG1u(REGNODE_p(ret))]=(void*)sv; 3411 FLAGS(REGNODE_p(ret)) = 1; 3412 } else { 3413 FLAGS(REGNODE_p(ret)) = 0; 3414 } 3415 if ( internal_argval != -1 ) 3416 ARG2i_SET(REGNODE_p(ret), internal_argval); 3417 nextchar(pRExC_state); 3418 return ret; 3419 } 3420 else if (*RExC_parse == '?') { /* (?...) */ 3421 handle_qmark: 3422 ; /* make sure the label has a statement associated with it*/ 3423 bool is_logical = 0, is_optimistic = 0; 3424 const char * const seqstart = RExC_parse; 3425 const char * endptr; 3426 const char non_existent_group_msg[] 3427 = "Reference to nonexistent group"; 3428 const char impossible_group[] = "Invalid reference to group"; 3429 3430 if (has_intervening_patws) { 3431 RExC_parse_inc_by(1); 3432 vFAIL("In '(?...)', the '(' and '?' must be adjacent"); 3433 } 3434 3435 RExC_parse_inc_by(1); /* past the '?' */ 3436 if (!fake_eval) { 3437 paren = *RExC_parse; /* might be a trailing NUL, if not 3438 well-formed */ 3439 is_optimistic = 0; 3440 } else { 3441 is_optimistic = 1; 3442 paren = fake_eval; 3443 } 3444 RExC_parse_inc(); 3445 if (RExC_parse > RExC_end) { 3446 paren = '\0'; 3447 } 3448 ret = 0; /* For look-ahead/behind. */ 3449 switch (paren) { 3450 3451 case 'P': /* (?P...) variants for those used to PCRE/Python */ 3452 paren = *RExC_parse; 3453 if ( paren == '<') { /* (?P<...>) named capture */ 3454 RExC_parse_inc_by(1); 3455 if (RExC_parse >= RExC_end) { 3456 vFAIL("Sequence (?P<... not terminated"); 3457 } 3458 goto named_capture; 3459 } 3460 else if (paren == '>') { /* (?P>name) named recursion */ 3461 RExC_parse_inc_by(1); 3462 if (RExC_parse >= RExC_end) { 3463 vFAIL("Sequence (?P>... not terminated"); 3464 } 3465 goto named_recursion; 3466 } 3467 else if (paren == '=') { /* (?P=...) named backref */ 3468 RExC_parse_inc_by(1); 3469 return handle_named_backref(pRExC_state, flagp, 3470 segment_parse_start, ')'); 3471 } 3472 RExC_parse_inc_if_char(); 3473 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ 3474 vFAIL3("Sequence (%.*s...) not recognized", 3475 (int) (RExC_parse - seqstart), seqstart); 3476 NOT_REACHED; /*NOTREACHED*/ 3477 case '<': /* (?<...) */ 3478 /* If you want to support (?<*...), first reconcile with GH #17363 */ 3479 if (*RExC_parse == '!') { 3480 paren = ','; /* negative lookbehind (?<! ... ) */ 3481 RExC_parse_inc_by(1); 3482 if ((ret= reg_la_OPFAIL(pRExC_state,REG_LB_SEEN,"?<!"))) 3483 return ret; 3484 break; 3485 } 3486 else 3487 if (*RExC_parse == '=') { 3488 /* paren = '<' - negative lookahead (?<= ... ) */ 3489 RExC_parse_inc_by(1); 3490 if ((ret= reg_la_NOTHING(pRExC_state,REG_LB_SEEN,"?<="))) 3491 return ret; 3492 break; 3493 } 3494 else 3495 named_capture: 3496 { /* (?<...>) */ 3497 char *name_start; 3498 SV *svname; 3499 paren= '>'; 3500 /* FALLTHROUGH */ 3501 case '\'': /* (?'...') */ 3502 name_start = RExC_parse; 3503 svname = reg_scan_name(pRExC_state, REG_RSN_RETURN_NAME); 3504 if ( RExC_parse == name_start 3505 || RExC_parse >= RExC_end 3506 || *RExC_parse != paren) 3507 { 3508 vFAIL2("Sequence (?%c... not terminated", 3509 paren=='>' ? '<' : (char) paren); 3510 } 3511 { 3512 HE *he_str; 3513 SV *sv_dat = NULL; 3514 if (!svname) /* shouldn't happen */ 3515 Perl_croak(aTHX_ 3516 "panic: reg_scan_name returned NULL"); 3517 if (!RExC_paren_names) { 3518 RExC_paren_names= newHV(); 3519 sv_2mortal(MUTABLE_SV(RExC_paren_names)); 3520 #ifdef DEBUGGING 3521 RExC_paren_name_list= newAV(); 3522 sv_2mortal(MUTABLE_SV(RExC_paren_name_list)); 3523 #endif 3524 } 3525 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 ); 3526 if ( he_str ) 3527 sv_dat = HeVAL(he_str); 3528 if ( ! sv_dat ) { 3529 /* croak baby croak */ 3530 Perl_croak(aTHX_ 3531 "panic: paren_name hash element allocation failed"); 3532 } else if ( SvPOK(sv_dat) ) { 3533 /* (?|...) can mean we have dupes so scan to check 3534 its already been stored. Maybe a flag indicating 3535 we are inside such a construct would be useful, 3536 but the arrays are likely to be quite small, so 3537 for now we punt -- dmq */ 3538 IV count = SvIV(sv_dat); 3539 I32 *pv = (I32*)SvPVX(sv_dat); 3540 IV i; 3541 for ( i = 0 ; i < count ; i++ ) { 3542 if ( pv[i] == RExC_npar ) { 3543 count = 0; 3544 break; 3545 } 3546 } 3547 if ( count ) { 3548 pv = (I32*)SvGROW(sv_dat, 3549 SvCUR(sv_dat) + sizeof(I32)+1); 3550 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32)); 3551 pv[count] = RExC_npar; 3552 SvIV_set(sv_dat, SvIVX(sv_dat) + 1); 3553 } 3554 } else { 3555 (void)SvUPGRADE(sv_dat, SVt_PVNV); 3556 sv_setpvn(sv_dat, (char *)&(RExC_npar), 3557 sizeof(I32)); 3558 SvIOK_on(sv_dat); 3559 SvIV_set(sv_dat, 1); 3560 } 3561 #ifdef DEBUGGING 3562 /* No, this does not cause a memory leak under 3563 * debugging. RExC_paren_name_list is freed later 3564 * on in the dump process. - Yves 3565 */ 3566 if (!av_store(RExC_paren_name_list, 3567 RExC_npar, SvREFCNT_inc_NN(svname))) 3568 SvREFCNT_dec_NN(svname); 3569 #endif 3570 3571 } 3572 nextchar(pRExC_state); 3573 paren = 1; 3574 goto capturing_parens; 3575 } 3576 NOT_REACHED; /*NOTREACHED*/ 3577 case '=': /* (?=...) */ 3578 if ((ret= reg_la_NOTHING(pRExC_state, 0, "?="))) 3579 return ret; 3580 break; 3581 case '!': /* (?!...) */ 3582 if ((ret= reg_la_OPFAIL(pRExC_state, 0, "?!"))) 3583 return ret; 3584 break; 3585 case '|': /* (?|...) */ 3586 /* branch reset, behave like a (?:...) except that 3587 buffers in alternations share the same numbers */ 3588 paren = ':'; 3589 after_freeze = freeze_paren = RExC_logical_npar; 3590 3591 /* XXX This construct currently requires an extra pass. 3592 * Investigation would be required to see if that could be 3593 * changed */ 3594 REQUIRE_PARENS_PASS; 3595 break; 3596 case ':': /* (?:...) */ 3597 case '>': /* (?>...) */ 3598 break; 3599 case '$': /* (?$...) */ 3600 case '@': /* (?@...) */ 3601 vFAIL2("Sequence (?%c...) not implemented", (int)paren); 3602 break; 3603 case '0' : /* (?0) */ 3604 case 'R' : /* (?R) */ 3605 if (RExC_parse == RExC_end || *RExC_parse != ')') 3606 FAIL("Sequence (?R) not terminated"); 3607 num = 0; 3608 RExC_seen |= REG_RECURSE_SEEN; 3609 3610 /* XXX These constructs currently require an extra pass. 3611 * It probably could be changed */ 3612 REQUIRE_PARENS_PASS; 3613 3614 *flagp |= POSTPONED; 3615 goto gen_recurse_regop; 3616 /*notreached*/ 3617 /* named and numeric backreferences */ 3618 case '&': /* (?&NAME) */ 3619 segment_parse_start = RExC_parse - 1; 3620 named_recursion: 3621 { 3622 SV *sv_dat = reg_scan_name(pRExC_state, 3623 REG_RSN_RETURN_DATA); 3624 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; 3625 } 3626 if (RExC_parse >= RExC_end || *RExC_parse != ')') 3627 vFAIL("Sequence (?&... not terminated"); 3628 goto gen_recurse_regop; 3629 /* NOTREACHED */ 3630 case '+': 3631 if (! inRANGE(RExC_parse[0], '1', '9')) { 3632 RExC_parse_inc_by(1); 3633 vFAIL("Illegal pattern"); 3634 } 3635 goto parse_recursion; 3636 /* NOTREACHED*/ 3637 case '-': /* (?-1) */ 3638 if (! inRANGE(RExC_parse[0], '1', '9')) { 3639 RExC_parse--; /* rewind to let it be handled later */ 3640 goto parse_flags; 3641 } 3642 /* FALLTHROUGH */ 3643 case '1': case '2': case '3': case '4': /* (?1) */ 3644 case '5': case '6': case '7': case '8': case '9': 3645 RExC_parse_set((char *) seqstart + 1); /* Point to the digit */ 3646 parse_recursion: 3647 { 3648 bool is_neg = FALSE; 3649 UV unum; 3650 segment_parse_start = RExC_parse - 1; 3651 if (*RExC_parse == '-') { 3652 RExC_parse_inc_by(1); 3653 is_neg = TRUE; 3654 } 3655 endptr = RExC_end; 3656 if (grok_atoUV(RExC_parse, &unum, &endptr) 3657 && unum <= I32_MAX 3658 ) { 3659 num = (I32)unum; 3660 RExC_parse_set((char*)endptr); 3661 } 3662 else { /* Overflow, or something like that. Position 3663 beyond all digits for the message */ 3664 while (RExC_parse < RExC_end && isDIGIT(*RExC_parse)) { 3665 RExC_parse_inc_by(1); 3666 } 3667 vFAIL(impossible_group); 3668 } 3669 if (is_neg) { 3670 /* -num is always representable on 1 and 2's complement 3671 * machines */ 3672 num = -num; 3673 } 3674 } 3675 if (*RExC_parse!=')') 3676 vFAIL("Expecting close bracket"); 3677 3678 if (paren == '-' || paren == '+') { 3679 3680 /* Don't overflow */ 3681 if (UNLIKELY(I32_MAX - RExC_npar < num)) { 3682 RExC_parse_inc_by(1); 3683 vFAIL(impossible_group); 3684 } 3685 3686 /* 3687 Diagram of capture buffer numbering. 3688 Top line is the normal capture buffer numbers 3689 Bottom line is the negative indexing as from 3690 the X (the (?-2)) 3691 3692 1 2 3 4 5 X Y 6 7 3693 /(a(x)y)(a(b(c(?+2)d)e)f)(g(h))/ 3694 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/ 3695 - 5 4 3 2 1 X Y x x 3696 3697 Resolve to absolute group. Recall that RExC_npar is +1 of 3698 the actual parenthesis group number. For lookahead, we 3699 have to compensate for that. Using the above example, when 3700 we get to Y in the parse, num is 2 and RExC_npar is 6. We 3701 want 7 for +2, and 4 for -2. 3702 */ 3703 if ( paren == '+' ) { 3704 num--; 3705 } 3706 3707 num += RExC_npar; 3708 3709 if (paren == '-' && num < 1) { 3710 RExC_parse_inc_by(1); 3711 vFAIL(non_existent_group_msg); 3712 } 3713 } 3714 else 3715 if (num && num < RExC_logical_npar) { 3716 num = RExC_logical_to_parno[num]; 3717 } 3718 else 3719 if (ALL_PARENS_COUNTED) { 3720 if (num < RExC_logical_total_parens) { 3721 num = RExC_logical_to_parno[num]; 3722 } 3723 else { 3724 RExC_parse_inc_by(1); 3725 vFAIL(non_existent_group_msg); 3726 } 3727 } 3728 else { 3729 REQUIRE_PARENS_PASS; 3730 } 3731 3732 3733 gen_recurse_regop: 3734 if (num >= RExC_npar) { 3735 3736 /* It might be a forward reference; we can't fail until we 3737 * know, by completing the parse to get all the groups, and 3738 * then reparsing */ 3739 if (ALL_PARENS_COUNTED) { 3740 if (num >= RExC_total_parens) { 3741 RExC_parse_inc_by(1); 3742 vFAIL(non_existent_group_msg); 3743 } 3744 } 3745 else { 3746 REQUIRE_PARENS_PASS; 3747 } 3748 } 3749 3750 /* We keep track how many GOSUB items we have produced. 3751 To start off the ARG2i() of the GOSUB holds its "id", 3752 which is used later in conjunction with RExC_recurse 3753 to calculate the offset we need to jump for the GOSUB, 3754 which it will store in the final representation. 3755 We have to defer the actual calculation until much later 3756 as the regop may move. 3757 */ 3758 ret = reg2node(pRExC_state, GOSUB, num, RExC_recurse_count); 3759 RExC_recurse_count++; 3760 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ 3761 "%*s%*s Recurse #%" UVuf " to %" IVdf "\n", 3762 22, "| |", (int)(depth * 2 + 1), "", 3763 (UV)ARG1u(REGNODE_p(ret)), 3764 (IV)ARG2i(REGNODE_p(ret)))); 3765 RExC_seen |= REG_RECURSE_SEEN; 3766 3767 *flagp |= POSTPONED; 3768 assert(*RExC_parse == ')'); 3769 nextchar(pRExC_state); 3770 return ret; 3771 3772 /* NOTREACHED */ 3773 3774 case '?': /* (??...) */ 3775 is_logical = 1; 3776 if (*RExC_parse != '{') { 3777 RExC_parse_inc_if_char(); 3778 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ 3779 vFAIL2utf8f( 3780 "Sequence (%" UTF8f "...) not recognized", 3781 UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); 3782 NOT_REACHED; /*NOTREACHED*/ 3783 } 3784 *flagp |= POSTPONED; 3785 paren = '{'; 3786 RExC_parse_inc_by(1); 3787 /* FALLTHROUGH */ 3788 case '{': /* (?{...}) */ 3789 { 3790 U32 n = 0; 3791 struct reg_code_block *cb; 3792 OP * o; 3793 3794 RExC_seen_zerolen++; 3795 3796 if ( !pRExC_state->code_blocks 3797 || pRExC_state->code_index 3798 >= pRExC_state->code_blocks->count 3799 || pRExC_state->code_blocks->cb[pRExC_state->code_index].start 3800 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0)) 3801 - RExC_start) 3802 ) { 3803 if (RExC_pm_flags & PMf_USE_RE_EVAL) 3804 FAIL("panic: Sequence (?{...}): no code block found\n"); 3805 FAIL("Eval-group not allowed at runtime, use re 'eval'"); 3806 } 3807 /* this is a pre-compiled code block (?{...}) */ 3808 cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index]; 3809 RExC_parse_set(RExC_start + cb->end); 3810 o = cb->block; 3811 if (cb->src_regex) { 3812 n = reg_add_data(pRExC_state, STR_WITH_LEN("rl")); 3813 RExC_rxi->data->data[n] = 3814 (void*)SvREFCNT_inc((SV*)cb->src_regex); 3815 RExC_rxi->data->data[n+1] = (void*)o; 3816 } 3817 else { 3818 n = reg_add_data(pRExC_state, 3819 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1); 3820 RExC_rxi->data->data[n] = (void*)o; 3821 } 3822 pRExC_state->code_index++; 3823 nextchar(pRExC_state); 3824 if (!is_optimistic) 3825 RExC_seen |= REG_PESSIMIZE_SEEN; 3826 3827 if (is_logical) { 3828 regnode_offset eval; 3829 ret = reg_node(pRExC_state, LOGICAL); 3830 FLAGS(REGNODE_p(ret)) = 2; 3831 3832 eval = reg2node(pRExC_state, EVAL, 3833 n, 3834 3835 /* for later propagation into (??{}) 3836 * return value */ 3837 RExC_flags & RXf_PMf_COMPILETIME 3838 ); 3839 FLAGS(REGNODE_p(eval)) = is_optimistic * EVAL_OPTIMISTIC_FLAG; 3840 if (! REGTAIL(pRExC_state, ret, eval)) { 3841 REQUIRE_BRANCHJ(flagp, 0); 3842 } 3843 return ret; 3844 } 3845 ret = reg2node(pRExC_state, EVAL, n, 0); 3846 FLAGS(REGNODE_p(ret)) = is_optimistic * EVAL_OPTIMISTIC_FLAG; 3847 3848 return ret; 3849 } 3850 case '(': /* (?(?{...})...) and (?(?=...)...) */ 3851 { 3852 int is_define= 0; 3853 const int DEFINE_len = sizeof("DEFINE") - 1; 3854 if ( RExC_parse < RExC_end - 1 3855 && ( ( RExC_parse[0] == '?' /* (?(?...)) */ 3856 && ( RExC_parse[1] == '=' 3857 || RExC_parse[1] == '!' 3858 || RExC_parse[1] == '<' 3859 || RExC_parse[1] == '{')) 3860 || ( RExC_parse[0] == '*' /* (?(*...)) */ 3861 && ( RExC_parse[1] == '{' 3862 || ( memBEGINs(RExC_parse + 1, 3863 (Size_t) (RExC_end - (RExC_parse + 1)), 3864 "pla:") 3865 || memBEGINs(RExC_parse + 1, 3866 (Size_t) (RExC_end - (RExC_parse + 1)), 3867 "plb:") 3868 || memBEGINs(RExC_parse + 1, 3869 (Size_t) (RExC_end - (RExC_parse + 1)), 3870 "nla:") 3871 || memBEGINs(RExC_parse + 1, 3872 (Size_t) (RExC_end - (RExC_parse + 1)), 3873 "nlb:") 3874 || memBEGINs(RExC_parse + 1, 3875 (Size_t) (RExC_end - (RExC_parse + 1)), 3876 "positive_lookahead:") 3877 || memBEGINs(RExC_parse + 1, 3878 (Size_t) (RExC_end - (RExC_parse + 1)), 3879 "positive_lookbehind:") 3880 || memBEGINs(RExC_parse + 1, 3881 (Size_t) (RExC_end - (RExC_parse + 1)), 3882 "negative_lookahead:") 3883 || memBEGINs(RExC_parse + 1, 3884 (Size_t) (RExC_end - (RExC_parse + 1)), 3885 "negative_lookbehind:"))))) 3886 ) { /* Lookahead or eval. */ 3887 I32 flag; 3888 regnode_offset tail; 3889 3890 ret = reg_node(pRExC_state, LOGICAL); 3891 FLAGS(REGNODE_p(ret)) = 1; 3892 3893 tail = reg(pRExC_state, 1, &flag, depth+1); 3894 RETURN_FAIL_ON_RESTART(flag, flagp); 3895 if (! REGTAIL(pRExC_state, ret, tail)) { 3896 REQUIRE_BRANCHJ(flagp, 0); 3897 } 3898 goto insert_if; 3899 } 3900 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */ 3901 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */ 3902 { 3903 char ch = RExC_parse[0] == '<' ? '>' : '\''; 3904 char *name_start= RExC_parse; 3905 RExC_parse_inc_by(1); 3906 U32 num = 0; 3907 SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA); 3908 if ( RExC_parse == name_start 3909 || RExC_parse >= RExC_end 3910 || *RExC_parse != ch) 3911 { 3912 vFAIL2("Sequence (?(%c... not terminated", 3913 (ch == '>' ? '<' : ch)); 3914 } 3915 RExC_parse_inc_by(1); 3916 if (sv_dat) { 3917 num = reg_add_data( pRExC_state, STR_WITH_LEN("S")); 3918 RExC_rxi->data->data[num]=(void*)sv_dat; 3919 SvREFCNT_inc_simple_void_NN(sv_dat); 3920 } 3921 ret = reg1node(pRExC_state, GROUPPN, num); 3922 goto insert_if_check_paren; 3923 } 3924 else if (memBEGINs(RExC_parse, 3925 (STRLEN) (RExC_end - RExC_parse), 3926 "DEFINE")) 3927 { 3928 ret = reg1node(pRExC_state, DEFINEP, 0); 3929 RExC_parse_inc_by(DEFINE_len); 3930 is_define = 1; 3931 goto insert_if_check_paren; 3932 } 3933 else if (RExC_parse[0] == 'R') { 3934 RExC_parse_inc_by(1); 3935 /* parno == 0 => /(?(R)YES|NO)/ "in any form of recursion OR eval" 3936 * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)" 3937 * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)" 3938 */ 3939 parno = 0; 3940 if (RExC_parse[0] == '0') { 3941 parno = 1; 3942 RExC_parse_inc_by(1); 3943 } 3944 else if (inRANGE(RExC_parse[0], '1', '9')) { 3945 UV uv; 3946 endptr = RExC_end; 3947 if (grok_atoUV(RExC_parse, &uv, &endptr) 3948 && uv <= I32_MAX 3949 ) { 3950 parno = (I32)uv + 1; 3951 RExC_parse_set((char*)endptr); 3952 } 3953 /* else "Switch condition not recognized" below */ 3954 } else if (RExC_parse[0] == '&') { 3955 SV *sv_dat; 3956 RExC_parse_inc_by(1); 3957 sv_dat = reg_scan_name(pRExC_state, 3958 REG_RSN_RETURN_DATA); 3959 if (sv_dat) 3960 parno = 1 + *((I32 *)SvPVX(sv_dat)); 3961 } 3962 ret = reg1node(pRExC_state, INSUBP, parno); 3963 goto insert_if_check_paren; 3964 } 3965 else if (inRANGE(RExC_parse[0], '1', '9')) { 3966 /* (?(1)...) */ 3967 char c; 3968 UV uv; 3969 endptr = RExC_end; 3970 if (grok_atoUV(RExC_parse, &uv, &endptr) 3971 && uv <= I32_MAX 3972 ) { 3973 parno = (I32)uv; 3974 RExC_parse_set((char*)endptr); 3975 } 3976 else { 3977 vFAIL("panic: grok_atoUV returned FALSE"); 3978 } 3979 ret = reg1node(pRExC_state, GROUPP, parno); 3980 3981 insert_if_check_paren: 3982 if (UCHARAT(RExC_parse) != ')') { 3983 RExC_parse_inc_safe(); 3984 vFAIL("Switch condition not recognized"); 3985 } 3986 nextchar(pRExC_state); 3987 insert_if: 3988 if (! REGTAIL(pRExC_state, ret, reg1node(pRExC_state, 3989 IFTHEN, 0))) 3990 { 3991 REQUIRE_BRANCHJ(flagp, 0); 3992 } 3993 br = regbranch(pRExC_state, &flags, 1, depth+1); 3994 if (br == 0) { 3995 RETURN_FAIL_ON_RESTART(flags,flagp); 3996 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, 3997 (UV) flags); 3998 } else 3999 if (! REGTAIL(pRExC_state, br, reg1node(pRExC_state, 4000 LONGJMP, 0))) 4001 { 4002 REQUIRE_BRANCHJ(flagp, 0); 4003 } 4004 c = UCHARAT(RExC_parse); 4005 nextchar(pRExC_state); 4006 if (flags&HASWIDTH) 4007 *flagp |= HASWIDTH; 4008 if (c == '|') { 4009 if (is_define) 4010 vFAIL("(?(DEFINE)....) does not allow branches"); 4011 4012 /* Fake one for optimizer. */ 4013 lastbr = reg1node(pRExC_state, IFTHEN, 0); 4014 4015 if (!regbranch(pRExC_state, &flags, 1, depth+1)) { 4016 RETURN_FAIL_ON_RESTART(flags, flagp); 4017 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, 4018 (UV) flags); 4019 } 4020 if (! REGTAIL(pRExC_state, ret, lastbr)) { 4021 REQUIRE_BRANCHJ(flagp, 0); 4022 } 4023 if (flags&HASWIDTH) 4024 *flagp |= HASWIDTH; 4025 c = UCHARAT(RExC_parse); 4026 nextchar(pRExC_state); 4027 } 4028 else 4029 lastbr = 0; 4030 if (c != ')') { 4031 if (RExC_parse >= RExC_end) 4032 vFAIL("Switch (?(condition)... not terminated"); 4033 else 4034 vFAIL("Switch (?(condition)... contains too many branches"); 4035 } 4036 ender = reg_node(pRExC_state, TAIL); 4037 if (! REGTAIL(pRExC_state, br, ender)) { 4038 REQUIRE_BRANCHJ(flagp, 0); 4039 } 4040 if (lastbr) { 4041 if (! REGTAIL(pRExC_state, lastbr, ender)) { 4042 REQUIRE_BRANCHJ(flagp, 0); 4043 } 4044 if (! REGTAIL(pRExC_state, 4045 REGNODE_OFFSET( 4046 REGNODE_AFTER(REGNODE_p(lastbr))), 4047 ender)) 4048 { 4049 REQUIRE_BRANCHJ(flagp, 0); 4050 } 4051 } 4052 else 4053 if (! REGTAIL(pRExC_state, ret, ender)) { 4054 REQUIRE_BRANCHJ(flagp, 0); 4055 } 4056 #if 0 /* Removing this doesn't cause failures in the test suite -- khw */ 4057 RExC_size++; /* XXX WHY do we need this?!! 4058 For large programs it seems to be required 4059 but I can't figure out why. -- dmq*/ 4060 #endif 4061 return ret; 4062 } 4063 RExC_parse_inc_safe(); 4064 vFAIL("Unknown switch condition (?(...))"); 4065 } 4066 case '[': /* (?[ ... ]) */ 4067 return handle_regex_sets(pRExC_state, NULL, flagp, depth+1); 4068 case 0: /* A NUL */ 4069 RExC_parse--; /* for vFAIL to print correctly */ 4070 vFAIL("Sequence (? incomplete"); 4071 break; 4072 4073 case ')': 4074 if (RExC_strict) { /* [perl #132851] */ 4075 ckWARNreg(RExC_parse, "Empty (?) without any modifiers"); 4076 } 4077 /* FALLTHROUGH */ 4078 case '*': /* If you want to support (?*...), first reconcile with GH #17363 */ 4079 /* FALLTHROUGH */ 4080 default: /* e.g., (?i) */ 4081 RExC_parse_set((char *) seqstart + 1); 4082 parse_flags: 4083 parse_lparen_question_flags(pRExC_state); 4084 if (UCHARAT(RExC_parse) != ':') { 4085 if (RExC_parse < RExC_end) 4086 nextchar(pRExC_state); 4087 *flagp = TRYAGAIN; 4088 return 0; 4089 } 4090 paren = ':'; 4091 nextchar(pRExC_state); 4092 ret = 0; 4093 goto parse_rest; 4094 } /* end switch */ 4095 } 4096 else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */ 4097 capturing_parens: 4098 parno = RExC_npar; 4099 RExC_npar++; 4100 if (RExC_npar >= U16_MAX) 4101 FAIL2("Too many capture groups (limit is %" UVuf ")", (UV)RExC_npar); 4102 4103 logical_parno = RExC_logical_npar; 4104 RExC_logical_npar++; 4105 if (! ALL_PARENS_COUNTED) { 4106 /* If we are in our first pass through (and maybe only pass), 4107 * we need to allocate memory for the capturing parentheses 4108 * data structures. 4109 */ 4110 4111 if (!RExC_parens_buf_size) { 4112 /* first guess at number of parens we might encounter */ 4113 RExC_parens_buf_size = 10; 4114 4115 /* setup RExC_open_parens, which holds the address of each 4116 * OPEN tag, and to make things simpler for the 0 index the 4117 * start of the program - this is used later for offsets */ 4118 Newxz(RExC_open_parens, RExC_parens_buf_size, 4119 regnode_offset); 4120 RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */ 4121 4122 /* setup RExC_close_parens, which holds the address of each 4123 * CLOSE tag, and to make things simpler for the 0 index 4124 * the end of the program - this is used later for offsets 4125 * */ 4126 Newxz(RExC_close_parens, RExC_parens_buf_size, 4127 regnode_offset); 4128 /* we don't know where end op starts yet, so we don't need to 4129 * set RExC_close_parens[0] like we do RExC_open_parens[0] 4130 * above */ 4131 4132 Newxz(RExC_logical_to_parno, RExC_parens_buf_size, I32); 4133 Newxz(RExC_parno_to_logical, RExC_parens_buf_size, I32); 4134 } 4135 else if (RExC_npar > RExC_parens_buf_size) { 4136 I32 old_size = RExC_parens_buf_size; 4137 4138 RExC_parens_buf_size *= 2; 4139 4140 Renew(RExC_open_parens, RExC_parens_buf_size, 4141 regnode_offset); 4142 Zero(RExC_open_parens + old_size, 4143 RExC_parens_buf_size - old_size, regnode_offset); 4144 4145 Renew(RExC_close_parens, RExC_parens_buf_size, 4146 regnode_offset); 4147 Zero(RExC_close_parens + old_size, 4148 RExC_parens_buf_size - old_size, regnode_offset); 4149 4150 Renew(RExC_logical_to_parno, RExC_parens_buf_size, I32); 4151 Zero(RExC_logical_to_parno + old_size, 4152 RExC_parens_buf_size - old_size, I32); 4153 4154 Renew(RExC_parno_to_logical, RExC_parens_buf_size, I32); 4155 Zero(RExC_parno_to_logical + old_size, 4156 RExC_parens_buf_size - old_size, I32); 4157 } 4158 } 4159 4160 ret = reg1node(pRExC_state, OPEN, parno); 4161 if (!RExC_nestroot) 4162 RExC_nestroot = parno; 4163 if (RExC_open_parens && !RExC_open_parens[parno]) 4164 { 4165 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ 4166 "%*s%*s Setting open paren #%" IVdf " to %zu\n", 4167 22, "| |", (int)(depth * 2 + 1), "", 4168 (IV)parno, ret)); 4169 RExC_open_parens[parno]= ret; 4170 } 4171 if (RExC_parno_to_logical) { 4172 RExC_parno_to_logical[parno] = logical_parno; 4173 if (RExC_logical_to_parno && !RExC_logical_to_parno[logical_parno]) 4174 RExC_logical_to_parno[logical_parno] = parno; 4175 } 4176 is_open = 1; 4177 } else { 4178 /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */ 4179 paren = ':'; 4180 ret = 0; 4181 } 4182 } 4183 else /* ! paren */ 4184 ret = 0; 4185 4186 parse_rest: 4187 /* Pick up the branches, linking them together. */ 4188 segment_parse_start = RExC_parse; 4189 I32 npar_before_regbranch = RExC_npar - 1; 4190 br = regbranch(pRExC_state, &flags, 1, depth+1); 4191 4192 /* branch_len = (paren != 0); */ 4193 4194 if (br == 0) { 4195 RETURN_FAIL_ON_RESTART(flags, flagp); 4196 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags); 4197 } 4198 if (*RExC_parse == '|') { 4199 if (RExC_use_BRANCHJ) { 4200 reginsert(pRExC_state, BRANCHJ, br, depth+1); 4201 ARG2a_SET(REGNODE_p(br), npar_before_regbranch); 4202 ARG2b_SET(REGNODE_p(br), (U16)RExC_npar - 1); 4203 } 4204 else { 4205 reginsert(pRExC_state, BRANCH, br, depth+1); 4206 ARG1a_SET(REGNODE_p(br), (U16)npar_before_regbranch); 4207 ARG1b_SET(REGNODE_p(br), (U16)RExC_npar - 1); 4208 } 4209 have_branch = 1; 4210 } 4211 else if (paren == ':') { 4212 *flagp |= flags&SIMPLE; 4213 } 4214 if (is_open) { /* Starts with OPEN. */ 4215 if (! REGTAIL(pRExC_state, ret, br)) { /* OPEN -> first. */ 4216 REQUIRE_BRANCHJ(flagp, 0); 4217 } 4218 } 4219 else if (paren != '?') /* Not Conditional */ 4220 ret = br; 4221 *flagp |= flags & (HASWIDTH | POSTPONED); 4222 lastbr = br; 4223 while (*RExC_parse == '|') { 4224 if (RExC_use_BRANCHJ) { 4225 bool shut_gcc_up; 4226 4227 ender = reg1node(pRExC_state, LONGJMP, 0); 4228 4229 /* Append to the previous. */ 4230 shut_gcc_up = REGTAIL(pRExC_state, 4231 REGNODE_OFFSET(REGNODE_AFTER(REGNODE_p(lastbr))), 4232 ender); 4233 PERL_UNUSED_VAR(shut_gcc_up); 4234 } 4235 nextchar(pRExC_state); 4236 if (freeze_paren) { 4237 if (RExC_logical_npar > after_freeze) 4238 after_freeze = RExC_logical_npar; 4239 RExC_logical_npar = freeze_paren; 4240 } 4241 br = regbranch(pRExC_state, &flags, 0, depth+1); 4242 4243 if (br == 0) { 4244 RETURN_FAIL_ON_RESTART(flags, flagp); 4245 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags); 4246 } 4247 if (! REGTAIL(pRExC_state, lastbr, br)) { /* BRANCH -> BRANCH. */ 4248 REQUIRE_BRANCHJ(flagp, 0); 4249 } 4250 assert(OP(REGNODE_p(br)) == BRANCH || OP(REGNODE_p(br))==BRANCHJ); 4251 assert(OP(REGNODE_p(lastbr)) == BRANCH || OP(REGNODE_p(lastbr))==BRANCHJ); 4252 if (OP(REGNODE_p(br)) == BRANCH) { 4253 if (OP(REGNODE_p(lastbr)) == BRANCH) 4254 ARG1b_SET(REGNODE_p(lastbr),ARG1a(REGNODE_p(br))); 4255 else 4256 ARG2b_SET(REGNODE_p(lastbr),ARG1a(REGNODE_p(br))); 4257 } 4258 else 4259 if (OP(REGNODE_p(br)) == BRANCHJ) { 4260 if (OP(REGNODE_p(lastbr)) == BRANCH) 4261 ARG1b_SET(REGNODE_p(lastbr),ARG2a(REGNODE_p(br))); 4262 else 4263 ARG2b_SET(REGNODE_p(lastbr),ARG2a(REGNODE_p(br))); 4264 } 4265 4266 lastbr = br; 4267 *flagp |= flags & (HASWIDTH | POSTPONED); 4268 } 4269 4270 if (have_branch || paren != ':') { 4271 regnode * br; 4272 4273 /* Make a closing node, and hook it on the end. */ 4274 switch (paren) { 4275 case ':': 4276 ender = reg_node(pRExC_state, TAIL); 4277 break; 4278 case 1: case 2: 4279 ender = reg1node(pRExC_state, CLOSE, parno); 4280 if ( RExC_close_parens ) { 4281 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ 4282 "%*s%*s Setting close paren #%" IVdf " to %zu\n", 4283 22, "| |", (int)(depth * 2 + 1), "", 4284 (IV)parno, ender)); 4285 RExC_close_parens[parno]= ender; 4286 if (RExC_nestroot == parno) 4287 RExC_nestroot = 0; 4288 } 4289 break; 4290 case 's': 4291 ender = reg_node(pRExC_state, SRCLOSE); 4292 RExC_in_script_run = 0; 4293 break; 4294 /* LOOKBEHIND ops (not sure why these are duplicated - Yves) */ 4295 case 'b': /* (*positive_lookbehind: ... ) (*plb: ... ) */ 4296 case 'B': /* (*negative_lookbehind: ... ) (*nlb: ... ) */ 4297 case '<': /* (?<= ... ) */ 4298 case ',': /* (?<! ... ) */ 4299 *flagp &= ~HASWIDTH; 4300 ender = reg_node(pRExC_state, LOOKBEHIND_END); 4301 break; 4302 /* LOOKAHEAD ops (not sure why these are duplicated - Yves) */ 4303 case 'a': 4304 case 'A': 4305 case '=': 4306 case '!': 4307 *flagp &= ~HASWIDTH; 4308 /* FALLTHROUGH */ 4309 case 't': /* aTomic */ 4310 case '>': 4311 ender = reg_node(pRExC_state, SUCCEED); 4312 break; 4313 case 0: 4314 ender = reg_node(pRExC_state, END); 4315 assert(!RExC_end_op); /* there can only be one! */ 4316 RExC_end_op = REGNODE_p(ender); 4317 if (RExC_close_parens) { 4318 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ 4319 "%*s%*s Setting close paren #0 (END) to %zu\n", 4320 22, "| |", (int)(depth * 2 + 1), "", 4321 ender)); 4322 4323 RExC_close_parens[0]= ender; 4324 } 4325 break; 4326 } 4327 DEBUG_PARSE_r({ 4328 DEBUG_PARSE_MSG("lsbr"); 4329 regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state); 4330 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state); 4331 Perl_re_printf( aTHX_ "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n", 4332 SvPV_nolen_const(RExC_mysv1), 4333 (IV)lastbr, 4334 SvPV_nolen_const(RExC_mysv2), 4335 (IV)ender, 4336 (IV)(ender - lastbr) 4337 ); 4338 }); 4339 if (OP(REGNODE_p(lastbr)) == BRANCH) { 4340 ARG1b_SET(REGNODE_p(lastbr),(U16)RExC_npar-1); 4341 } 4342 else 4343 if (OP(REGNODE_p(lastbr)) == BRANCHJ) { 4344 ARG2b_SET(REGNODE_p(lastbr),(U16)RExC_npar-1); 4345 } 4346 4347 if (! REGTAIL(pRExC_state, lastbr, ender)) { 4348 REQUIRE_BRANCHJ(flagp, 0); 4349 } 4350 4351 if (have_branch) { 4352 char is_nothing= 1; 4353 if (depth==1) 4354 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN; 4355 4356 /* Hook the tails of the branches to the closing node. */ 4357 for (br = REGNODE_p(ret); br; br = regnext(br)) { 4358 const U8 op = REGNODE_TYPE(OP(br)); 4359 regnode *nextoper = REGNODE_AFTER(br); 4360 if (op == BRANCH) { 4361 if (! REGTAIL_STUDY(pRExC_state, 4362 REGNODE_OFFSET(nextoper), 4363 ender)) 4364 { 4365 REQUIRE_BRANCHJ(flagp, 0); 4366 } 4367 if ( OP(nextoper) != NOTHING 4368 || regnext(nextoper) != REGNODE_p(ender)) 4369 is_nothing= 0; 4370 } 4371 else if (op == BRANCHJ) { 4372 bool shut_gcc_up = REGTAIL_STUDY(pRExC_state, 4373 REGNODE_OFFSET(nextoper), 4374 ender); 4375 PERL_UNUSED_VAR(shut_gcc_up); 4376 /* for now we always disable this optimisation * / 4377 regnode *nopr= REGNODE_AFTER_type(br,tregnode_BRANCHJ); 4378 if ( OP(nopr) != NOTHING 4379 || regnext(nopr) != REGNODE_p(ender)) 4380 */ 4381 is_nothing= 0; 4382 } 4383 } 4384 if (is_nothing) { 4385 regnode * ret_as_regnode = REGNODE_p(ret); 4386 br= REGNODE_TYPE(OP(ret_as_regnode)) != BRANCH 4387 ? regnext(ret_as_regnode) 4388 : ret_as_regnode; 4389 DEBUG_PARSE_r({ 4390 DEBUG_PARSE_MSG("NADA"); 4391 regprop(RExC_rx, RExC_mysv1, ret_as_regnode, 4392 NULL, pRExC_state); 4393 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), 4394 NULL, pRExC_state); 4395 Perl_re_printf( aTHX_ "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n", 4396 SvPV_nolen_const(RExC_mysv1), 4397 (IV)REG_NODE_NUM(ret_as_regnode), 4398 SvPV_nolen_const(RExC_mysv2), 4399 (IV)ender, 4400 (IV)(ender - ret) 4401 ); 4402 }); 4403 OP(br)= NOTHING; 4404 if (OP(REGNODE_p(ender)) == TAIL) { 4405 NEXT_OFF(br)= 0; 4406 RExC_emit= REGNODE_OFFSET(br) + NODE_STEP_REGNODE; 4407 } else { 4408 regnode *opt; 4409 for ( opt= br + 1; opt < REGNODE_p(ender) ; opt++ ) 4410 OP(opt)= OPTIMIZED; 4411 NEXT_OFF(br)= REGNODE_p(ender) - br; 4412 } 4413 } 4414 } 4415 } 4416 4417 { 4418 const char *p; 4419 /* Even/odd or x=don't care: 010101x10x */ 4420 static const char parens[] = "=!aA<,>Bbt"; 4421 /* flag below is set to 0 up through 'A'; 1 for larger */ 4422 4423 if (paren && (p = strchr(parens, paren))) { 4424 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH; 4425 int flag = (p - parens) > 3; 4426 4427 if (paren == '>' || paren == 't') { 4428 node = SUSPEND, flag = 0; 4429 } 4430 4431 reginsert(pRExC_state, node, ret, depth+1); 4432 FLAGS(REGNODE_p(ret)) = flag; 4433 if (! REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL))) 4434 { 4435 REQUIRE_BRANCHJ(flagp, 0); 4436 } 4437 } 4438 } 4439 4440 /* Check for proper termination. */ 4441 if (paren) { 4442 /* restore original flags, but keep (?p) and, if we've encountered 4443 * something in the parse that changes /d rules into /u, keep the /u */ 4444 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY); 4445 if (DEPENDS_SEMANTICS && toUSE_UNI_CHARSET_NOT_DEPENDS) { 4446 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); 4447 } 4448 if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') { 4449 RExC_parse_set(reg_parse_start); 4450 vFAIL("Unmatched ("); 4451 } 4452 nextchar(pRExC_state); 4453 } 4454 else if (!paren && RExC_parse < RExC_end) { 4455 if (*RExC_parse == ')') { 4456 RExC_parse_inc_by(1); 4457 vFAIL("Unmatched )"); 4458 } 4459 else 4460 FAIL("Junk on end of regexp"); /* "Can't happen". */ 4461 NOT_REACHED; /* NOTREACHED */ 4462 } 4463 4464 if (after_freeze > RExC_logical_npar) 4465 RExC_logical_npar = after_freeze; 4466 4467 RExC_in_lookaround = was_in_lookaround; 4468 4469 return(ret); 4470 } 4471 4472 /* 4473 - regbranch - one alternative of an | operator 4474 * 4475 * Implements the concatenation operator. 4476 * 4477 * On success, returns the offset at which any next node should be placed into 4478 * the regex engine program being compiled. 4479 * 4480 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs 4481 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to 4482 * UTF-8 4483 */ 4484 STATIC regnode_offset 4485 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) 4486 { 4487 regnode_offset ret; 4488 regnode_offset chain = 0; 4489 regnode_offset latest; 4490 regnode *branch_node = NULL; 4491 I32 flags = 0, c = 0; 4492 DECLARE_AND_GET_RE_DEBUG_FLAGS; 4493 4494 PERL_ARGS_ASSERT_REGBRANCH; 4495 4496 DEBUG_PARSE("brnc"); 4497 4498 if (first) 4499 ret = 0; 4500 else { 4501 if (RExC_use_BRANCHJ) { 4502 ret = reg2node(pRExC_state, BRANCHJ, 0, 0); 4503 branch_node = REGNODE_p(ret); 4504 ARG2a_SET(branch_node, (U16)RExC_npar-1); 4505 } else { 4506 ret = reg1node(pRExC_state, BRANCH, 0); 4507 branch_node = REGNODE_p(ret); 4508 ARG1a_SET(branch_node, (U16)RExC_npar-1); 4509 } 4510 } 4511 4512 *flagp = 0; /* Initialize. */ 4513 4514 skip_to_be_ignored_text(pRExC_state, &RExC_parse, 4515 FALSE /* Don't force to /x */ ); 4516 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') { 4517 flags &= ~TRYAGAIN; 4518 latest = regpiece(pRExC_state, &flags, depth+1); 4519 if (latest == 0) { 4520 if (flags & TRYAGAIN) 4521 continue; 4522 RETURN_FAIL_ON_RESTART(flags, flagp); 4523 FAIL2("panic: regpiece returned failure, flags=%#" UVxf, (UV) flags); 4524 } 4525 else if (ret == 0) 4526 ret = latest; 4527 *flagp |= flags&(HASWIDTH|POSTPONED); 4528 if (chain != 0) { 4529 /* FIXME adding one for every branch after the first is probably 4530 * excessive now we have TRIE support. (hv) */ 4531 MARK_NAUGHTY(1); 4532 if (! REGTAIL(pRExC_state, chain, latest)) { 4533 /* XXX We could just redo this branch, but figuring out what 4534 * bookkeeping needs to be reset is a pain, and it's likely 4535 * that other branches that goto END will also be too large */ 4536 REQUIRE_BRANCHJ(flagp, 0); 4537 } 4538 } 4539 chain = latest; 4540 c++; 4541 } 4542 if (chain == 0) { /* Loop ran zero times. */ 4543 chain = reg_node(pRExC_state, NOTHING); 4544 if (ret == 0) 4545 ret = chain; 4546 } 4547 if (c == 1) { 4548 *flagp |= flags & SIMPLE; 4549 } 4550 return ret; 4551 } 4552 4553 #define RBRACE 0 4554 #define MIN_S 1 4555 #define MIN_E 2 4556 #define MAX_S 3 4557 #define MAX_E 4 4558 4559 #ifndef PERL_IN_XSUB_RE 4560 bool 4561 Perl_regcurly(const char *s, const char *e, const char * result[5]) 4562 { 4563 /* This function matches a {m,n} quantifier. When called with a NULL final 4564 * argument, it simply parses the input from 's' up through 'e-1', and 4565 * returns a boolean as to whether or not this input is syntactically a 4566 * {m,n} quantifier. 4567 * 4568 * When called with a non-NULL final parameter, and when the function 4569 * returns TRUE, it additionally stores information into the array 4570 * specified by that parameter about what it found in the parse. The 4571 * parameter must be a pointer into a 5 element array of 'const char *' 4572 * elements. The returned information is as follows: 4573 * result[RBRACE] points to the closing brace 4574 * result[MIN_S] points to the first byte of the lower bound 4575 * result[MIN_E] points to one beyond the final byte of the lower bound 4576 * result[MAX_S] points to the first byte of the upper bound 4577 * result[MAX_E] points to one beyond the final byte of the upper bound 4578 * 4579 * If the quantifier is of the form {m,} (meaning an infinite upper 4580 * bound), result[MAX_E] is set to result[MAX_S]; what they actually point 4581 * to is irrelevant, just that it's the same place 4582 * 4583 * If instead the quantifier is of the form {m} there is actually only 4584 * one bound, and both the upper and lower result[] elements are set to 4585 * point to it. 4586 * 4587 * This function checks only for syntactic validity; it leaves checking for 4588 * semantic validity and raising any diagnostics to the caller. This 4589 * function is called in multiple places to check for syntax, but only from 4590 * one for semantics. It makes it as simple as possible for the 4591 * syntax-only callers, while furnishing just enough information for the 4592 * semantic caller. 4593 */ 4594 4595 const char * min_start = NULL; 4596 const char * max_start = NULL; 4597 const char * min_end = NULL; 4598 const char * max_end = NULL; 4599 4600 bool has_comma = FALSE; 4601 4602 PERL_ARGS_ASSERT_REGCURLY; 4603 4604 if (s >= e || *s++ != '{') 4605 return FALSE; 4606 4607 while (s < e && isBLANK(*s)) { 4608 s++; 4609 } 4610 4611 if isDIGIT(*s) { 4612 min_start = s; 4613 do { 4614 s++; 4615 } while (s < e && isDIGIT(*s)); 4616 min_end = s; 4617 } 4618 4619 while (s < e && isBLANK(*s)) { 4620 s++; 4621 } 4622 4623 if (*s == ',') { 4624 has_comma = TRUE; 4625 s++; 4626 4627 while (s < e && isBLANK(*s)) { 4628 s++; 4629 } 4630 4631 if isDIGIT(*s) { 4632 max_start = s; 4633 do { 4634 s++; 4635 } while (s < e && isDIGIT(*s)); 4636 max_end = s; 4637 } 4638 } 4639 4640 while (s < e && isBLANK(*s)) { 4641 s++; 4642 } 4643 /* Need at least one number */ 4644 if (s >= e || *s != '}' || (! min_start && ! max_end)) { 4645 return FALSE; 4646 } 4647 4648 if (result) { 4649 4650 result[RBRACE] = s; 4651 4652 result[MIN_S] = min_start; 4653 result[MIN_E] = min_end; 4654 if (has_comma) { 4655 if (max_start) { 4656 result[MAX_S] = max_start; 4657 result[MAX_E] = max_end; 4658 } 4659 else { 4660 /* Having no value after the comma is signalled by setting 4661 * start and end to the same value. What that value is isn't 4662 * relevant; NULL is chosen simply because it will fail if the 4663 * caller mistakenly uses it */ 4664 result[MAX_S] = result[MAX_E] = NULL; 4665 } 4666 } 4667 else { /* No comma means lower and upper bounds are the same */ 4668 result[MAX_S] = min_start; 4669 result[MAX_E] = min_end; 4670 } 4671 } 4672 4673 return TRUE; 4674 } 4675 #endif 4676 4677 U32 4678 S_get_quantifier_value(pTHX_ RExC_state_t *pRExC_state, 4679 const char * start, const char * end) 4680 { 4681 /* This is a helper function for regpiece() to compute, given the 4682 * quantifier {m,n}, the value of either m or n, based on the starting 4683 * position 'start' in the string, through the byte 'end-1', returning it 4684 * if valid, and failing appropriately if not. It knows the restrictions 4685 * imposed on quantifier values */ 4686 4687 UV uv; 4688 STATIC_ASSERT_DECL(REG_INFTY <= U32_MAX); 4689 4690 PERL_ARGS_ASSERT_GET_QUANTIFIER_VALUE; 4691 4692 if (grok_atoUV(start, &uv, &end)) { 4693 if (uv < REG_INFTY) { /* A valid, small-enough number */ 4694 return (U32) uv; 4695 } 4696 } 4697 else if (*start == '0') { /* grok_atoUV() fails for only two reasons: 4698 leading zeros or overflow */ 4699 RExC_parse_set((char * ) end); 4700 4701 /* Perhaps too generic a msg for what is only failure from having 4702 * leading zeros, but this is how it's always behaved. */ 4703 vFAIL("Invalid quantifier in {,}"); 4704 NOT_REACHED; /*NOTREACHED*/ 4705 } 4706 4707 /* Here, found a quantifier, but was too large; either it overflowed or was 4708 * too big a legal number */ 4709 RExC_parse_set((char * ) end); 4710 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); 4711 4712 NOT_REACHED; /*NOTREACHED*/ 4713 return U32_MAX; /* Perhaps some compilers will be expecting a return */ 4714 } 4715 4716 /* 4717 - regpiece - something followed by possible quantifier * + ? {n,m} 4718 * 4719 * Note that the branching code sequences used for ? and the general cases 4720 * of * and + are somewhat optimized: they use the same NOTHING node as 4721 * both the endmarker for their branch list and the body of the last branch. 4722 * It might seem that this node could be dispensed with entirely, but the 4723 * endmarker role is not redundant. 4724 * 4725 * On success, returns the offset at which any next node should be placed into 4726 * the regex engine program being compiled. 4727 * 4728 * Returns 0 otherwise, with *flagp set to indicate why: 4729 * TRYAGAIN if regatom() returns 0 with TRYAGAIN. 4730 * RESTART_PARSE if the parse needs to be restarted, or'd with 4731 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8. 4732 */ 4733 STATIC regnode_offset 4734 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) 4735 { 4736 regnode_offset ret; 4737 char op; 4738 I32 flags; 4739 const char * const origparse = RExC_parse; 4740 I32 min; 4741 I32 max = REG_INFTY; 4742 I32 npar_before = RExC_npar-1; 4743 4744 /* Save the original in case we change the emitted regop to a FAIL. */ 4745 const regnode_offset orig_emit = RExC_emit; 4746 4747 DECLARE_AND_GET_RE_DEBUG_FLAGS; 4748 4749 PERL_ARGS_ASSERT_REGPIECE; 4750 4751 DEBUG_PARSE("piec"); 4752 4753 ret = regatom(pRExC_state, &flags, depth+1); 4754 if (ret == 0) { 4755 RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN); 4756 FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags); 4757 } 4758 I32 npar_after = RExC_npar-1; 4759 4760 op = *RExC_parse; 4761 switch (op) { 4762 const char * regcurly_return[5]; 4763 4764 case '*': 4765 nextchar(pRExC_state); 4766 min = 0; 4767 break; 4768 4769 case '+': 4770 nextchar(pRExC_state); 4771 min = 1; 4772 break; 4773 4774 case '?': 4775 nextchar(pRExC_state); 4776 min = 0; max = 1; 4777 break; 4778 4779 case '{': /* A '{' may or may not indicate a quantifier; call regcurly() 4780 to determine which */ 4781 if (regcurly(RExC_parse, RExC_end, regcurly_return)) { 4782 const char * min_start = regcurly_return[MIN_S]; 4783 const char * min_end = regcurly_return[MIN_E]; 4784 const char * max_start = regcurly_return[MAX_S]; 4785 const char * max_end = regcurly_return[MAX_E]; 4786 4787 if (min_start) { 4788 min = get_quantifier_value(pRExC_state, min_start, min_end); 4789 } 4790 else { 4791 min = 0; 4792 } 4793 4794 if (max_start == max_end) { /* Was of the form {m,} */ 4795 max = REG_INFTY; 4796 } 4797 else if (max_start == min_start) { /* Was of the form {m} */ 4798 max = min; 4799 } 4800 else { /* Was of the form {m,n} */ 4801 assert(max_end >= max_start); 4802 4803 max = get_quantifier_value(pRExC_state, max_start, max_end); 4804 } 4805 4806 RExC_parse_set((char *) regcurly_return[RBRACE]); 4807 nextchar(pRExC_state); 4808 4809 if (max < min) { /* If can't match, warn and optimize to fail 4810 unconditionally */ 4811 reginsert(pRExC_state, OPFAIL, orig_emit, depth+1); 4812 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match"); 4813 NEXT_OFF(REGNODE_p(orig_emit)) = 4814 REGNODE_ARG_LEN(OPFAIL) + NODE_STEP_REGNODE; 4815 return ret; 4816 } 4817 else if (min == max && *RExC_parse == '?') { 4818 ckWARN2reg(RExC_parse + 1, 4819 "Useless use of greediness modifier '%c'", 4820 *RExC_parse); 4821 } 4822 4823 break; 4824 } /* End of is {m,n} */ 4825 4826 /* Here was a '{', but what followed it didn't form a quantifier. */ 4827 /* FALLTHROUGH */ 4828 4829 default: 4830 *flagp = flags; 4831 return(ret); 4832 NOT_REACHED; /*NOTREACHED*/ 4833 } 4834 4835 /* Here we have a quantifier, and have calculated 'min' and 'max'. 4836 * 4837 * Check and possibly adjust a zero width operand */ 4838 if (! (flags & (HASWIDTH|POSTPONED))) { 4839 if (max > REG_INFTY/3) { 4840 ckWARN2reg(RExC_parse, 4841 "%" UTF8f " matches null string many times", 4842 UTF8fARG(UTF, (RExC_parse >= origparse 4843 ? RExC_parse - origparse 4844 : 0), 4845 origparse)); 4846 } 4847 4848 /* There's no point in trying to match something 0 length more than 4849 * once except for extra side effects, which we don't have here since 4850 * not POSTPONED */ 4851 if (max > 1) { 4852 max = 1; 4853 if (min > max) { 4854 min = max; 4855 } 4856 } 4857 } 4858 4859 /* If this is a code block pass it up */ 4860 *flagp |= (flags & POSTPONED); 4861 4862 if (max > 0) { 4863 *flagp |= (flags & HASWIDTH); 4864 if (max == REG_INFTY) 4865 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; 4866 } 4867 4868 /* 'SIMPLE' operands don't require full generality */ 4869 if ((flags&SIMPLE)) { 4870 if (max == REG_INFTY) { 4871 if (min == 0) { 4872 if (UNLIKELY(RExC_pm_flags & PMf_WILDCARD)) { 4873 goto min0_maxINF_wildcard_forbidden; 4874 } 4875 4876 reginsert(pRExC_state, STAR, ret, depth+1); 4877 MARK_NAUGHTY(4); 4878 goto done_main_op; 4879 } 4880 else if (min == 1) { 4881 reginsert(pRExC_state, PLUS, ret, depth+1); 4882 MARK_NAUGHTY(3); 4883 goto done_main_op; 4884 } 4885 } 4886 4887 /* Here, SIMPLE, but not the '*' and '+' special cases */ 4888 4889 MARK_NAUGHTY_EXP(2, 2); 4890 reginsert(pRExC_state, CURLY, ret, depth+1); 4891 } 4892 else { /* not SIMPLE */ 4893 const regnode_offset w = reg_node(pRExC_state, WHILEM); 4894 4895 FLAGS(REGNODE_p(w)) = 0; 4896 if (! REGTAIL(pRExC_state, ret, w)) { 4897 REQUIRE_BRANCHJ(flagp, 0); 4898 } 4899 if (RExC_use_BRANCHJ) { 4900 reginsert(pRExC_state, LONGJMP, ret, depth+1); 4901 reginsert(pRExC_state, NOTHING, ret, depth+1); 4902 REGNODE_STEP_OVER(ret,tregnode_NOTHING,tregnode_LONGJMP); 4903 } 4904 reginsert(pRExC_state, CURLYX, ret, depth+1); 4905 if (RExC_use_BRANCHJ) 4906 /* Go over NOTHING to LONGJMP. */ 4907 REGNODE_STEP_OVER(ret,tregnode_CURLYX,tregnode_NOTHING); 4908 4909 if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state, 4910 NOTHING))) 4911 { 4912 REQUIRE_BRANCHJ(flagp, 0); 4913 } 4914 RExC_whilem_seen++; 4915 MARK_NAUGHTY_EXP(1, 4); /* compound interest */ 4916 } 4917 4918 /* Finish up the CURLY/CURLYX case */ 4919 FLAGS(REGNODE_p(ret)) = 0; 4920 4921 ARG1i_SET(REGNODE_p(ret), min); 4922 ARG2i_SET(REGNODE_p(ret), max); 4923 4924 /* if we had a npar_after then we need to increment npar_before, 4925 * we want to track the range of parens we need to reset each iteration 4926 */ 4927 if (npar_after!=npar_before) { 4928 ARG3a_SET(REGNODE_p(ret), (U16)npar_before+1); 4929 ARG3b_SET(REGNODE_p(ret), (U16)npar_after); 4930 } else { 4931 ARG3a_SET(REGNODE_p(ret), 0); 4932 ARG3b_SET(REGNODE_p(ret), 0); 4933 } 4934 4935 done_main_op: 4936 4937 /* Process any greediness modifiers */ 4938 if (*RExC_parse == '?') { 4939 nextchar(pRExC_state); 4940 reginsert(pRExC_state, MINMOD, ret, depth+1); 4941 if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) { 4942 REQUIRE_BRANCHJ(flagp, 0); 4943 } 4944 } 4945 else if (*RExC_parse == '+') { 4946 regnode_offset ender; 4947 nextchar(pRExC_state); 4948 ender = reg_node(pRExC_state, SUCCEED); 4949 if (! REGTAIL(pRExC_state, ret, ender)) { 4950 REQUIRE_BRANCHJ(flagp, 0); 4951 } 4952 reginsert(pRExC_state, SUSPEND, ret, depth+1); 4953 ender = reg_node(pRExC_state, TAIL); 4954 if (! REGTAIL(pRExC_state, ret, ender)) { 4955 REQUIRE_BRANCHJ(flagp, 0); 4956 } 4957 } 4958 4959 /* Forbid extra quantifiers */ 4960 if (isQUANTIFIER(RExC_parse, RExC_end)) { 4961 RExC_parse_inc_by(1); 4962 vFAIL("Nested quantifiers"); 4963 } 4964 4965 return(ret); 4966 4967 min0_maxINF_wildcard_forbidden: 4968 4969 /* Here we are in a wildcard match, and the minimum match length is 0, and 4970 * the max could be infinity. This is currently forbidden. The only 4971 * reason is to make it harder to write patterns that take a long long time 4972 * to halt, and because the use of this construct isn't necessary in 4973 * matching Unicode property values */ 4974 RExC_parse_inc_by(1); 4975 /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard 4976 subpatterns in regex; marked by <-- HERE in m/%s/ 4977 */ 4978 vFAIL("Use of quantifier '*' is not allowed in Unicode property wildcard" 4979 " subpatterns"); 4980 4981 /* Note, don't need to worry about the input being '{0,}', as a '}' isn't 4982 * legal at all in wildcards, so can't get this far */ 4983 4984 NOT_REACHED; /*NOTREACHED*/ 4985 } 4986 4987 STATIC bool 4988 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, 4989 regnode_offset * node_p, 4990 UV * code_point_p, 4991 int * cp_count, 4992 I32 * flagp, 4993 const bool strict, 4994 const U32 depth 4995 ) 4996 { 4997 /* This routine teases apart the various meanings of \N and returns 4998 * accordingly. The input parameters constrain which meaning(s) is/are valid 4999 * in the current context. 5000 * 5001 * Exactly one of <node_p> and <code_point_p> must be non-NULL. 5002 * 5003 * If <code_point_p> is not NULL, the context is expecting the result to be a 5004 * single code point. If this \N instance turns out to a single code point, 5005 * the function returns TRUE and sets *code_point_p to that code point. 5006 * 5007 * If <node_p> is not NULL, the context is expecting the result to be one of 5008 * the things representable by a regnode. If this \N instance turns out to be 5009 * one such, the function generates the regnode, returns TRUE and sets *node_p 5010 * to point to the offset of that regnode into the regex engine program being 5011 * compiled. 5012 * 5013 * If this instance of \N isn't legal in any context, this function will 5014 * generate a fatal error and not return. 5015 * 5016 * On input, RExC_parse should point to the first char following the \N at the 5017 * time of the call. On successful return, RExC_parse will have been updated 5018 * to point to just after the sequence identified by this routine. Also 5019 * *flagp has been updated as needed. 5020 * 5021 * When there is some problem with the current context and this \N instance, 5022 * the function returns FALSE, without advancing RExC_parse, nor setting 5023 * *node_p, nor *code_point_p, nor *flagp. 5024 * 5025 * If <cp_count> is not NULL, the caller wants to know the length (in code 5026 * points) that this \N sequence matches. This is set, and the input is 5027 * parsed for errors, even if the function returns FALSE, as detailed below. 5028 * 5029 * There are 6 possibilities here, as detailed in the next 6 paragraphs. 5030 * 5031 * Probably the most common case is for the \N to specify a single code point. 5032 * *cp_count will be set to 1, and *code_point_p will be set to that code 5033 * point. 5034 * 5035 * Another possibility is for the input to be an empty \N{}. This is no 5036 * longer accepted, and will generate a fatal error. 5037 * 5038 * Another possibility is for a custom charnames handler to be in effect which 5039 * translates the input name to an empty string. *cp_count will be set to 0. 5040 * *node_p will be set to a generated NOTHING node. 5041 * 5042 * Still another possibility is for the \N to mean [^\n]. *cp_count will be 5043 * set to 0. *node_p will be set to a generated REG_ANY node. 5044 * 5045 * The fifth possibility is that \N resolves to a sequence of more than one 5046 * code points. *cp_count will be set to the number of code points in the 5047 * sequence. *node_p will be set to a generated node returned by this 5048 * function calling S_reg(). 5049 * 5050 * The sixth and final possibility is that it is premature to be calling this 5051 * function; the parse needs to be restarted. This can happen when this 5052 * changes from /d to /u rules, or when the pattern needs to be upgraded to 5053 * UTF-8. The latter occurs only when the fifth possibility would otherwise 5054 * be in effect, and is because one of those code points requires the pattern 5055 * to be recompiled as UTF-8. The function returns FALSE, and sets the 5056 * RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate. When this 5057 * happens, the caller needs to desist from continuing parsing, and return 5058 * this information to its caller. This is not set for when there is only one 5059 * code point, as this can be called as part of an ANYOF node, and they can 5060 * store above-Latin1 code points without the pattern having to be in UTF-8. 5061 * 5062 * For non-single-quoted regexes, the tokenizer has resolved character and 5063 * sequence names inside \N{...} into their Unicode values, normalizing the 5064 * result into what we should see here: '\N{U+c1.c2...}', where c1... are the 5065 * hex-represented code points in the sequence. This is done there because 5066 * the names can vary based on what charnames pragma is in scope at the time, 5067 * so we need a way to take a snapshot of what they resolve to at the time of 5068 * the original parse. [perl #56444]. 5069 * 5070 * That parsing is skipped for single-quoted regexes, so here we may get 5071 * '\N{NAME}', which is parsed now. If the single-quoted regex is something 5072 * like '\N{U+41}', that code point is Unicode, and has to be translated into 5073 * the native character set for non-ASCII platforms. The other possibilities 5074 * are already native, so no translation is done. */ 5075 5076 char * endbrace; /* points to '}' following the name */ 5077 char * e; /* points to final non-blank before endbrace */ 5078 char* p = RExC_parse; /* Temporary */ 5079 5080 SV * substitute_parse = NULL; 5081 char *orig_end; 5082 char *save_start; 5083 I32 flags; 5084 5085 DECLARE_AND_GET_RE_DEBUG_FLAGS; 5086 5087 PERL_ARGS_ASSERT_GROK_BSLASH_N; 5088 5089 assert(cBOOL(node_p) ^ cBOOL(code_point_p)); /* Exactly one should be set */ 5090 assert(! (node_p && cp_count)); /* At most 1 should be set */ 5091 5092 if (cp_count) { /* Initialize return for the most common case */ 5093 *cp_count = 1; 5094 } 5095 5096 /* The [^\n] meaning of \N ignores spaces and comments under the /x 5097 * modifier. The other meanings do not (except blanks adjacent to and 5098 * within the braces), so use a temporary until we find out which we are 5099 * being called with */ 5100 skip_to_be_ignored_text(pRExC_state, &p, 5101 FALSE /* Don't force to /x */ ); 5102 5103 /* Disambiguate between \N meaning a named character versus \N meaning 5104 * [^\n]. The latter is assumed when the {...} following the \N is a legal 5105 * quantifier, or if there is no '{' at all */ 5106 if (*p != '{' || regcurly(p, RExC_end, NULL)) { 5107 RExC_parse_set(p); 5108 if (cp_count) { 5109 *cp_count = -1; 5110 } 5111 5112 if (! node_p) { 5113 return FALSE; 5114 } 5115 5116 *node_p = reg_node(pRExC_state, REG_ANY); 5117 *flagp |= HASWIDTH|SIMPLE; 5118 MARK_NAUGHTY(1); 5119 return TRUE; 5120 } 5121 5122 /* The test above made sure that the next real character is a '{', but 5123 * under the /x modifier, it could be separated by space (or a comment and 5124 * \n) and this is not allowed (for consistency with \x{...} and the 5125 * tokenizer handling of \N{NAME}). */ 5126 if (*RExC_parse != '{') { 5127 vFAIL("Missing braces on \\N{}"); 5128 } 5129 5130 RExC_parse_inc_by(1); /* Skip past the '{' */ 5131 5132 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse); 5133 if (! endbrace) { /* no trailing brace */ 5134 vFAIL2("Missing right brace on \\%c{}", 'N'); 5135 } 5136 5137 /* Here, we have decided it should be a named character or sequence. These 5138 * imply Unicode semantics */ 5139 REQUIRE_UNI_RULES(flagp, FALSE); 5140 5141 /* \N{_} is what toke.c returns to us to indicate a name that evaluates to 5142 * nothing at all (not allowed under strict) */ 5143 if (endbrace - RExC_parse == 1 && *RExC_parse == '_') { 5144 RExC_parse_set(endbrace); 5145 if (strict) { 5146 RExC_parse_inc_by(1); /* Position after the "}" */ 5147 vFAIL("Zero length \\N{}"); 5148 } 5149 5150 if (cp_count) { 5151 *cp_count = 0; 5152 } 5153 nextchar(pRExC_state); 5154 if (! node_p) { 5155 return FALSE; 5156 } 5157 5158 *node_p = reg_node(pRExC_state, NOTHING); 5159 return TRUE; 5160 } 5161 5162 while (isBLANK(*RExC_parse)) { 5163 RExC_parse_inc_by(1); 5164 } 5165 5166 e = endbrace; 5167 while (RExC_parse < e && isBLANK(*(e-1))) { 5168 e--; 5169 } 5170 5171 if (e - RExC_parse < 2 || ! strBEGINs(RExC_parse, "U+")) { 5172 5173 /* Here, the name isn't of the form U+.... This can happen if the 5174 * pattern is single-quoted, so didn't get evaluated in toke.c. Now 5175 * is the time to find out what the name means */ 5176 5177 const STRLEN name_len = e - RExC_parse; 5178 SV * value_sv; /* What does this name evaluate to */ 5179 SV ** value_svp; 5180 const U8 * value; /* string of name's value */ 5181 STRLEN value_len; /* and its length */ 5182 5183 /* RExC_unlexed_names is a hash of names that weren't evaluated by 5184 * toke.c, and their values. Make sure is initialized */ 5185 if (! RExC_unlexed_names) { 5186 RExC_unlexed_names = newHV(); 5187 } 5188 5189 /* If we have already seen this name in this pattern, use that. This 5190 * allows us to only call the charnames handler once per name per 5191 * pattern. A broken or malicious handler could return something 5192 * different each time, which could cause the results to vary depending 5193 * on if something gets added or subtracted from the pattern that 5194 * causes the number of passes to change, for example */ 5195 if ((value_svp = hv_fetch(RExC_unlexed_names, RExC_parse, 5196 name_len, 0))) 5197 { 5198 value_sv = *value_svp; 5199 } 5200 else { /* Otherwise we have to go out and get the name */ 5201 const char * error_msg = NULL; 5202 value_sv = get_and_check_backslash_N_name(RExC_parse, e, 5203 UTF, 5204 &error_msg); 5205 if (error_msg) { 5206 RExC_parse_set(endbrace); 5207 vFAIL(error_msg); 5208 } 5209 5210 /* If no error message, should have gotten a valid return */ 5211 assert (value_sv); 5212 5213 /* Save the name's meaning for later use */ 5214 if (! hv_store(RExC_unlexed_names, RExC_parse, name_len, 5215 value_sv, 0)) 5216 { 5217 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed"); 5218 } 5219 } 5220 5221 /* Here, we have the value the name evaluates to in 'value_sv' */ 5222 value = (U8 *) SvPV(value_sv, value_len); 5223 5224 /* See if the result is one code point vs 0 or multiple */ 5225 if (inRANGE(value_len, 1, ((UV) SvUTF8(value_sv) 5226 ? UTF8SKIP(value) 5227 : 1))) 5228 { 5229 /* Here, exactly one code point. If that isn't what is wanted, 5230 * fail */ 5231 if (! code_point_p) { 5232 RExC_parse_set(p); 5233 return FALSE; 5234 } 5235 5236 /* Convert from string to numeric code point */ 5237 *code_point_p = (SvUTF8(value_sv)) 5238 ? valid_utf8_to_uvchr(value, NULL) 5239 : *value; 5240 5241 /* Have parsed this entire single code point \N{...}. *cp_count 5242 * has already been set to 1, so don't do it again. */ 5243 RExC_parse_set(endbrace); 5244 nextchar(pRExC_state); 5245 return TRUE; 5246 } /* End of is a single code point */ 5247 5248 /* Count the code points, if caller desires. The API says to do this 5249 * even if we will later return FALSE */ 5250 if (cp_count) { 5251 *cp_count = 0; 5252 5253 *cp_count = (SvUTF8(value_sv)) 5254 ? utf8_length(value, value + value_len) 5255 : value_len; 5256 } 5257 5258 /* Fail if caller doesn't want to handle a multi-code-point sequence. 5259 * But don't back the pointer up if the caller wants to know how many 5260 * code points there are (they need to handle it themselves in this 5261 * case). */ 5262 if (! node_p) { 5263 if (! cp_count) { 5264 RExC_parse_set(p); 5265 } 5266 return FALSE; 5267 } 5268 5269 /* Convert this to a sub-pattern of the form "(?: ... )", and then call 5270 * reg recursively to parse it. That way, it retains its atomicness, 5271 * while not having to worry about any special handling that some code 5272 * points may have. */ 5273 5274 substitute_parse = newSVpvs("?:"); 5275 sv_catsv(substitute_parse, value_sv); 5276 sv_catpv(substitute_parse, ")"); 5277 5278 /* The value should already be native, so no need to convert on EBCDIC 5279 * platforms.*/ 5280 assert(! RExC_recode_x_to_native); 5281 5282 } 5283 else { /* \N{U+...} */ 5284 Size_t count = 0; /* code point count kept internally */ 5285 5286 /* We can get to here when the input is \N{U+...} or when toke.c has 5287 * converted a name to the \N{U+...} form. This include changing a 5288 * name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */ 5289 5290 RExC_parse_inc_by(2); /* Skip past the 'U+' */ 5291 5292 /* Code points are separated by dots. The '}' terminates the whole 5293 * thing. */ 5294 5295 do { /* Loop until the ending brace */ 5296 I32 flags = PERL_SCAN_SILENT_OVERFLOW 5297 | PERL_SCAN_SILENT_ILLDIGIT 5298 | PERL_SCAN_NOTIFY_ILLDIGIT 5299 | PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES 5300 | PERL_SCAN_DISALLOW_PREFIX; 5301 STRLEN len = e - RExC_parse; 5302 NV overflow_value; 5303 char * start_digit = RExC_parse; 5304 UV cp = grok_hex(RExC_parse, &len, &flags, &overflow_value); 5305 5306 if (len == 0) { 5307 RExC_parse_inc_by(1); 5308 bad_NU: 5309 vFAIL("Invalid hexadecimal number in \\N{U+...}"); 5310 } 5311 5312 RExC_parse_inc_by(len); 5313 5314 if (cp > MAX_LEGAL_CP) { 5315 vFAIL(form_cp_too_large_msg(16, start_digit, len, 0)); 5316 } 5317 5318 if (RExC_parse >= e) { /* Got to the closing '}' */ 5319 if (count) { 5320 goto do_concat; 5321 } 5322 5323 /* Here, is a single code point; fail if doesn't want that */ 5324 if (! code_point_p) { 5325 RExC_parse_set(p); 5326 return FALSE; 5327 } 5328 5329 /* A single code point is easy to handle; just return it */ 5330 *code_point_p = UNI_TO_NATIVE(cp); 5331 RExC_parse_set(endbrace); 5332 nextchar(pRExC_state); 5333 return TRUE; 5334 } 5335 5336 /* Here, the parse stopped bfore the ending brace. This is legal 5337 * only if that character is a dot separating code points, like a 5338 * multiple character sequence (of the form "\N{U+c1.c2. ... }". 5339 * So the next character must be a dot (and the one after that 5340 * can't be the ending brace, or we'd have something like 5341 * \N{U+100.} ) 5342 * */ 5343 if (*RExC_parse != '.' || RExC_parse + 1 >= e) { 5344 /*point to after 1st invalid */ 5345 RExC_parse_incf(RExC_orig_utf8); 5346 /*Guard against malformed utf8*/ 5347 RExC_parse_set(MIN(e, RExC_parse)); 5348 goto bad_NU; 5349 } 5350 5351 /* Here, looks like its really a multiple character sequence. Fail 5352 * if that's not what the caller wants. But continue with counting 5353 * and error checking if they still want a count */ 5354 if (! node_p && ! cp_count) { 5355 return FALSE; 5356 } 5357 5358 /* What is done here is to convert this to a sub-pattern of the 5359 * form \x{char1}\x{char2}... and then call reg recursively to 5360 * parse it (enclosing in "(?: ... )" ). That way, it retains its 5361 * atomicness, while not having to worry about special handling 5362 * that some code points may have. We don't create a subpattern, 5363 * but go through the motions of code point counting and error 5364 * checking, if the caller doesn't want a node returned. */ 5365 5366 if (node_p && ! substitute_parse) { 5367 substitute_parse = newSVpvs("?:"); 5368 } 5369 5370 do_concat: 5371 5372 if (node_p) { 5373 /* Convert to notation the rest of the code understands */ 5374 sv_catpvs(substitute_parse, "\\x{"); 5375 sv_catpvn(substitute_parse, start_digit, 5376 RExC_parse - start_digit); 5377 sv_catpvs(substitute_parse, "}"); 5378 } 5379 5380 /* Move to after the dot (or ending brace the final time through.) 5381 * */ 5382 RExC_parse_inc_by(1); 5383 count++; 5384 5385 } while (RExC_parse < e); 5386 5387 if (! node_p) { /* Doesn't want the node */ 5388 assert (cp_count); 5389 5390 *cp_count = count; 5391 return FALSE; 5392 } 5393 5394 sv_catpvs(substitute_parse, ")"); 5395 5396 /* The values are Unicode, and therefore have to be converted to native 5397 * on a non-Unicode (meaning non-ASCII) platform. */ 5398 SET_recode_x_to_native(1); 5399 } 5400 5401 /* Here, we have the string the name evaluates to, ready to be parsed, 5402 * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}" 5403 * constructs. This can be called from within a substitute parse already. 5404 * The error reporting mechanism doesn't work for 2 levels of this, but the 5405 * code above has validated this new construct, so there should be no 5406 * errors generated by the below. And this isn't an exact copy, so the 5407 * mechanism to seamlessly deal with this won't work, so turn off warnings 5408 * during it */ 5409 save_start = RExC_start; 5410 orig_end = RExC_end; 5411 5412 RExC_start = SvPVX(substitute_parse); 5413 RExC_parse_set(RExC_start); 5414 RExC_end = RExC_parse + SvCUR(substitute_parse); 5415 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE; 5416 5417 *node_p = reg(pRExC_state, 1, &flags, depth+1); 5418 5419 /* Restore the saved values */ 5420 RESTORE_WARNINGS; 5421 RExC_start = save_start; 5422 RExC_parse_set(endbrace); 5423 RExC_end = orig_end; 5424 SET_recode_x_to_native(0); 5425 5426 SvREFCNT_dec_NN(substitute_parse); 5427 5428 if (! *node_p) { 5429 RETURN_FAIL_ON_RESTART(flags, flagp); 5430 FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf, 5431 (UV) flags); 5432 } 5433 *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED); 5434 5435 nextchar(pRExC_state); 5436 5437 return TRUE; 5438 } 5439 5440 5441 STATIC U8 5442 S_compute_EXACTish(RExC_state_t *pRExC_state) 5443 { 5444 U8 op; 5445 5446 PERL_ARGS_ASSERT_COMPUTE_EXACTISH; 5447 5448 if (! FOLD) { 5449 return (LOC) 5450 ? EXACTL 5451 : EXACT; 5452 } 5453 5454 op = get_regex_charset(RExC_flags); 5455 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) { 5456 op--; /* /a is same as /u, and map /aa's offset to what /a's would have 5457 been, so there is no hole */ 5458 } 5459 5460 return op + EXACTF; 5461 } 5462 5463 /* Parse backref decimal value, unless it's too big to sensibly be a backref, 5464 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */ 5465 5466 static I32 5467 S_backref_value(char *p, char *e) 5468 { 5469 const char* endptr = e; 5470 UV val; 5471 if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX) 5472 return (I32)val; 5473 return I32_MAX; 5474 } 5475 5476 5477 /* 5478 - regatom - the lowest level 5479 5480 Try to identify anything special at the start of the current parse position. 5481 If there is, then handle it as required. This may involve generating a 5482 single regop, such as for an assertion; or it may involve recursing, such as 5483 to handle a () structure. 5484 5485 If the string doesn't start with something special then we gobble up 5486 as much literal text as we can. If we encounter a quantifier, we have to 5487 back off the final literal character, as that quantifier applies to just it 5488 and not to the whole string of literals. 5489 5490 Once we have been able to handle whatever type of thing started the 5491 sequence, we return the offset into the regex engine program being compiled 5492 at which any next regnode should be placed. 5493 5494 Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN. 5495 Returns 0, setting *flagp to RESTART_PARSE if the parse needs to be 5496 restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8 5497 Otherwise does not return 0. 5498 5499 Note: we have to be careful with escapes, as they can be both literal 5500 and special, and in the case of \10 and friends, context determines which. 5501 5502 A summary of the code structure is: 5503 5504 switch (first_byte) { 5505 cases for each special: 5506 handle this special; 5507 break; 5508 case '\\': 5509 switch (2nd byte) { 5510 cases for each unambiguous special: 5511 handle this special; 5512 break; 5513 cases for each ambiguous special/literal: 5514 disambiguate; 5515 if (special) handle here 5516 else goto defchar; 5517 default: // unambiguously literal: 5518 goto defchar; 5519 } 5520 default: // is a literal char 5521 // FALL THROUGH 5522 defchar: 5523 create EXACTish node for literal; 5524 while (more input and node isn't full) { 5525 switch (input_byte) { 5526 cases for each special; 5527 make sure parse pointer is set so that the next call to 5528 regatom will see this special first 5529 goto loopdone; // EXACTish node terminated by prev. char 5530 default: 5531 append char to EXACTISH node; 5532 } 5533 get next input byte; 5534 } 5535 loopdone: 5536 } 5537 return the generated node; 5538 5539 Specifically there are two separate switches for handling 5540 escape sequences, with the one for handling literal escapes requiring 5541 a dummy entry for all of the special escapes that are actually handled 5542 by the other. 5543 5544 */ 5545 5546 STATIC regnode_offset 5547 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) 5548 { 5549 regnode_offset ret = 0; 5550 I32 flags = 0; 5551 char *atom_parse_start; 5552 U8 op; 5553 int invert = 0; 5554 5555 DECLARE_AND_GET_RE_DEBUG_FLAGS; 5556 5557 *flagp = 0; /* Initialize. */ 5558 5559 DEBUG_PARSE("atom"); 5560 5561 PERL_ARGS_ASSERT_REGATOM; 5562 5563 tryagain: 5564 atom_parse_start = RExC_parse; 5565 assert(RExC_parse < RExC_end); 5566 switch ((U8)*RExC_parse) { 5567 case '^': 5568 RExC_seen_zerolen++; 5569 nextchar(pRExC_state); 5570 if (RExC_flags & RXf_PMf_MULTILINE) 5571 ret = reg_node(pRExC_state, MBOL); 5572 else 5573 ret = reg_node(pRExC_state, SBOL); 5574 break; 5575 case '$': 5576 nextchar(pRExC_state); 5577 if (*RExC_parse) 5578 RExC_seen_zerolen++; 5579 if (RExC_flags & RXf_PMf_MULTILINE) 5580 ret = reg_node(pRExC_state, MEOL); 5581 else 5582 ret = reg_node(pRExC_state, SEOL); 5583 break; 5584 case '.': 5585 nextchar(pRExC_state); 5586 if (RExC_flags & RXf_PMf_SINGLELINE) 5587 ret = reg_node(pRExC_state, SANY); 5588 else 5589 ret = reg_node(pRExC_state, REG_ANY); 5590 *flagp |= HASWIDTH|SIMPLE; 5591 MARK_NAUGHTY(1); 5592 break; 5593 case '[': 5594 { 5595 char * const cc_parse_start = ++RExC_parse; 5596 ret = regclass(pRExC_state, flagp, depth+1, 5597 FALSE, /* means parse the whole char class */ 5598 TRUE, /* allow multi-char folds */ 5599 FALSE, /* don't silence non-portable warnings. */ 5600 (bool) RExC_strict, 5601 TRUE, /* Allow an optimized regnode result */ 5602 NULL); 5603 if (ret == 0) { 5604 RETURN_FAIL_ON_RESTART_FLAGP(flagp); 5605 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf, 5606 (UV) *flagp); 5607 } 5608 if (*RExC_parse != ']') { 5609 RExC_parse_set(cc_parse_start); 5610 vFAIL("Unmatched ["); 5611 } 5612 nextchar(pRExC_state); 5613 break; 5614 } 5615 case '(': 5616 nextchar(pRExC_state); 5617 ret = reg(pRExC_state, 2, &flags, depth+1); 5618 if (ret == 0) { 5619 if (flags & TRYAGAIN) { 5620 if (RExC_parse >= RExC_end) { 5621 /* Make parent create an empty node if needed. */ 5622 *flagp |= TRYAGAIN; 5623 return(0); 5624 } 5625 goto tryagain; 5626 } 5627 RETURN_FAIL_ON_RESTART(flags, flagp); 5628 FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf, 5629 (UV) flags); 5630 } 5631 *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED); 5632 break; 5633 case '|': 5634 case ')': 5635 if (flags & TRYAGAIN) { 5636 *flagp |= TRYAGAIN; 5637 return 0; 5638 } 5639 vFAIL("Internal urp"); 5640 /* Supposed to be caught earlier. */ 5641 break; 5642 case '?': 5643 case '+': 5644 case '*': 5645 RExC_parse_inc_by(1); 5646 vFAIL("Quantifier follows nothing"); 5647 break; 5648 case '\\': 5649 /* Special Escapes 5650 5651 This switch handles escape sequences that resolve to some kind 5652 of special regop and not to literal text. Escape sequences that 5653 resolve to literal text are handled below in the switch marked 5654 "Literal Escapes". 5655 5656 Every entry in this switch *must* have a corresponding entry 5657 in the literal escape switch. However, the opposite is not 5658 required, as the default for this switch is to jump to the 5659 literal text handling code. 5660 */ 5661 RExC_parse_inc_by(1); 5662 switch ((U8)*RExC_parse) { 5663 /* Special Escapes */ 5664 case 'A': 5665 RExC_seen_zerolen++; 5666 /* Under wildcards, this is changed to match \n; should be 5667 * invisible to the user, as they have to compile under /m */ 5668 if (RExC_pm_flags & PMf_WILDCARD) { 5669 ret = reg_node(pRExC_state, MBOL); 5670 } 5671 else { 5672 ret = reg_node(pRExC_state, SBOL); 5673 /* SBOL is shared with /^/ so we set the flags so we can tell 5674 * /\A/ from /^/ in split. */ 5675 FLAGS(REGNODE_p(ret)) = 1; 5676 } 5677 goto finish_meta_pat; 5678 case 'G': 5679 if (RExC_pm_flags & PMf_WILDCARD) { 5680 RExC_parse_inc_by(1); 5681 /* diag_listed_as: Use of %s is not allowed in Unicode property 5682 wildcard subpatterns in regex; marked by <-- HERE in m/%s/ 5683 */ 5684 vFAIL("Use of '\\G' is not allowed in Unicode property" 5685 " wildcard subpatterns"); 5686 } 5687 ret = reg_node(pRExC_state, GPOS); 5688 RExC_seen |= REG_GPOS_SEEN; 5689 goto finish_meta_pat; 5690 case 'K': 5691 if (!RExC_in_lookaround) { 5692 RExC_seen_zerolen++; 5693 ret = reg_node(pRExC_state, KEEPS); 5694 /* XXX:dmq : disabling in-place substitution seems to 5695 * be necessary here to avoid cases of memory corruption, as 5696 * with: C<$_="x" x 80; s/x\K/y/> -- rgs 5697 */ 5698 RExC_seen |= REG_LOOKBEHIND_SEEN; 5699 goto finish_meta_pat; 5700 } 5701 else { 5702 ++RExC_parse; /* advance past the 'K' */ 5703 vFAIL("\\K not permitted in lookahead/lookbehind"); 5704 } 5705 case 'Z': 5706 if (RExC_pm_flags & PMf_WILDCARD) { 5707 /* See comment under \A above */ 5708 ret = reg_node(pRExC_state, MEOL); 5709 } 5710 else { 5711 ret = reg_node(pRExC_state, SEOL); 5712 } 5713 RExC_seen_zerolen++; /* Do not optimize RE away */ 5714 goto finish_meta_pat; 5715 case 'z': 5716 if (RExC_pm_flags & PMf_WILDCARD) { 5717 /* See comment under \A above */ 5718 ret = reg_node(pRExC_state, MEOL); 5719 } 5720 else { 5721 ret = reg_node(pRExC_state, EOS); 5722 } 5723 RExC_seen_zerolen++; /* Do not optimize RE away */ 5724 goto finish_meta_pat; 5725 case 'C': 5726 vFAIL("\\C no longer supported"); 5727 case 'X': 5728 ret = reg_node(pRExC_state, CLUMP); 5729 *flagp |= HASWIDTH; 5730 goto finish_meta_pat; 5731 5732 case 'B': 5733 invert = 1; 5734 /* FALLTHROUGH */ 5735 case 'b': 5736 { 5737 U8 flags = 0; 5738 regex_charset charset = get_regex_charset(RExC_flags); 5739 5740 RExC_seen_zerolen++; 5741 RExC_seen |= REG_LOOKBEHIND_SEEN; 5742 op = BOUND + charset; 5743 5744 if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') { 5745 flags = TRADITIONAL_BOUND; 5746 if (op > BOUNDA) { /* /aa is same as /a */ 5747 op = BOUNDA; 5748 } 5749 } 5750 else { 5751 STRLEN length; 5752 char name = *RExC_parse; 5753 char * endbrace = (char *) memchr(RExC_parse, '}', 5754 RExC_end - RExC_parse); 5755 char * e = endbrace; 5756 5757 RExC_parse_inc_by(2); 5758 5759 if (! endbrace) { 5760 vFAIL2("Missing right brace on \\%c{}", name); 5761 } 5762 5763 while (isBLANK(*RExC_parse)) { 5764 RExC_parse_inc_by(1); 5765 } 5766 5767 while (RExC_parse < e && isBLANK(*(e - 1))) { 5768 e--; 5769 } 5770 5771 if (e == RExC_parse) { 5772 RExC_parse_set(endbrace + 1); /* After the '}' */ 5773 vFAIL2("Empty \\%c{}", name); 5774 } 5775 5776 length = e - RExC_parse; 5777 5778 switch (*RExC_parse) { 5779 case 'g': 5780 if ( length != 1 5781 && (memNEs(RExC_parse + 1, length - 1, "cb"))) 5782 { 5783 goto bad_bound_type; 5784 } 5785 flags = GCB_BOUND; 5786 break; 5787 case 'l': 5788 if (length != 2 || *(RExC_parse + 1) != 'b') { 5789 goto bad_bound_type; 5790 } 5791 flags = LB_BOUND; 5792 break; 5793 case 's': 5794 if (length != 2 || *(RExC_parse + 1) != 'b') { 5795 goto bad_bound_type; 5796 } 5797 flags = SB_BOUND; 5798 break; 5799 case 'w': 5800 if (length != 2 || *(RExC_parse + 1) != 'b') { 5801 goto bad_bound_type; 5802 } 5803 flags = WB_BOUND; 5804 break; 5805 default: 5806 bad_bound_type: 5807 RExC_parse_set(e); 5808 vFAIL2utf8f( 5809 "'%" UTF8f "' is an unknown bound type", 5810 UTF8fARG(UTF, length, e - length)); 5811 NOT_REACHED; /*NOTREACHED*/ 5812 } 5813 RExC_parse_set(endbrace); 5814 REQUIRE_UNI_RULES(flagp, 0); 5815 5816 if (op == BOUND) { 5817 op = BOUNDU; 5818 } 5819 else if (op >= BOUNDA) { /* /aa is same as /a */ 5820 op = BOUNDU; 5821 length += 4; 5822 5823 /* Don't have to worry about UTF-8, in this message because 5824 * to get here the contents of the \b must be ASCII */ 5825 ckWARN4reg(RExC_parse + 1, /* Include the '}' in msg */ 5826 "Using /u for '%.*s' instead of /%s", 5827 (unsigned) length, 5828 endbrace - length + 1, 5829 (charset == REGEX_ASCII_RESTRICTED_CHARSET) 5830 ? ASCII_RESTRICT_PAT_MODS 5831 : ASCII_MORE_RESTRICT_PAT_MODS); 5832 } 5833 } 5834 5835 if (op == BOUND) { 5836 RExC_seen_d_op = TRUE; 5837 } 5838 else if (op == BOUNDL) { 5839 RExC_contains_locale = 1; 5840 } 5841 5842 if (invert) { 5843 op += NBOUND - BOUND; 5844 } 5845 5846 ret = reg_node(pRExC_state, op); 5847 FLAGS(REGNODE_p(ret)) = flags; 5848 5849 goto finish_meta_pat; 5850 } 5851 5852 case 'R': 5853 ret = reg_node(pRExC_state, LNBREAK); 5854 *flagp |= HASWIDTH|SIMPLE; 5855 goto finish_meta_pat; 5856 5857 case 'd': 5858 case 'D': 5859 case 'h': 5860 case 'H': 5861 case 'p': 5862 case 'P': 5863 case 's': 5864 case 'S': 5865 case 'v': 5866 case 'V': 5867 case 'w': 5868 case 'W': 5869 /* These all have the same meaning inside [brackets], and it knows 5870 * how to do the best optimizations for them. So, pretend we found 5871 * these within brackets, and let it do the work */ 5872 RExC_parse--; 5873 5874 ret = regclass(pRExC_state, flagp, depth+1, 5875 TRUE, /* means just parse this element */ 5876 FALSE, /* don't allow multi-char folds */ 5877 FALSE, /* don't silence non-portable warnings. It 5878 would be a bug if these returned 5879 non-portables */ 5880 (bool) RExC_strict, 5881 TRUE, /* Allow an optimized regnode result */ 5882 NULL); 5883 RETURN_FAIL_ON_RESTART_FLAGP(flagp); 5884 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if 5885 * multi-char folds are allowed. */ 5886 if (!ret) 5887 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf, 5888 (UV) *flagp); 5889 5890 RExC_parse--; /* regclass() leaves this one too far ahead */ 5891 5892 finish_meta_pat: 5893 /* The escapes above that don't take a parameter can't be 5894 * followed by a '{'. But 'pX', 'p{foo}' and 5895 * correspondingly 'P' can be */ 5896 if ( RExC_parse - atom_parse_start == 1 5897 && UCHARAT(RExC_parse + 1) == '{' 5898 && UNLIKELY(! regcurly(RExC_parse + 1, RExC_end, NULL))) 5899 { 5900 RExC_parse_inc_by(2); 5901 vFAIL("Unescaped left brace in regex is illegal here"); 5902 } 5903 nextchar(pRExC_state); 5904 break; 5905 case 'N': 5906 /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the 5907 * \N{...} evaluates to a sequence of more than one code points). 5908 * The function call below returns a regnode, which is our result. 5909 * The parameters cause it to fail if the \N{} evaluates to a 5910 * single code point; we handle those like any other literal. The 5911 * reason that the multicharacter case is handled here and not as 5912 * part of the EXACtish code is because of quantifiers. In 5913 * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it 5914 * this way makes that Just Happen. dmq. 5915 * join_exact() will join this up with adjacent EXACTish nodes 5916 * later on, if appropriate. */ 5917 ++RExC_parse; 5918 if (grok_bslash_N(pRExC_state, 5919 &ret, /* Want a regnode returned */ 5920 NULL, /* Fail if evaluates to a single code 5921 point */ 5922 NULL, /* Don't need a count of how many code 5923 points */ 5924 flagp, 5925 RExC_strict, 5926 depth) 5927 ) { 5928 break; 5929 } 5930 5931 RETURN_FAIL_ON_RESTART_FLAGP(flagp); 5932 5933 /* Here, evaluates to a single code point. Go get that */ 5934 RExC_parse_set(atom_parse_start); 5935 goto defchar; 5936 5937 case 'k': /* Handle \k<NAME> and \k'NAME' and \k{NAME} */ 5938 parse_named_seq: /* Also handle non-numeric \g{...} */ 5939 { 5940 char ch; 5941 if ( RExC_parse >= RExC_end - 1 5942 || (( ch = RExC_parse[1]) != '<' 5943 && ch != '\'' 5944 && ch != '{')) 5945 { 5946 RExC_parse_inc_by(1); 5947 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ 5948 vFAIL2("Sequence %.2s... not terminated", atom_parse_start); 5949 } else { 5950 RExC_parse_inc_by(2); 5951 if (ch == '{') { 5952 while (isBLANK(*RExC_parse)) { 5953 RExC_parse_inc_by(1); 5954 } 5955 } 5956 ret = handle_named_backref(pRExC_state, 5957 flagp, 5958 atom_parse_start, 5959 (ch == '<') 5960 ? '>' 5961 : (ch == '{') 5962 ? '}' 5963 : '\''); 5964 } 5965 break; 5966 } 5967 case 'g': 5968 case '1': case '2': case '3': case '4': 5969 case '5': case '6': case '7': case '8': case '9': 5970 { 5971 I32 num; 5972 char * endbrace = NULL; 5973 char * s = RExC_parse; 5974 char * e = RExC_end; 5975 5976 if (*s == 'g') { 5977 bool isrel = 0; 5978 5979 s++; 5980 if (*s == '{') { 5981 endbrace = (char *) memchr(s, '}', RExC_end - s); 5982 if (! endbrace ) { 5983 5984 /* Missing '}'. Position after the number to give 5985 * a better indication to the user of where the 5986 * problem is. */ 5987 s++; 5988 if (*s == '-') { 5989 s++; 5990 } 5991 5992 /* If it looks to be a name and not a number, go 5993 * handle it there */ 5994 if (! isDIGIT(*s)) { 5995 goto parse_named_seq; 5996 } 5997 5998 do { 5999 s++; 6000 } while isDIGIT(*s); 6001 6002 RExC_parse_set(s); 6003 vFAIL("Unterminated \\g{...} pattern"); 6004 } 6005 6006 s++; /* Past the '{' */ 6007 6008 while (isBLANK(*s)) { 6009 s++; 6010 } 6011 6012 /* Ignore trailing blanks */ 6013 e = endbrace; 6014 while (s < e && isBLANK(*(e - 1))) { 6015 e--; 6016 } 6017 } 6018 6019 /* Here, have isolated the meat of the construct from any 6020 * surrounding braces */ 6021 6022 if (*s == '-') { 6023 isrel = 1; 6024 s++; 6025 } 6026 6027 if (endbrace && !isDIGIT(*s)) { 6028 goto parse_named_seq; 6029 } 6030 6031 RExC_parse_set(s); 6032 num = S_backref_value(RExC_parse, RExC_end); 6033 if (num == 0) 6034 vFAIL("Reference to invalid group 0"); 6035 else if (num == I32_MAX) { 6036 if (isDIGIT(*RExC_parse)) 6037 vFAIL("Reference to nonexistent group"); 6038 else 6039 vFAIL("Unterminated \\g... pattern"); 6040 } 6041 6042 if (isrel) { 6043 num = RExC_npar - num; 6044 if (num < 1) 6045 vFAIL("Reference to nonexistent or unclosed group"); 6046 } 6047 else 6048 if (num < RExC_logical_npar) { 6049 num = RExC_logical_to_parno[num]; 6050 } 6051 else 6052 if (ALL_PARENS_COUNTED) { 6053 if (num < RExC_logical_total_parens) 6054 num = RExC_logical_to_parno[num]; 6055 else { 6056 num = -1; 6057 } 6058 } 6059 else{ 6060 REQUIRE_PARENS_PASS; 6061 } 6062 } 6063 else { 6064 num = S_backref_value(RExC_parse, RExC_end); 6065 /* bare \NNN might be backref or octal - if it is larger 6066 * than or equal RExC_npar then it is assumed to be an 6067 * octal escape. Note RExC_npar is +1 from the actual 6068 * number of parens. */ 6069 /* Note we do NOT check if num == I32_MAX here, as that is 6070 * handled by the RExC_npar check */ 6071 6072 if ( /* any numeric escape < 10 is always a backref */ 6073 num > 9 6074 /* any numeric escape < RExC_npar is a backref */ 6075 && num >= RExC_logical_npar 6076 /* cannot be an octal escape if it starts with [89] 6077 * */ 6078 && ! inRANGE(*RExC_parse, '8', '9') 6079 ) { 6080 /* Probably not meant to be a backref, instead likely 6081 * to be an octal character escape, e.g. \35 or \777. 6082 * The above logic should make it obvious why using 6083 * octal escapes in patterns is problematic. - Yves */ 6084 RExC_parse_set(atom_parse_start); 6085 goto defchar; 6086 } 6087 if (num < RExC_logical_npar) { 6088 num = RExC_logical_to_parno[num]; 6089 } 6090 else 6091 if (ALL_PARENS_COUNTED) { 6092 if (num < RExC_logical_total_parens) { 6093 num = RExC_logical_to_parno[num]; 6094 } else { 6095 num = -1; 6096 } 6097 } else { 6098 REQUIRE_PARENS_PASS; 6099 } 6100 } 6101 6102 /* At this point RExC_parse points at a numeric escape like 6103 * \12 or \88 or the digits in \g{34} or \g34 or something 6104 * similar, which we should NOT treat as an octal escape. It 6105 * may or may not be a valid backref escape. For instance 6106 * \88888888 is unlikely to be a valid backref. 6107 * 6108 * We've already figured out what value the digits represent. 6109 * Now, move the parse to beyond them. */ 6110 if (endbrace) { 6111 RExC_parse_set(endbrace + 1); 6112 } 6113 else while (isDIGIT(*RExC_parse)) { 6114 RExC_parse_inc_by(1); 6115 } 6116 if (num < 0) 6117 vFAIL("Reference to nonexistent group"); 6118 6119 if (num >= (I32)RExC_npar) { 6120 /* It might be a forward reference; we can't fail until we 6121 * know, by completing the parse to get all the groups, and 6122 * then reparsing */ 6123 if (ALL_PARENS_COUNTED) { 6124 if (num >= RExC_total_parens) { 6125 vFAIL("Reference to nonexistent group"); 6126 } 6127 } 6128 else { 6129 REQUIRE_PARENS_PASS; 6130 } 6131 } 6132 RExC_sawback = 1; 6133 ret = reg2node(pRExC_state, 6134 ((! FOLD) 6135 ? REF 6136 : (ASCII_FOLD_RESTRICTED) 6137 ? REFFA 6138 : (AT_LEAST_UNI_SEMANTICS) 6139 ? REFFU 6140 : (LOC) 6141 ? REFFL 6142 : REFF), 6143 num, RExC_nestroot); 6144 if (RExC_nestroot && num >= RExC_nestroot) 6145 FLAGS(REGNODE_p(ret)) = VOLATILE_REF; 6146 if (OP(REGNODE_p(ret)) == REFF) { 6147 RExC_seen_d_op = TRUE; 6148 } 6149 *flagp |= HASWIDTH; 6150 6151 skip_to_be_ignored_text(pRExC_state, &RExC_parse, 6152 FALSE /* Don't force to /x */ ); 6153 } 6154 break; 6155 case '\0': 6156 if (RExC_parse >= RExC_end) 6157 FAIL("Trailing \\"); 6158 /* FALLTHROUGH */ 6159 default: 6160 /* Do not generate "unrecognized" warnings here, we fall 6161 back into the quick-grab loop below */ 6162 RExC_parse_set(atom_parse_start); 6163 goto defchar; 6164 } /* end of switch on a \foo sequence */ 6165 break; 6166 6167 case '#': 6168 6169 /* '#' comments should have been spaced over before this function was 6170 * called */ 6171 assert((RExC_flags & RXf_PMf_EXTENDED) == 0); 6172 /* 6173 if (RExC_flags & RXf_PMf_EXTENDED) { 6174 RExC_parse_set( reg_skipcomment( pRExC_state, RExC_parse ) ); 6175 if (RExC_parse < RExC_end) 6176 goto tryagain; 6177 } 6178 */ 6179 6180 /* FALLTHROUGH */ 6181 6182 default: 6183 defchar: { 6184 6185 /* Here, we have determined that the next thing is probably a 6186 * literal character. RExC_parse points to the first byte of its 6187 * definition. (It still may be an escape sequence that evaluates 6188 * to a single character) */ 6189 6190 STRLEN len = 0; 6191 UV ender = 0; 6192 char *p; 6193 char *s, *old_s = NULL, *old_old_s = NULL; 6194 char *s0; 6195 U32 max_string_len = 255; 6196 6197 /* We may have to reparse the node, artificially stopping filling 6198 * it early, based on info gleaned in the first parse. This 6199 * variable gives where we stop. Make it above the normal stopping 6200 * place first time through; otherwise it would stop too early */ 6201 U32 upper_fill = max_string_len + 1; 6202 6203 /* We start out as an EXACT node, even if under /i, until we find a 6204 * character which is in a fold. The algorithm now segregates into 6205 * separate nodes, characters that fold from those that don't under 6206 * /i. (This hopefully will create nodes that are fixed strings 6207 * even under /i, giving the optimizer something to grab on to.) 6208 * So, if a node has something in it and the next character is in 6209 * the opposite category, that node is closed up, and the function 6210 * returns. Then regatom is called again, and a new node is 6211 * created for the new category. */ 6212 U8 node_type = EXACT; 6213 6214 /* Assume the node will be fully used; the excess is given back at 6215 * the end. Under /i, we may need to temporarily add the fold of 6216 * an extra character or two at the end to check for splitting 6217 * multi-char folds, so allocate extra space for that. We can't 6218 * make any other length assumptions, as a byte input sequence 6219 * could shrink down. */ 6220 Ptrdiff_t current_string_nodes = STR_SZ(max_string_len 6221 + ((! FOLD) 6222 ? 0 6223 : 2 * ((UTF) 6224 ? UTF8_MAXBYTES_CASE 6225 /* Max non-UTF-8 expansion is 2 */ : 2))); 6226 6227 bool next_is_quantifier; 6228 char * oldp = NULL; 6229 6230 /* We can convert EXACTF nodes to EXACTFU if they contain only 6231 * characters that match identically regardless of the target 6232 * string's UTF8ness. The reason to do this is that EXACTF is not 6233 * trie-able, EXACTFU is, and EXACTFU requires fewer operations at 6234 * runtime. 6235 * 6236 * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they 6237 * contain only above-Latin1 characters (hence must be in UTF8), 6238 * which don't participate in folds with Latin1-range characters, 6239 * as the latter's folds aren't known until runtime. */ 6240 bool maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC); 6241 6242 /* Single-character EXACTish nodes are almost always SIMPLE. This 6243 * allows us to override this as encountered */ 6244 U8 maybe_SIMPLE = SIMPLE; 6245 6246 /* Does this node contain something that can't match unless the 6247 * target string is (also) in UTF-8 */ 6248 bool requires_utf8_target = FALSE; 6249 6250 /* The sequence 'ss' is problematic in non-UTF-8 patterns. */ 6251 bool has_ss = FALSE; 6252 6253 /* So is the MICRO SIGN */ 6254 bool has_micro_sign = FALSE; 6255 6256 /* Set when we fill up the current node and there is still more 6257 * text to process */ 6258 bool overflowed; 6259 6260 /* Allocate an EXACT node. The node_type may change below to 6261 * another EXACTish node, but since the size of the node doesn't 6262 * change, it works */ 6263 ret = REGNODE_GUTS(pRExC_state, node_type, current_string_nodes); 6264 FILL_NODE(ret, node_type); 6265 RExC_emit += NODE_STEP_REGNODE; 6266 6267 s = STRING(REGNODE_p(ret)); 6268 6269 s0 = s; 6270 6271 reparse: 6272 6273 p = RExC_parse; 6274 len = 0; 6275 s = s0; 6276 node_type = EXACT; 6277 oldp = NULL; 6278 maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC); 6279 maybe_SIMPLE = SIMPLE; 6280 requires_utf8_target = FALSE; 6281 has_ss = FALSE; 6282 has_micro_sign = FALSE; 6283 6284 continue_parse: 6285 6286 /* This breaks under rare circumstances. If folding, we do not 6287 * want to split a node at a character that is a non-final in a 6288 * multi-char fold, as an input string could just happen to want to 6289 * match across the node boundary. The code at the end of the loop 6290 * looks for this, and backs off until it finds not such a 6291 * character, but it is possible (though extremely, extremely 6292 * unlikely) for all characters in the node to be non-final fold 6293 * ones, in which case we just leave the node fully filled, and 6294 * hope that it doesn't match the string in just the wrong place */ 6295 6296 assert( ! UTF /* Is at the beginning of a character */ 6297 || UTF8_IS_INVARIANT(UCHARAT(RExC_parse)) 6298 || UTF8_IS_START(UCHARAT(RExC_parse))); 6299 6300 overflowed = FALSE; 6301 6302 /* Here, we have a literal character. Find the maximal string of 6303 * them in the input that we can fit into a single EXACTish node. 6304 * We quit at the first non-literal or when the node gets full, or 6305 * under /i the categorization of folding/non-folding character 6306 * changes */ 6307 while (p < RExC_end && len < upper_fill) { 6308 6309 /* In most cases each iteration adds one byte to the output. 6310 * The exceptions override this */ 6311 Size_t added_len = 1; 6312 6313 oldp = p; 6314 old_old_s = old_s; 6315 old_s = s; 6316 6317 /* White space has already been ignored */ 6318 assert( (RExC_flags & RXf_PMf_EXTENDED) == 0 6319 || ! is_PATWS_safe((p), RExC_end, UTF)); 6320 6321 switch ((U8)*p) { 6322 const char* message; 6323 U32 packed_warn; 6324 U8 grok_c_char; 6325 6326 case '^': 6327 case '$': 6328 case '.': 6329 case '[': 6330 case '(': 6331 case ')': 6332 case '|': 6333 goto loopdone; 6334 case '\\': 6335 /* Literal Escapes Switch 6336 6337 This switch is meant to handle escape sequences that 6338 resolve to a literal character. 6339 6340 Every escape sequence that represents something 6341 else, like an assertion or a char class, is handled 6342 in the switch marked 'Special Escapes' above in this 6343 routine, but also has an entry here as anything that 6344 isn't explicitly mentioned here will be treated as 6345 an unescaped equivalent literal. 6346 */ 6347 6348 switch ((U8)*++p) { 6349 6350 /* These are all the special escapes. */ 6351 case 'A': /* Start assertion */ 6352 case 'b': case 'B': /* Word-boundary assertion*/ 6353 case 'C': /* Single char !DANGEROUS! */ 6354 case 'd': case 'D': /* digit class */ 6355 case 'g': case 'G': /* generic-backref, pos assertion */ 6356 case 'h': case 'H': /* HORIZWS */ 6357 case 'k': case 'K': /* named backref, keep marker */ 6358 case 'p': case 'P': /* Unicode property */ 6359 case 'R': /* LNBREAK */ 6360 case 's': case 'S': /* space class */ 6361 case 'v': case 'V': /* VERTWS */ 6362 case 'w': case 'W': /* word class */ 6363 case 'X': /* eXtended Unicode "combining 6364 character sequence" */ 6365 case 'z': case 'Z': /* End of line/string assertion */ 6366 --p; 6367 goto loopdone; 6368 6369 /* Anything after here is an escape that resolves to a 6370 literal. (Except digits, which may or may not) 6371 */ 6372 case 'n': 6373 ender = '\n'; 6374 p++; 6375 break; 6376 case 'N': /* Handle a single-code point named character. */ 6377 RExC_parse_set( p + 1 ); 6378 if (! grok_bslash_N(pRExC_state, 6379 NULL, /* Fail if evaluates to 6380 anything other than a 6381 single code point */ 6382 &ender, /* The returned single code 6383 point */ 6384 NULL, /* Don't need a count of 6385 how many code points */ 6386 flagp, 6387 RExC_strict, 6388 depth) 6389 ) { 6390 if (*flagp & NEED_UTF8) 6391 FAIL("panic: grok_bslash_N set NEED_UTF8"); 6392 RETURN_FAIL_ON_RESTART_FLAGP(flagp); 6393 6394 /* Here, it wasn't a single code point. Go close 6395 * up this EXACTish node. The switch() prior to 6396 * this switch handles the other cases */ 6397 p = oldp; 6398 RExC_parse_set(p); 6399 goto loopdone; 6400 } 6401 p = RExC_parse; 6402 RExC_parse_set(atom_parse_start); 6403 6404 /* The \N{} means the pattern, if previously /d, 6405 * becomes /u. That means it can't be an EXACTF node, 6406 * but an EXACTFU */ 6407 if (node_type == EXACTF) { 6408 node_type = EXACTFU; 6409 6410 /* If the node already contains something that 6411 * differs between EXACTF and EXACTFU, reparse it 6412 * as EXACTFU */ 6413 if (! maybe_exactfu) { 6414 len = 0; 6415 s = s0; 6416 goto reparse; 6417 } 6418 } 6419 6420 break; 6421 case 'r': 6422 ender = '\r'; 6423 p++; 6424 break; 6425 case 't': 6426 ender = '\t'; 6427 p++; 6428 break; 6429 case 'f': 6430 ender = '\f'; 6431 p++; 6432 break; 6433 case 'e': 6434 ender = ESC_NATIVE; 6435 p++; 6436 break; 6437 case 'a': 6438 ender = '\a'; 6439 p++; 6440 break; 6441 case 'o': 6442 if (! grok_bslash_o(&p, 6443 RExC_end, 6444 &ender, 6445 &message, 6446 &packed_warn, 6447 (bool) RExC_strict, 6448 FALSE, /* No illegal cp's */ 6449 UTF)) 6450 { 6451 RExC_parse_set(p); /* going to die anyway; point to 6452 exact spot of failure */ 6453 vFAIL(message); 6454 } 6455 6456 if (message && TO_OUTPUT_WARNINGS(p)) { 6457 warn_non_literal_string(p, packed_warn, message); 6458 } 6459 break; 6460 case 'x': 6461 if (! grok_bslash_x(&p, 6462 RExC_end, 6463 &ender, 6464 &message, 6465 &packed_warn, 6466 (bool) RExC_strict, 6467 FALSE, /* No illegal cp's */ 6468 UTF)) 6469 { 6470 RExC_parse_set(p); /* going to die anyway; point 6471 to exact spot of failure */ 6472 vFAIL(message); 6473 } 6474 6475 if (message && TO_OUTPUT_WARNINGS(p)) { 6476 warn_non_literal_string(p, packed_warn, message); 6477 } 6478 6479 #ifdef EBCDIC 6480 if (ender < 0x100) { 6481 if (RExC_recode_x_to_native) { 6482 ender = LATIN1_TO_NATIVE(ender); 6483 } 6484 } 6485 #endif 6486 break; 6487 case 'c': 6488 p++; 6489 if (! grok_bslash_c(*p, &grok_c_char, 6490 &message, &packed_warn)) 6491 { 6492 /* going to die anyway; point to exact spot of 6493 * failure */ 6494 char *new_p= p + ((UTF) 6495 ? UTF8_SAFE_SKIP(p, RExC_end) 6496 : 1); 6497 RExC_parse_set(new_p); 6498 vFAIL(message); 6499 } 6500 6501 ender = grok_c_char; 6502 p++; 6503 if (message && TO_OUTPUT_WARNINGS(p)) { 6504 warn_non_literal_string(p, packed_warn, message); 6505 } 6506 6507 break; 6508 case '8': case '9': /* must be a backreference */ 6509 --p; 6510 /* we have an escape like \8 which cannot be an octal escape 6511 * so we exit the loop, and let the outer loop handle this 6512 * escape which may or may not be a legitimate backref. */ 6513 goto loopdone; 6514 case '1': case '2': case '3':case '4': 6515 case '5': case '6': case '7': 6516 6517 /* When we parse backslash escapes there is ambiguity 6518 * between backreferences and octal escapes. Any escape 6519 * from \1 - \9 is a backreference, any multi-digit 6520 * escape which does not start with 0 and which when 6521 * evaluated as decimal could refer to an already 6522 * parsed capture buffer is a back reference. Anything 6523 * else is octal. 6524 * 6525 * Note this implies that \118 could be interpreted as 6526 * 118 OR as "\11" . "8" depending on whether there 6527 * were 118 capture buffers defined already in the 6528 * pattern. */ 6529 6530 /* NOTE, RExC_npar is 1 more than the actual number of 6531 * parens we have seen so far, hence the "<" as opposed 6532 * to "<=" */ 6533 if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar) 6534 { /* Not to be treated as an octal constant, go 6535 find backref */ 6536 p = oldp; 6537 goto loopdone; 6538 } 6539 /* FALLTHROUGH */ 6540 case '0': 6541 { 6542 I32 flags = PERL_SCAN_SILENT_ILLDIGIT 6543 | PERL_SCAN_NOTIFY_ILLDIGIT; 6544 STRLEN numlen = 3; 6545 ender = grok_oct(p, &numlen, &flags, NULL); 6546 p += numlen; 6547 if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT) 6548 && isDIGIT(*p) /* like \08, \178 */ 6549 && ckWARN(WARN_REGEXP)) 6550 { 6551 reg_warn_non_literal_string( 6552 p + 1, 6553 form_alien_digit_msg(8, numlen, p, 6554 RExC_end, UTF, FALSE)); 6555 } 6556 } 6557 break; 6558 case '\0': 6559 if (p >= RExC_end) 6560 FAIL("Trailing \\"); 6561 /* FALLTHROUGH */ 6562 default: 6563 if (isALPHANUMERIC(*p)) { 6564 /* An alpha followed by '{' is going to fail next 6565 * iteration, so don't output this warning in that 6566 * case */ 6567 if (! isALPHA(*p) || *(p + 1) != '{') { 6568 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s" 6569 " passed through", p); 6570 } 6571 } 6572 goto normal_default; 6573 } /* End of switch on '\' */ 6574 break; 6575 case '{': 6576 /* Trying to gain new uses for '{' without breaking too 6577 * much existing code is hard. The solution currently 6578 * adopted is: 6579 * 1) If there is no ambiguity that a '{' should always 6580 * be taken literally, at the start of a construct, we 6581 * just do so. 6582 * 2) If the literal '{' conflicts with our desired use 6583 * of it as a metacharacter, we die. The deprecation 6584 * cycles for this have come and gone. 6585 * 3) If there is ambiguity, we raise a simple warning. 6586 * This could happen, for example, if the user 6587 * intended it to introduce a quantifier, but slightly 6588 * misspelled the quantifier. Without this warning, 6589 * the quantifier would silently be taken as a literal 6590 * string of characters instead of a meta construct */ 6591 if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) { 6592 if ( RExC_strict 6593 || ( p > atom_parse_start + 1 6594 && isALPHA_A(*(p - 1)) 6595 && *(p - 2) == '\\')) 6596 { 6597 RExC_parse_set(p + 1); 6598 vFAIL("Unescaped left brace in regex is " 6599 "illegal here"); 6600 } 6601 ckWARNreg(p + 1, "Unescaped left brace in regex is" 6602 " passed through"); 6603 } 6604 goto normal_default; 6605 case '}': 6606 case ']': 6607 if (p > RExC_parse && RExC_strict) { 6608 ckWARN2reg(p + 1, "Unescaped literal '%c'", *p); 6609 } 6610 /*FALLTHROUGH*/ 6611 default: /* A literal character */ 6612 normal_default: 6613 if (! UTF8_IS_INVARIANT(*p) && UTF) { 6614 STRLEN numlen; 6615 ender = utf8n_to_uvchr((U8*)p, RExC_end - p, 6616 &numlen, UTF8_ALLOW_DEFAULT); 6617 p += numlen; 6618 } 6619 else 6620 ender = (U8) *p++; 6621 break; 6622 } /* End of switch on the literal */ 6623 6624 /* Here, have looked at the literal character, and <ender> 6625 * contains its ordinal; <p> points to the character after it. 6626 * */ 6627 6628 if (ender > 255) { 6629 REQUIRE_UTF8(flagp); 6630 if ( UNICODE_IS_PERL_EXTENDED(ender) 6631 && TO_OUTPUT_WARNINGS(p)) 6632 { 6633 ckWARN2_non_literal_string(p, 6634 packWARN(WARN_PORTABLE), 6635 PL_extended_cp_format, 6636 ender); 6637 } 6638 } 6639 6640 /* We need to check if the next non-ignored thing is a 6641 * quantifier. Move <p> to after anything that should be 6642 * ignored, which, as a side effect, positions <p> for the next 6643 * loop iteration */ 6644 skip_to_be_ignored_text(pRExC_state, &p, 6645 FALSE /* Don't force to /x */ ); 6646 6647 /* If the next thing is a quantifier, it applies to this 6648 * character only, which means that this character has to be in 6649 * its own node and can't just be appended to the string in an 6650 * existing node, so if there are already other characters in 6651 * the node, close the node with just them, and set up to do 6652 * this character again next time through, when it will be the 6653 * only thing in its new node */ 6654 6655 next_is_quantifier = LIKELY(p < RExC_end) 6656 && UNLIKELY(isQUANTIFIER(p, RExC_end)); 6657 6658 if (next_is_quantifier && LIKELY(len)) { 6659 p = oldp; 6660 goto loopdone; 6661 } 6662 6663 /* Ready to add 'ender' to the node */ 6664 6665 if (! FOLD) { /* The simple case, just append the literal */ 6666 not_fold_common: 6667 6668 /* Don't output if it would overflow */ 6669 if (UNLIKELY(len > max_string_len - ((UTF) 6670 ? UVCHR_SKIP(ender) 6671 : 1))) 6672 { 6673 overflowed = TRUE; 6674 break; 6675 } 6676 6677 if (UVCHR_IS_INVARIANT(ender) || ! UTF) { 6678 *(s++) = (char) ender; 6679 } 6680 else { 6681 U8 * new_s = uvchr_to_utf8((U8*)s, ender); 6682 added_len = (char *) new_s - s; 6683 s = (char *) new_s; 6684 6685 if (ender > 255) { 6686 requires_utf8_target = TRUE; 6687 } 6688 } 6689 } 6690 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) { 6691 6692 /* Here are folding under /l, and the code point is 6693 * problematic. If this is the first character in the 6694 * node, change the node type to folding. Otherwise, if 6695 * this is the first problematic character, close up the 6696 * existing node, so can start a new node with this one */ 6697 if (! len) { 6698 node_type = EXACTFL; 6699 RExC_contains_locale = 1; 6700 } 6701 else if (node_type == EXACT) { 6702 p = oldp; 6703 goto loopdone; 6704 } 6705 6706 /* This problematic code point means we can't simplify 6707 * things */ 6708 maybe_exactfu = FALSE; 6709 6710 /* Although these two characters have folds that are 6711 * locale-problematic, they also have folds to above Latin1 6712 * that aren't a problem. Doing these now helps at 6713 * runtime. */ 6714 if (UNLIKELY( ender == GREEK_CAPITAL_LETTER_MU 6715 || ender == LATIN_CAPITAL_LETTER_SHARP_S)) 6716 { 6717 goto fold_anyway; 6718 } 6719 6720 /* Here, we are adding a problematic fold character. 6721 * "Problematic" in this context means that its fold isn't 6722 * known until runtime. (The non-problematic code points 6723 * are the above-Latin1 ones that fold to also all 6724 * above-Latin1. Their folds don't vary no matter what the 6725 * locale is.) But here we have characters whose fold 6726 * depends on the locale. We just add in the unfolded 6727 * character, and wait until runtime to fold it */ 6728 goto not_fold_common; 6729 } 6730 else /* regular fold; see if actually is in a fold */ 6731 if ( (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender)) 6732 || (ender > 255 6733 && ! _invlist_contains_cp(PL_in_some_fold, ender))) 6734 { 6735 /* Here, folding, but the character isn't in a fold. 6736 * 6737 * Start a new node if previous characters in the node were 6738 * folded */ 6739 if (len && node_type != EXACT) { 6740 p = oldp; 6741 goto loopdone; 6742 } 6743 6744 /* Here, continuing a node with non-folded characters. Add 6745 * this one */ 6746 goto not_fold_common; 6747 } 6748 else { /* Here, does participate in some fold */ 6749 6750 /* If this is the first character in the node, change its 6751 * type to folding. Otherwise, if this is the first 6752 * folding character in the node, close up the existing 6753 * node, so can start a new node with this one. */ 6754 if (! len) { 6755 node_type = compute_EXACTish(pRExC_state); 6756 } 6757 else if (node_type == EXACT) { 6758 p = oldp; 6759 goto loopdone; 6760 } 6761 6762 if (UTF) { /* Alway use the folded value for UTF-8 6763 patterns */ 6764 if (UVCHR_IS_INVARIANT(ender)) { 6765 if (UNLIKELY(len + 1 > max_string_len)) { 6766 overflowed = TRUE; 6767 break; 6768 } 6769 6770 *(s)++ = (U8) toFOLD(ender); 6771 } 6772 else { 6773 UV folded; 6774 6775 fold_anyway: 6776 folded = _to_uni_fold_flags( 6777 ender, 6778 (U8 *) s, /* We have allocated extra space 6779 in 's' so can't run off the 6780 end */ 6781 &added_len, 6782 FOLD_FLAGS_FULL 6783 | (( ASCII_FOLD_RESTRICTED 6784 || node_type == EXACTFL) 6785 ? FOLD_FLAGS_NOMIX_ASCII 6786 : 0)); 6787 if (UNLIKELY(len + added_len > max_string_len)) { 6788 overflowed = TRUE; 6789 break; 6790 } 6791 6792 s += added_len; 6793 6794 if ( folded > 255 6795 && LIKELY(folded != GREEK_SMALL_LETTER_MU)) 6796 { 6797 /* U+B5 folds to the MU, so its possible for a 6798 * non-UTF-8 target to match it */ 6799 requires_utf8_target = TRUE; 6800 } 6801 } 6802 } 6803 else { /* Here is non-UTF8. */ 6804 6805 /* The fold will be one or (rarely) two characters. 6806 * Check that there's room for at least a single one 6807 * before setting any flags, etc. Because otherwise an 6808 * overflowing character could cause a flag to be set 6809 * even though it doesn't end up in this node. (For 6810 * the two character fold, we check again, before 6811 * setting any flags) */ 6812 if (UNLIKELY(len + 1 > max_string_len)) { 6813 overflowed = TRUE; 6814 break; 6815 } 6816 6817 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ 6818 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ 6819 || UNICODE_DOT_DOT_VERSION > 0) 6820 6821 /* On non-ancient Unicodes, check for the only possible 6822 * multi-char fold */ 6823 if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) { 6824 6825 /* This potential multi-char fold means the node 6826 * can't be simple (because it could match more 6827 * than a single char). And in some cases it will 6828 * match 'ss', so set that flag */ 6829 maybe_SIMPLE = 0; 6830 has_ss = TRUE; 6831 6832 /* It can't change to be an EXACTFU (unless already 6833 * is one). We fold it iff under /u rules. */ 6834 if (node_type != EXACTFU) { 6835 maybe_exactfu = FALSE; 6836 } 6837 else { 6838 if (UNLIKELY(len + 2 > max_string_len)) { 6839 overflowed = TRUE; 6840 break; 6841 } 6842 6843 *(s++) = 's'; 6844 *(s++) = 's'; 6845 added_len = 2; 6846 6847 goto done_with_this_char; 6848 } 6849 } 6850 else if ( UNLIKELY(isALPHA_FOLD_EQ(ender, 's')) 6851 && LIKELY(len > 0) 6852 && UNLIKELY(isALPHA_FOLD_EQ(*(s-1), 's'))) 6853 { 6854 /* Also, the sequence 'ss' is special when not 6855 * under /u. If the target string is UTF-8, it 6856 * should match SHARP S; otherwise it won't. So, 6857 * here we have to exclude the possibility of this 6858 * node moving to /u.*/ 6859 has_ss = TRUE; 6860 maybe_exactfu = FALSE; 6861 } 6862 #endif 6863 /* Here, the fold will be a single character */ 6864 6865 if (UNLIKELY(ender == MICRO_SIGN)) { 6866 has_micro_sign = TRUE; 6867 } 6868 else if (PL_fold[ender] != PL_fold_latin1[ender]) { 6869 6870 /* If the character's fold differs between /d and 6871 * /u, this can't change to be an EXACTFU node */ 6872 maybe_exactfu = FALSE; 6873 } 6874 6875 *(s++) = (DEPENDS_SEMANTICS) 6876 ? (char) toFOLD(ender) 6877 6878 /* Under /u, the fold of any character in 6879 * the 0-255 range happens to be its 6880 * lowercase equivalent, except for LATIN 6881 * SMALL LETTER SHARP S, which was handled 6882 * above, and the MICRO SIGN, whose fold 6883 * requires UTF-8 to represent. */ 6884 : (char) toLOWER_L1(ender); 6885 } 6886 } /* End of adding current character to the node */ 6887 6888 done_with_this_char: 6889 6890 len += added_len; 6891 6892 if (next_is_quantifier) { 6893 6894 /* Here, the next input is a quantifier, and to get here, 6895 * the current character is the only one in the node. */ 6896 goto loopdone; 6897 } 6898 6899 } /* End of loop through literal characters */ 6900 6901 /* Here we have either exhausted the input or run out of room in 6902 * the node. If the former, we are done. (If we encountered a 6903 * character that can't be in the node, transfer is made directly 6904 * to <loopdone>, and so we wouldn't have fallen off the end of the 6905 * loop.) */ 6906 if (LIKELY(! overflowed)) { 6907 goto loopdone; 6908 } 6909 6910 /* Here we have run out of room. We can grow plain EXACT and 6911 * LEXACT nodes. If the pattern is gigantic enough, though, 6912 * eventually we'll have to artificially chunk the pattern into 6913 * multiple nodes. */ 6914 if (! LOC && (node_type == EXACT || node_type == LEXACT)) { 6915 Size_t overhead = 1 + REGNODE_ARG_LEN(OP(REGNODE_p(ret))); 6916 Size_t overhead_expansion = 0; 6917 char temp[256]; 6918 Size_t max_nodes_for_string; 6919 Size_t achievable; 6920 SSize_t delta; 6921 6922 /* Here we couldn't fit the final character in the current 6923 * node, so it will have to be reparsed, no matter what else we 6924 * do */ 6925 p = oldp; 6926 6927 /* If would have overflowed a regular EXACT node, switch 6928 * instead to an LEXACT. The code below is structured so that 6929 * the actual growing code is common to changing from an EXACT 6930 * or just increasing the LEXACT size. This means that we have 6931 * to save the string in the EXACT case before growing, and 6932 * then copy it afterwards to its new location */ 6933 if (node_type == EXACT) { 6934 overhead_expansion = REGNODE_ARG_LEN(LEXACT) - REGNODE_ARG_LEN(EXACT); 6935 RExC_emit += overhead_expansion; 6936 Copy(s0, temp, len, char); 6937 } 6938 6939 /* Ready to grow. If it was a plain EXACT, the string was 6940 * saved, and the first few bytes of it overwritten by adding 6941 * an argument field. We assume, as we do elsewhere in this 6942 * file, that one byte of remaining input will translate into 6943 * one byte of output, and if that's too small, we grow again, 6944 * if too large the excess memory is freed at the end */ 6945 6946 max_nodes_for_string = U16_MAX - overhead - overhead_expansion; 6947 achievable = MIN(max_nodes_for_string, 6948 current_string_nodes + STR_SZ(RExC_end - p)); 6949 delta = achievable - current_string_nodes; 6950 6951 /* If there is just no more room, go finish up this chunk of 6952 * the pattern. */ 6953 if (delta <= 0) { 6954 goto loopdone; 6955 } 6956 6957 change_engine_size(pRExC_state, delta + overhead_expansion); 6958 current_string_nodes += delta; 6959 max_string_len 6960 = sizeof(struct regnode) * current_string_nodes; 6961 upper_fill = max_string_len + 1; 6962 6963 /* If the length was small, we know this was originally an 6964 * EXACT node now converted to LEXACT, and the string has to be 6965 * restored. Otherwise the string was untouched. 260 is just 6966 * a number safely above 255 so don't have to worry about 6967 * getting it precise */ 6968 if (len < 260) { 6969 node_type = LEXACT; 6970 FILL_NODE(ret, node_type); 6971 s0 = STRING(REGNODE_p(ret)); 6972 Copy(temp, s0, len, char); 6973 s = s0 + len; 6974 } 6975 6976 goto continue_parse; 6977 } 6978 else if (FOLD) { 6979 bool splittable = FALSE; 6980 bool backed_up = FALSE; 6981 char * e; /* should this be U8? */ 6982 char * s_start; /* should this be U8? */ 6983 6984 /* Here is /i. Running out of room creates a problem if we are 6985 * folding, and the split happens in the middle of a 6986 * multi-character fold, as a match that should have occurred, 6987 * won't, due to the way nodes are matched, and our artificial 6988 * boundary. So back off until we aren't splitting such a 6989 * fold. If there is no such place to back off to, we end up 6990 * taking the entire node as-is. This can happen if the node 6991 * consists entirely of 'f' or entirely of 's' characters (or 6992 * things that fold to them) as 'ff' and 'ss' are 6993 * multi-character folds. 6994 * 6995 * The Unicode standard says that multi character folds consist 6996 * of either two or three characters. That means we would be 6997 * splitting one if the final character in the node is at the 6998 * beginning of either type, or is the second of a three 6999 * character fold. 7000 * 7001 * At this point: 7002 * ender is the code point of the character that won't fit 7003 * in the node 7004 * s points to just beyond the final byte in the node. 7005 * It's where we would place ender if there were 7006 * room, and where in fact we do place ender's fold 7007 * in the code below, as we've over-allocated space 7008 * for s0 (hence s) to allow for this 7009 * e starts at 's' and advances as we append things. 7010 * old_s is the same as 's'. (If ender had fit, 's' would 7011 * have been advanced to beyond it). 7012 * old_old_s points to the beginning byte of the final 7013 * character in the node 7014 * p points to the beginning byte in the input of the 7015 * character beyond 'ender'. 7016 * oldp points to the beginning byte in the input of 7017 * 'ender'. 7018 * 7019 * In the case of /il, we haven't folded anything that could be 7020 * affected by the locale. That means only above-Latin1 7021 * characters that fold to other above-latin1 characters get 7022 * folded at compile time. To check where a good place to 7023 * split nodes is, everything in it will have to be folded. 7024 * The boolean 'maybe_exactfu' keeps track in /il if there are 7025 * any unfolded characters in the node. */ 7026 bool need_to_fold_loc = LOC && ! maybe_exactfu; 7027 7028 /* If we do need to fold the node, we need a place to store the 7029 * folded copy, and a way to map back to the unfolded original 7030 * */ 7031 char * locfold_buf = NULL; 7032 Size_t * loc_correspondence = NULL; 7033 7034 if (! need_to_fold_loc) { /* The normal case. Just 7035 initialize to the actual node */ 7036 e = s; 7037 s_start = s0; 7038 s = old_old_s; /* Point to the beginning of the final char 7039 that fits in the node */ 7040 } 7041 else { 7042 7043 /* Here, we have filled a /il node, and there are unfolded 7044 * characters in it. If the runtime locale turns out to be 7045 * UTF-8, there are possible multi-character folds, just 7046 * like when not under /l. The node hence can't terminate 7047 * in the middle of such a fold. To determine this, we 7048 * have to create a folded copy of this node. That means 7049 * reparsing the node, folding everything assuming a UTF-8 7050 * locale. (If at runtime it isn't such a locale, the 7051 * actions here wouldn't have been necessary, but we have 7052 * to assume the worst case.) If we find we need to back 7053 * off the folded string, we do so, and then map that 7054 * position back to the original unfolded node, which then 7055 * gets output, truncated at that spot */ 7056 7057 char * redo_p = RExC_parse; 7058 char * redo_e; 7059 char * old_redo_e; 7060 7061 /* Allow enough space assuming a single byte input folds to 7062 * a single byte output, plus assume that the two unparsed 7063 * characters (that we may need) fold to the largest number 7064 * of bytes possible, plus extra for one more worst case 7065 * scenario. In the loop below, if we start eating into 7066 * that final spare space, we enlarge this initial space */ 7067 Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1; 7068 7069 Newxz(locfold_buf, size, char); 7070 Newxz(loc_correspondence, size, Size_t); 7071 7072 /* Redo this node's parse, folding into 'locfold_buf' */ 7073 redo_p = RExC_parse; 7074 old_redo_e = redo_e = locfold_buf; 7075 while (redo_p <= oldp) { 7076 7077 old_redo_e = redo_e; 7078 loc_correspondence[redo_e - locfold_buf] 7079 = redo_p - RExC_parse; 7080 7081 if (UTF) { 7082 Size_t added_len; 7083 7084 (void) _to_utf8_fold_flags((U8 *) redo_p, 7085 (U8 *) RExC_end, 7086 (U8 *) redo_e, 7087 &added_len, 7088 FOLD_FLAGS_FULL); 7089 redo_e += added_len; 7090 redo_p += UTF8SKIP(redo_p); 7091 } 7092 else { 7093 7094 /* Note that if this code is run on some ancient 7095 * Unicode versions, SHARP S doesn't fold to 'ss', 7096 * but rather than clutter the code with #ifdef's, 7097 * as is done above, we ignore that possibility. 7098 * This is ok because this code doesn't affect what 7099 * gets matched, but merely where the node gets 7100 * split */ 7101 if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) { 7102 *redo_e++ = toLOWER_L1(UCHARAT(redo_p)); 7103 } 7104 else { 7105 *redo_e++ = 's'; 7106 *redo_e++ = 's'; 7107 } 7108 redo_p++; 7109 } 7110 7111 7112 /* If we're getting so close to the end that a 7113 * worst-case fold in the next character would cause us 7114 * to overflow, increase, assuming one byte output byte 7115 * per one byte input one, plus room for another worst 7116 * case fold */ 7117 if ( redo_p <= oldp 7118 && redo_e > locfold_buf + size 7119 - (UTF8_MAXBYTES_CASE + 1)) 7120 { 7121 Size_t new_size = size 7122 + (oldp - redo_p) 7123 + UTF8_MAXBYTES_CASE + 1; 7124 Ptrdiff_t e_offset = redo_e - locfold_buf; 7125 7126 Renew(locfold_buf, new_size, char); 7127 Renew(loc_correspondence, new_size, Size_t); 7128 size = new_size; 7129 7130 redo_e = locfold_buf + e_offset; 7131 } 7132 } 7133 7134 /* Set so that things are in terms of the folded, temporary 7135 * string */ 7136 s = old_redo_e; 7137 s_start = locfold_buf; 7138 e = redo_e; 7139 7140 } 7141 7142 /* Here, we have 's', 's_start' and 'e' set up to point to the 7143 * input that goes into the node, folded. 7144 * 7145 * If the final character of the node and the fold of ender 7146 * form the first two characters of a three character fold, we 7147 * need to peek ahead at the next (unparsed) character in the 7148 * input to determine if the three actually do form such a 7149 * fold. Just looking at that character is not generally 7150 * sufficient, as it could be, for example, an escape sequence 7151 * that evaluates to something else, and it needs to be folded. 7152 * 7153 * khw originally thought to just go through the parse loop one 7154 * extra time, but that doesn't work easily as that iteration 7155 * could cause things to think that the parse is over and to 7156 * goto loopdone. The character could be a '$' for example, or 7157 * the character beyond could be a quantifier, and other 7158 * glitches as well. 7159 * 7160 * The solution used here for peeking ahead is to look at that 7161 * next character. If it isn't ASCII punctuation, then it will 7162 * be something that would continue on in an EXACTish node if 7163 * there were space. We append the fold of it to s, having 7164 * reserved enough room in s0 for the purpose. If we can't 7165 * reasonably peek ahead, we instead assume the worst case: 7166 * that it is something that would form the completion of a 7167 * multi-char fold. 7168 * 7169 * If we can't split between s and ender, we work backwards 7170 * character-by-character down to s0. At each current point 7171 * see if we are at the beginning of a multi-char fold. If so, 7172 * that means we would be splitting the fold across nodes, and 7173 * so we back up one and try again. 7174 * 7175 * If we're not at the beginning, we still could be at the 7176 * final two characters of a (rare) three character fold. We 7177 * check if the sequence starting at the character before the 7178 * current position (and including the current and next 7179 * characters) is a three character fold. If not, the node can 7180 * be split here. If it is, we have to backup two characters 7181 * and try again. 7182 * 7183 * Otherwise, the node can be split at the current position. 7184 * 7185 * The same logic is used for UTF-8 patterns and not */ 7186 if (UTF) { 7187 Size_t added_len; 7188 7189 /* Append the fold of ender */ 7190 (void) _to_uni_fold_flags( 7191 ender, 7192 (U8 *) e, 7193 &added_len, 7194 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) 7195 ? FOLD_FLAGS_NOMIX_ASCII 7196 : 0)); 7197 e += added_len; 7198 7199 /* 's' and the character folded to by ender may be the 7200 * first two of a three-character fold, in which case the 7201 * node should not be split here. That may mean examining 7202 * the so-far unparsed character starting at 'p'. But if 7203 * ender folded to more than one character, we already have 7204 * three characters to look at. Also, we first check if 7205 * the sequence consisting of s and the next character form 7206 * the first two of some three character fold. If not, 7207 * there's no need to peek ahead. */ 7208 if ( added_len <= UTF8SKIP(e - added_len) 7209 && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_utf8_safe(s, e))) 7210 { 7211 /* Here, the two do form the beginning of a potential 7212 * three character fold. The unexamined character may 7213 * or may not complete it. Peek at it. It might be 7214 * something that ends the node or an escape sequence, 7215 * in which case we don't know without a lot of work 7216 * what it evaluates to, so we have to assume the worst 7217 * case: that it does complete the fold, and so we 7218 * can't split here. All such instances will have 7219 * that character be an ASCII punctuation character, 7220 * like a backslash. So, for that case, backup one and 7221 * drop down to try at that position */ 7222 if (isPUNCT(*p)) { 7223 s = (char *) utf8_hop_back((U8 *) s, -1, 7224 (U8 *) s_start); 7225 backed_up = TRUE; 7226 } 7227 else { 7228 /* Here, since it's not punctuation, it must be a 7229 * real character, and we can append its fold to 7230 * 'e' (having deliberately reserved enough space 7231 * for this eventuality) and drop down to check if 7232 * the three actually do form a folded sequence */ 7233 (void) _to_utf8_fold_flags( 7234 (U8 *) p, (U8 *) RExC_end, 7235 (U8 *) e, 7236 &added_len, 7237 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) 7238 ? FOLD_FLAGS_NOMIX_ASCII 7239 : 0)); 7240 e += added_len; 7241 } 7242 } 7243 7244 /* Here, we either have three characters available in 7245 * sequence starting at 's', or we have two characters and 7246 * know that the following one can't possibly be part of a 7247 * three character fold. We go through the node backwards 7248 * until we find a place where we can split it without 7249 * breaking apart a multi-character fold. At any given 7250 * point we have to worry about if such a fold begins at 7251 * the current 's', and also if a three-character fold 7252 * begins at s-1, (containing s and s+1). Splitting in 7253 * either case would break apart a fold */ 7254 do { 7255 char *prev_s = (char *) utf8_hop_back((U8 *) s, -1, 7256 (U8 *) s_start); 7257 7258 /* If is a multi-char fold, can't split here. Backup 7259 * one char and try again */ 7260 if (UNLIKELY(is_MULTI_CHAR_FOLD_utf8_safe(s, e))) { 7261 s = prev_s; 7262 backed_up = TRUE; 7263 continue; 7264 } 7265 7266 /* If the two characters beginning at 's' are part of a 7267 * three character fold starting at the character 7268 * before s, we can't split either before or after s. 7269 * Backup two chars and try again */ 7270 if ( LIKELY(s > s_start) 7271 && UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e))) 7272 { 7273 s = prev_s; 7274 s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s_start); 7275 backed_up = TRUE; 7276 continue; 7277 } 7278 7279 /* Here there's no multi-char fold between s and the 7280 * next character following it. We can split */ 7281 splittable = TRUE; 7282 break; 7283 7284 } while (s > s_start); /* End of loops backing up through the node */ 7285 7286 /* Here we either couldn't find a place to split the node, 7287 * or else we broke out of the loop setting 'splittable' to 7288 * true. In the latter case, the place to split is between 7289 * the first and second characters in the sequence starting 7290 * at 's' */ 7291 if (splittable) { 7292 s += UTF8SKIP(s); 7293 } 7294 } 7295 else { /* Pattern not UTF-8 */ 7296 if ( ender != LATIN_SMALL_LETTER_SHARP_S 7297 || ASCII_FOLD_RESTRICTED) 7298 { 7299 assert( toLOWER_L1(ender) < 256 ); 7300 *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */ 7301 } 7302 else { 7303 *e++ = 's'; 7304 *e++ = 's'; 7305 } 7306 7307 if ( e - s <= 1 7308 && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_latin1_safe(s, e))) 7309 { 7310 if (isPUNCT(*p)) { 7311 s--; 7312 backed_up = TRUE; 7313 } 7314 else { 7315 if ( UCHARAT(p) != LATIN_SMALL_LETTER_SHARP_S 7316 || ASCII_FOLD_RESTRICTED) 7317 { 7318 assert( toLOWER_L1(ender) < 256 ); 7319 *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */ 7320 } 7321 else { 7322 *e++ = 's'; 7323 *e++ = 's'; 7324 } 7325 } 7326 } 7327 7328 do { 7329 if (UNLIKELY(is_MULTI_CHAR_FOLD_latin1_safe(s, e))) { 7330 s--; 7331 backed_up = TRUE; 7332 continue; 7333 } 7334 7335 if ( LIKELY(s > s_start) 7336 && UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e))) 7337 { 7338 s -= 2; 7339 backed_up = TRUE; 7340 continue; 7341 } 7342 7343 splittable = TRUE; 7344 break; 7345 7346 } while (s > s_start); 7347 7348 if (splittable) { 7349 s++; 7350 } 7351 } 7352 7353 /* Here, we are done backing up. If we didn't backup at all 7354 * (the likely case), just proceed */ 7355 if (backed_up) { 7356 7357 /* If we did find a place to split, reparse the entire node 7358 * stopping where we have calculated. */ 7359 if (splittable) { 7360 7361 /* If we created a temporary folded string under /l, we 7362 * have to map that back to the original */ 7363 if (need_to_fold_loc) { 7364 upper_fill = loc_correspondence[s - s_start]; 7365 if (upper_fill == 0) { 7366 FAIL2("panic: loc_correspondence[%d] is 0", 7367 (int) (s - s_start)); 7368 } 7369 Safefree(locfold_buf); 7370 Safefree(loc_correspondence); 7371 } 7372 else { 7373 upper_fill = s - s0; 7374 } 7375 goto reparse; 7376 } 7377 7378 /* Here the node consists entirely of non-final multi-char 7379 * folds. (Likely it is all 'f's or all 's's.) There's no 7380 * decent place to split it, so give up and just take the 7381 * whole thing */ 7382 len = old_s - s0; 7383 } 7384 7385 if (need_to_fold_loc) { 7386 Safefree(locfold_buf); 7387 Safefree(loc_correspondence); 7388 } 7389 } /* End of verifying node ends with an appropriate char */ 7390 7391 /* We need to start the next node at the character that didn't fit 7392 * in this one */ 7393 p = oldp; 7394 7395 loopdone: /* Jumped to when encounters something that shouldn't be 7396 in the node */ 7397 7398 /* Free up any over-allocated space; cast is to silence bogus 7399 * warning in MS VC */ 7400 change_engine_size(pRExC_state, 7401 - (Ptrdiff_t) (current_string_nodes - STR_SZ(len))); 7402 7403 /* I (khw) don't know if you can get here with zero length, but the 7404 * old code handled this situation by creating a zero-length EXACT 7405 * node. Might as well be NOTHING instead */ 7406 if (len == 0) { 7407 OP(REGNODE_p(ret)) = NOTHING; 7408 } 7409 else { 7410 7411 /* If the node type is EXACT here, check to see if it 7412 * should be EXACTL, or EXACT_REQ8. */ 7413 if (node_type == EXACT) { 7414 if (LOC) { 7415 node_type = EXACTL; 7416 } 7417 else if (requires_utf8_target) { 7418 node_type = EXACT_REQ8; 7419 } 7420 } 7421 else if (node_type == LEXACT) { 7422 if (requires_utf8_target) { 7423 node_type = LEXACT_REQ8; 7424 } 7425 } 7426 else if (FOLD) { 7427 if ( UNLIKELY(has_micro_sign || has_ss) 7428 && (node_type == EXACTFU || ( node_type == EXACTF 7429 && maybe_exactfu))) 7430 { /* These two conditions are problematic in non-UTF-8 7431 EXACTFU nodes. */ 7432 assert(! UTF); 7433 node_type = EXACTFUP; 7434 } 7435 else if (node_type == EXACTFL) { 7436 7437 /* 'maybe_exactfu' is deliberately set above to 7438 * indicate this node type, where all code points in it 7439 * are above 255 */ 7440 if (maybe_exactfu) { 7441 node_type = EXACTFLU8; 7442 } 7443 else if (UNLIKELY( 7444 _invlist_contains_cp(PL_HasMultiCharFold, ender))) 7445 { 7446 /* A character that folds to more than one will 7447 * match multiple characters, so can't be SIMPLE. 7448 * We don't have to worry about this with EXACTFLU8 7449 * nodes just above, as they have already been 7450 * folded (since the fold doesn't vary at run 7451 * time). Here, if the final character in the node 7452 * folds to multiple, it can't be simple. (This 7453 * only has an effect if the node has only a single 7454 * character, hence the final one, as elsewhere we 7455 * turn off simple for nodes whose length > 1 */ 7456 maybe_SIMPLE = 0; 7457 } 7458 } 7459 else if (node_type == EXACTF) { /* Means is /di */ 7460 7461 /* This intermediate variable is needed solely because 7462 * the asserts in the macro where used exceed Win32's 7463 * literal string capacity */ 7464 char first_char = * STRING(REGNODE_p(ret)); 7465 7466 /* If 'maybe_exactfu' is clear, then we need to stay 7467 * /di. If it is set, it means there are no code 7468 * points that match differently depending on UTF8ness 7469 * of the target string, so it can become an EXACTFU 7470 * node */ 7471 if (! maybe_exactfu) { 7472 RExC_seen_d_op = TRUE; 7473 } 7474 else if ( isALPHA_FOLD_EQ(first_char, 's') 7475 || isALPHA_FOLD_EQ(ender, 's')) 7476 { 7477 /* But, if the node begins or ends in an 's' we 7478 * have to defer changing it into an EXACTFU, as 7479 * the node could later get joined with another one 7480 * that ends or begins with 's' creating an 'ss' 7481 * sequence which would then wrongly match the 7482 * sharp s without the target being UTF-8. We 7483 * create a special node that we resolve later when 7484 * we join nodes together */ 7485 7486 node_type = EXACTFU_S_EDGE; 7487 } 7488 else { 7489 node_type = EXACTFU; 7490 } 7491 } 7492 7493 if (requires_utf8_target && node_type == EXACTFU) { 7494 node_type = EXACTFU_REQ8; 7495 } 7496 } 7497 7498 OP(REGNODE_p(ret)) = node_type; 7499 setSTR_LEN(REGNODE_p(ret), len); 7500 RExC_emit += STR_SZ(len); 7501 7502 /* If the node isn't a single character, it can't be SIMPLE */ 7503 if (len > (Size_t) ((UTF) ? UTF8SKIP(STRING(REGNODE_p(ret))) : 1)) { 7504 maybe_SIMPLE = 0; 7505 } 7506 7507 *flagp |= HASWIDTH | maybe_SIMPLE; 7508 } 7509 7510 RExC_parse_set(p); 7511 7512 { 7513 /* len is STRLEN which is unsigned, need to copy to signed */ 7514 IV iv = len; 7515 if (iv < 0) 7516 vFAIL("Internal disaster"); 7517 } 7518 7519 } /* End of label 'defchar:' */ 7520 break; 7521 } /* End of giant switch on input character */ 7522 7523 /* Position parse to next real character */ 7524 skip_to_be_ignored_text(pRExC_state, &RExC_parse, 7525 FALSE /* Don't force to /x */ ); 7526 if ( *RExC_parse == '{' 7527 && OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse, RExC_end, NULL)) 7528 { 7529 if (RExC_strict) { 7530 RExC_parse_inc_by(1); 7531 vFAIL("Unescaped left brace in regex is illegal here"); 7532 } 7533 ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is" 7534 " passed through"); 7535 } 7536 7537 return(ret); 7538 } 7539 7540 7541 #ifdef PERL_RE_BUILD_AUX 7542 void 7543 Perl_populate_anyof_bitmap_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) 7544 { 7545 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It 7546 * sets up the bitmap and any flags, removing those code points from the 7547 * inversion list, setting it to NULL should it become completely empty */ 7548 7549 7550 PERL_ARGS_ASSERT_POPULATE_ANYOF_BITMAP_FROM_INVLIST; 7551 7552 /* There is no bitmap for this node type */ 7553 if (REGNODE_TYPE(OP(node)) != ANYOF) { 7554 return; 7555 } 7556 7557 ANYOF_BITMAP_ZERO(node); 7558 if (*invlist_ptr) { 7559 7560 /* This gets set if we actually need to modify things */ 7561 bool change_invlist = FALSE; 7562 7563 UV start, end; 7564 7565 /* Start looking through *invlist_ptr */ 7566 invlist_iterinit(*invlist_ptr); 7567 while (invlist_iternext(*invlist_ptr, &start, &end)) { 7568 UV high; 7569 int i; 7570 7571 /* Quit if are above what we should change */ 7572 if (start >= NUM_ANYOF_CODE_POINTS) { 7573 break; 7574 } 7575 7576 change_invlist = TRUE; 7577 7578 /* Set all the bits in the range, up to the max that we are doing */ 7579 high = (end < NUM_ANYOF_CODE_POINTS - 1) 7580 ? end 7581 : NUM_ANYOF_CODE_POINTS - 1; 7582 for (i = start; i <= (int) high; i++) { 7583 ANYOF_BITMAP_SET(node, i); 7584 } 7585 } 7586 invlist_iterfinish(*invlist_ptr); 7587 7588 /* Done with loop; remove any code points that are in the bitmap from 7589 * *invlist_ptr */ 7590 if (change_invlist) { 7591 _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr); 7592 } 7593 7594 /* If have completely emptied it, remove it completely */ 7595 if (_invlist_len(*invlist_ptr) == 0) { 7596 SvREFCNT_dec_NN(*invlist_ptr); 7597 *invlist_ptr = NULL; 7598 } 7599 } 7600 } 7601 #endif /* PERL_RE_BUILD_AUX */ 7602 7603 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]]. 7604 Character classes ([:foo:]) can also be negated ([:^foo:]). 7605 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise. 7606 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed, 7607 but trigger failures because they are currently unimplemented. */ 7608 7609 #define POSIXCC_DONE(c) ((c) == ':') 7610 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.') 7611 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c)) 7612 #define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';') 7613 7614 #define WARNING_PREFIX "Assuming NOT a POSIX class since " 7615 #define NO_BLANKS_POSIX_WARNING "no blanks are allowed in one" 7616 #define SEMI_COLON_POSIX_WARNING "a semi-colon was found instead of a colon" 7617 7618 #define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1) 7619 7620 /* 'posix_warnings' and 'warn_text' are names of variables in the following 7621 * routine. q.v. */ 7622 #define ADD_POSIX_WARNING(p, text) STMT_START { \ 7623 if (posix_warnings) { \ 7624 if (! RExC_warn_text ) RExC_warn_text = \ 7625 (AV *) sv_2mortal((SV *) newAV()); \ 7626 av_push_simple(RExC_warn_text, Perl_newSVpvf(aTHX_ \ 7627 WARNING_PREFIX \ 7628 text \ 7629 REPORT_LOCATION, \ 7630 REPORT_LOCATION_ARGS(p))); \ 7631 } \ 7632 } STMT_END 7633 #define CLEAR_POSIX_WARNINGS() \ 7634 STMT_START { \ 7635 if (posix_warnings && RExC_warn_text) \ 7636 av_clear(RExC_warn_text); \ 7637 } STMT_END 7638 7639 #define CLEAR_POSIX_WARNINGS_AND_RETURN(ret) \ 7640 STMT_START { \ 7641 CLEAR_POSIX_WARNINGS(); \ 7642 return ret; \ 7643 } STMT_END 7644 7645 STATIC int 7646 S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state, 7647 7648 const char * const s, /* Where the putative posix class begins. 7649 Normally, this is one past the '['. This 7650 parameter exists so it can be somewhere 7651 besides RExC_parse. */ 7652 char ** updated_parse_ptr, /* Where to set the updated parse pointer, or 7653 NULL */ 7654 AV ** posix_warnings, /* Where to place any generated warnings, or 7655 NULL */ 7656 const bool check_only /* Don't die if error */ 7657 ) 7658 { 7659 /* This parses what the caller thinks may be one of the three POSIX 7660 * constructs: 7661 * 1) a character class, like [:blank:] 7662 * 2) a collating symbol, like [. .] 7663 * 3) an equivalence class, like [= =] 7664 * In the latter two cases, it croaks if it finds a syntactically legal 7665 * one, as these are not handled by Perl. 7666 * 7667 * The main purpose is to look for a POSIX character class. It returns: 7668 * a) the class number 7669 * if it is a completely syntactically and semantically legal class. 7670 * 'updated_parse_ptr', if not NULL, is set to point to just after the 7671 * closing ']' of the class 7672 * b) OOB_NAMEDCLASS 7673 * if it appears that one of the three POSIX constructs was meant, but 7674 * its specification was somehow defective. 'updated_parse_ptr', if 7675 * not NULL, is set to point to the character just after the end 7676 * character of the class. See below for handling of warnings. 7677 * c) NOT_MEANT_TO_BE_A_POSIX_CLASS 7678 * if it doesn't appear that a POSIX construct was intended. 7679 * 'updated_parse_ptr' is not changed. No warnings nor errors are 7680 * raised. 7681 * 7682 * In b) there may be errors or warnings generated. If 'check_only' is 7683 * TRUE, then any errors are discarded. Warnings are returned to the 7684 * caller via an AV* created into '*posix_warnings' if it is not NULL. If 7685 * instead it is NULL, warnings are suppressed. 7686 * 7687 * The reason for this function, and its complexity is that a bracketed 7688 * character class can contain just about anything. But it's easy to 7689 * mistype the very specific posix class syntax but yielding a valid 7690 * regular bracketed class, so it silently gets compiled into something 7691 * quite unintended. 7692 * 7693 * The solution adopted here maintains backward compatibility except that 7694 * it adds a warning if it looks like a posix class was intended but 7695 * improperly specified. The warning is not raised unless what is input 7696 * very closely resembles one of the 14 legal posix classes. To do this, 7697 * it uses fuzzy parsing. It calculates how many single-character edits it 7698 * would take to transform what was input into a legal posix class. Only 7699 * if that number is quite small does it think that the intention was a 7700 * posix class. Obviously these are heuristics, and there will be cases 7701 * where it errs on one side or another, and they can be tweaked as 7702 * experience informs. 7703 * 7704 * The syntax for a legal posix class is: 7705 * 7706 * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/ 7707 * 7708 * What this routine considers syntactically to be an intended posix class 7709 * is this (the comments indicate some restrictions that the pattern 7710 * doesn't show): 7711 * 7712 * qr/(?x: \[? # The left bracket, possibly 7713 * # omitted 7714 * \h* # possibly followed by blanks 7715 * (?: \^ \h* )? # possibly a misplaced caret 7716 * [:;]? # The opening class character, 7717 * # possibly omitted. A typo 7718 * # semi-colon can also be used. 7719 * \h* 7720 * \^? # possibly a correctly placed 7721 * # caret, but not if there was also 7722 * # a misplaced one 7723 * \h* 7724 * .{3,15} # The class name. If there are 7725 * # deviations from the legal syntax, 7726 * # its edit distance must be close 7727 * # to a real class name in order 7728 * # for it to be considered to be 7729 * # an intended posix class. 7730 * \h* 7731 * [[:punct:]]? # The closing class character, 7732 * # possibly omitted. If not a colon 7733 * # nor semi colon, the class name 7734 * # must be even closer to a valid 7735 * # one 7736 * \h* 7737 * \]? # The right bracket, possibly 7738 * # omitted. 7739 * )/ 7740 * 7741 * In the above, \h must be ASCII-only. 7742 * 7743 * These are heuristics, and can be tweaked as field experience dictates. 7744 * There will be cases when someone didn't intend to specify a posix class 7745 * that this warns as being so. The goal is to minimize these, while 7746 * maximizing the catching of things intended to be a posix class that 7747 * aren't parsed as such. 7748 */ 7749 7750 const char* p = s; 7751 const char * const e = RExC_end; 7752 unsigned complement = 0; /* If to complement the class */ 7753 bool found_problem = FALSE; /* Assume OK until proven otherwise */ 7754 bool has_opening_bracket = FALSE; 7755 bool has_opening_colon = FALSE; 7756 int class_number = OOB_NAMEDCLASS; /* Out-of-bounds until find 7757 valid class */ 7758 const char * possible_end = NULL; /* used for a 2nd parse pass */ 7759 const char* name_start; /* ptr to class name first char */ 7760 7761 /* If the number of single-character typos the input name is away from a 7762 * legal name is no more than this number, it is considered to have meant 7763 * the legal name */ 7764 int max_distance = 2; 7765 7766 /* to store the name. The size determines the maximum length before we 7767 * decide that no posix class was intended. Should be at least 7768 * sizeof("alphanumeric") */ 7769 UV input_text[15]; 7770 STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric"); 7771 7772 PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX; 7773 7774 CLEAR_POSIX_WARNINGS(); 7775 7776 if (p >= e) { 7777 return NOT_MEANT_TO_BE_A_POSIX_CLASS; 7778 } 7779 7780 if (*(p - 1) != '[') { 7781 ADD_POSIX_WARNING(p, "it doesn't start with a '['"); 7782 found_problem = TRUE; 7783 } 7784 else { 7785 has_opening_bracket = TRUE; 7786 } 7787 7788 /* They could be confused and think you can put spaces between the 7789 * components */ 7790 if (isBLANK(*p)) { 7791 found_problem = TRUE; 7792 7793 do { 7794 p++; 7795 } while (p < e && isBLANK(*p)); 7796 7797 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING); 7798 } 7799 7800 /* For [. .] and [= =]. These are quite different internally from [: :], 7801 * so they are handled separately. */ 7802 if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']' 7803 and 1 for at least one char in it 7804 */ 7805 { 7806 const char open_char = *p; 7807 const char * temp_ptr = p + 1; 7808 7809 /* These two constructs are not handled by perl, and if we find a 7810 * syntactically valid one, we croak. khw, who wrote this code, finds 7811 * this explanation of them very unclear: 7812 * https://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html 7813 * And searching the rest of the internet wasn't very helpful either. 7814 * It looks like just about any byte can be in these constructs, 7815 * depending on the locale. But unless the pattern is being compiled 7816 * under /l, which is very rare, Perl runs under the C or POSIX locale. 7817 * In that case, it looks like [= =] isn't allowed at all, and that 7818 * [. .] could be any single code point, but for longer strings the 7819 * constituent characters would have to be the ASCII alphabetics plus 7820 * the minus-hyphen. Any sensible locale definition would limit itself 7821 * to these. And any portable one definitely should. Trying to parse 7822 * the general case is a nightmare (see [perl #127604]). So, this code 7823 * looks only for interiors of these constructs that match: 7824 * qr/.|[-\w]{2,}/ 7825 * Using \w relaxes the apparent rules a little, without adding much 7826 * danger of mistaking something else for one of these constructs. 7827 * 7828 * [. .] in some implementations described on the internet is usable to 7829 * escape a character that otherwise is special in bracketed character 7830 * classes. For example [.].] means a literal right bracket instead of 7831 * the ending of the class 7832 * 7833 * [= =] can legitimately contain a [. .] construct, but we don't 7834 * handle this case, as that [. .] construct will later get parsed 7835 * itself and croak then. And [= =] is checked for even when not under 7836 * /l, as Perl has long done so. 7837 * 7838 * The code below relies on there being a trailing NUL, so it doesn't 7839 * have to keep checking if the parse ptr < e. 7840 */ 7841 if (temp_ptr[1] == open_char) { 7842 temp_ptr++; 7843 } 7844 else while ( temp_ptr < e 7845 && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-')) 7846 { 7847 temp_ptr++; 7848 } 7849 7850 if (*temp_ptr == open_char) { 7851 temp_ptr++; 7852 if (*temp_ptr == ']') { 7853 temp_ptr++; 7854 if (! found_problem && ! check_only) { 7855 RExC_parse_set((char *) temp_ptr); 7856 vFAIL3("POSIX syntax [%c %c] is reserved for future " 7857 "extensions", open_char, open_char); 7858 } 7859 7860 /* Here, the syntax wasn't completely valid, or else the call 7861 * is to check-only */ 7862 if (updated_parse_ptr) { 7863 *updated_parse_ptr = (char *) temp_ptr; 7864 } 7865 7866 CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS); 7867 } 7868 } 7869 7870 /* If we find something that started out to look like one of these 7871 * constructs, but isn't, we continue below so that it can be checked 7872 * for being a class name with a typo of '.' or '=' instead of a colon. 7873 * */ 7874 } 7875 7876 /* Here, we think there is a possibility that a [: :] class was meant, and 7877 * we have the first real character. It could be they think the '^' comes 7878 * first */ 7879 if (*p == '^') { 7880 found_problem = TRUE; 7881 ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon"); 7882 complement = 1; 7883 p++; 7884 7885 if (isBLANK(*p)) { 7886 found_problem = TRUE; 7887 7888 do { 7889 p++; 7890 } while (p < e && isBLANK(*p)); 7891 7892 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING); 7893 } 7894 } 7895 7896 /* But the first character should be a colon, which they could have easily 7897 * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to 7898 * distinguish from a colon, so treat that as a colon). */ 7899 if (*p == ':') { 7900 p++; 7901 has_opening_colon = TRUE; 7902 } 7903 else if (*p == ';') { 7904 found_problem = TRUE; 7905 p++; 7906 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING); 7907 has_opening_colon = TRUE; 7908 } 7909 else { 7910 found_problem = TRUE; 7911 ADD_POSIX_WARNING(p, "there must be a starting ':'"); 7912 7913 /* Consider an initial punctuation (not one of the recognized ones) to 7914 * be a left terminator */ 7915 if (*p != '^' && *p != ']' && isPUNCT(*p)) { 7916 p++; 7917 } 7918 } 7919 7920 /* They may think that you can put spaces between the components */ 7921 if (isBLANK(*p)) { 7922 found_problem = TRUE; 7923 7924 do { 7925 p++; 7926 } while (p < e && isBLANK(*p)); 7927 7928 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING); 7929 } 7930 7931 if (*p == '^') { 7932 7933 /* We consider something like [^:^alnum:]] to not have been intended to 7934 * be a posix class, but XXX maybe we should */ 7935 if (complement) { 7936 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS); 7937 } 7938 7939 complement = 1; 7940 p++; 7941 } 7942 7943 /* Again, they may think that you can put spaces between the components */ 7944 if (isBLANK(*p)) { 7945 found_problem = TRUE; 7946 7947 do { 7948 p++; 7949 } while (p < e && isBLANK(*p)); 7950 7951 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING); 7952 } 7953 7954 if (*p == ']') { 7955 7956 /* XXX This ']' may be a typo, and something else was meant. But 7957 * treating it as such creates enough complications, that that 7958 * possibility isn't currently considered here. So we assume that the 7959 * ']' is what is intended, and if we've already found an initial '[', 7960 * this leaves this construct looking like [:] or [:^], which almost 7961 * certainly weren't intended to be posix classes */ 7962 if (has_opening_bracket) { 7963 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS); 7964 } 7965 7966 /* But this function can be called when we parse the colon for 7967 * something like qr/[alpha:]]/, so we back up to look for the 7968 * beginning */ 7969 p--; 7970 7971 if (*p == ';') { 7972 found_problem = TRUE; 7973 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING); 7974 } 7975 else if (*p != ':') { 7976 7977 /* XXX We are currently very restrictive here, so this code doesn't 7978 * consider the possibility that, say, /[alpha.]]/ was intended to 7979 * be a posix class. */ 7980 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS); 7981 } 7982 7983 /* Here we have something like 'foo:]'. There was no initial colon, 7984 * and we back up over 'foo. XXX Unlike the going forward case, we 7985 * don't handle typos of non-word chars in the middle */ 7986 has_opening_colon = FALSE; 7987 p--; 7988 7989 while (p > RExC_start && isWORDCHAR(*p)) { 7990 p--; 7991 } 7992 p++; 7993 7994 /* Here, we have positioned ourselves to where we think the first 7995 * character in the potential class is */ 7996 } 7997 7998 /* Now the interior really starts. There are certain key characters that 7999 * can end the interior, or these could just be typos. To catch both 8000 * cases, we may have to do two passes. In the first pass, we keep on 8001 * going unless we come to a sequence that matches 8002 * qr/ [[:punct:]] [[:blank:]]* \] /xa 8003 * This means it takes a sequence to end the pass, so two typos in a row if 8004 * that wasn't what was intended. If the class is perfectly formed, just 8005 * this one pass is needed. We also stop if there are too many characters 8006 * being accumulated, but this number is deliberately set higher than any 8007 * real class. It is set high enough so that someone who thinks that 8008 * 'alphanumeric' is a correct name would get warned that it wasn't. 8009 * While doing the pass, we keep track of where the key characters were in 8010 * it. If we don't find an end to the class, and one of the key characters 8011 * was found, we redo the pass, but stop when we get to that character. 8012 * Thus the key character was considered a typo in the first pass, but a 8013 * terminator in the second. If two key characters are found, we stop at 8014 * the second one in the first pass. Again this can miss two typos, but 8015 * catches a single one 8016 * 8017 * In the first pass, 'possible_end' starts as NULL, and then gets set to 8018 * point to the first key character. For the second pass, it starts as -1. 8019 * */ 8020 8021 name_start = p; 8022 parse_name: 8023 { 8024 bool has_blank = FALSE; 8025 bool has_upper = FALSE; 8026 bool has_terminating_colon = FALSE; 8027 bool has_terminating_bracket = FALSE; 8028 bool has_semi_colon = FALSE; 8029 unsigned int name_len = 0; 8030 int punct_count = 0; 8031 8032 while (p < e) { 8033 8034 /* Squeeze out blanks when looking up the class name below */ 8035 if (isBLANK(*p) ) { 8036 has_blank = TRUE; 8037 found_problem = TRUE; 8038 p++; 8039 continue; 8040 } 8041 8042 /* The name will end with a punctuation */ 8043 if (isPUNCT(*p)) { 8044 const char * peek = p + 1; 8045 8046 /* Treat any non-']' punctuation followed by a ']' (possibly 8047 * with intervening blanks) as trying to terminate the class. 8048 * ']]' is very likely to mean a class was intended (but 8049 * missing the colon), but the warning message that gets 8050 * generated shows the error position better if we exit the 8051 * loop at the bottom (eventually), so skip it here. */ 8052 if (*p != ']') { 8053 if (peek < e && isBLANK(*peek)) { 8054 has_blank = TRUE; 8055 found_problem = TRUE; 8056 do { 8057 peek++; 8058 } while (peek < e && isBLANK(*peek)); 8059 } 8060 8061 if (peek < e && *peek == ']') { 8062 has_terminating_bracket = TRUE; 8063 if (*p == ':') { 8064 has_terminating_colon = TRUE; 8065 } 8066 else if (*p == ';') { 8067 has_semi_colon = TRUE; 8068 has_terminating_colon = TRUE; 8069 } 8070 else { 8071 found_problem = TRUE; 8072 } 8073 p = peek + 1; 8074 goto try_posix; 8075 } 8076 } 8077 8078 /* Here we have punctuation we thought didn't end the class. 8079 * Keep track of the position of the key characters that are 8080 * more likely to have been class-enders */ 8081 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') { 8082 8083 /* Allow just one such possible class-ender not actually 8084 * ending the class. */ 8085 if (possible_end) { 8086 break; 8087 } 8088 possible_end = p; 8089 } 8090 8091 /* If we have too many punctuation characters, no use in 8092 * keeping going */ 8093 if (++punct_count > max_distance) { 8094 break; 8095 } 8096 8097 /* Treat the punctuation as a typo. */ 8098 input_text[name_len++] = *p; 8099 p++; 8100 } 8101 else if (isUPPER(*p)) { /* Use lowercase for lookup */ 8102 input_text[name_len++] = toLOWER(*p); 8103 has_upper = TRUE; 8104 found_problem = TRUE; 8105 p++; 8106 } else if (! UTF || UTF8_IS_INVARIANT(*p)) { 8107 input_text[name_len++] = *p; 8108 p++; 8109 } 8110 else { 8111 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL); 8112 p+= UTF8SKIP(p); 8113 } 8114 8115 /* The declaration of 'input_text' is how long we allow a potential 8116 * class name to be, before saying they didn't mean a class name at 8117 * all */ 8118 if (name_len >= C_ARRAY_LENGTH(input_text)) { 8119 break; 8120 } 8121 } 8122 8123 /* We get to here when the possible class name hasn't been properly 8124 * terminated before: 8125 * 1) we ran off the end of the pattern; or 8126 * 2) found two characters, each of which might have been intended to 8127 * be the name's terminator 8128 * 3) found so many punctuation characters in the purported name, 8129 * that the edit distance to a valid one is exceeded 8130 * 4) we decided it was more characters than anyone could have 8131 * intended to be one. */ 8132 8133 found_problem = TRUE; 8134 8135 /* In the final two cases, we know that looking up what we've 8136 * accumulated won't lead to a match, even a fuzzy one. */ 8137 if ( name_len >= C_ARRAY_LENGTH(input_text) 8138 || punct_count > max_distance) 8139 { 8140 /* If there was an intermediate key character that could have been 8141 * an intended end, redo the parse, but stop there */ 8142 if (possible_end && possible_end != (char *) -1) { 8143 possible_end = (char *) -1; /* Special signal value to say 8144 we've done a first pass */ 8145 p = name_start; 8146 goto parse_name; 8147 } 8148 8149 /* Otherwise, it can't have meant to have been a class */ 8150 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS); 8151 } 8152 8153 /* If we ran off the end, and the final character was a punctuation 8154 * one, back up one, to look at that final one just below. Later, we 8155 * will restore the parse pointer if appropriate */ 8156 if (name_len && p == e && isPUNCT(*(p-1))) { 8157 p--; 8158 name_len--; 8159 } 8160 8161 if (p < e && isPUNCT(*p)) { 8162 if (*p == ']') { 8163 has_terminating_bracket = TRUE; 8164 8165 /* If this is a 2nd ']', and the first one is just below this 8166 * one, consider that to be the real terminator. This gives a 8167 * uniform and better positioning for the warning message */ 8168 if ( possible_end 8169 && possible_end != (char *) -1 8170 && *possible_end == ']' 8171 && name_len && input_text[name_len - 1] == ']') 8172 { 8173 name_len--; 8174 p = possible_end; 8175 8176 /* And this is actually equivalent to having done the 2nd 8177 * pass now, so set it to not try again */ 8178 possible_end = (char *) -1; 8179 } 8180 } 8181 else { 8182 if (*p == ':') { 8183 has_terminating_colon = TRUE; 8184 } 8185 else if (*p == ';') { 8186 has_semi_colon = TRUE; 8187 has_terminating_colon = TRUE; 8188 } 8189 p++; 8190 } 8191 } 8192 8193 try_posix: 8194 8195 /* Here, we have a class name to look up. We can short circuit the 8196 * stuff below for short names that can't possibly be meant to be a 8197 * class name. (We can do this on the first pass, as any second pass 8198 * will yield an even shorter name) */ 8199 if (name_len < 3) { 8200 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS); 8201 } 8202 8203 /* Find which class it is. Initially switch on the length of the name. 8204 * */ 8205 switch (name_len) { 8206 case 4: 8207 if (memEQs(name_start, 4, "word")) { 8208 /* this is not POSIX, this is the Perl \w */ 8209 class_number = ANYOF_WORDCHAR; 8210 } 8211 break; 8212 case 5: 8213 /* Names all of length 5: alnum alpha ascii blank cntrl digit 8214 * graph lower print punct space upper 8215 * Offset 4 gives the best switch position. */ 8216 switch (name_start[4]) { 8217 case 'a': 8218 if (memBEGINs(name_start, 5, "alph")) /* alpha */ 8219 class_number = ANYOF_ALPHA; 8220 break; 8221 case 'e': 8222 if (memBEGINs(name_start, 5, "spac")) /* space */ 8223 class_number = ANYOF_SPACE; 8224 break; 8225 case 'h': 8226 if (memBEGINs(name_start, 5, "grap")) /* graph */ 8227 class_number = ANYOF_GRAPH; 8228 break; 8229 case 'i': 8230 if (memBEGINs(name_start, 5, "asci")) /* ascii */ 8231 class_number = ANYOF_ASCII; 8232 break; 8233 case 'k': 8234 if (memBEGINs(name_start, 5, "blan")) /* blank */ 8235 class_number = ANYOF_BLANK; 8236 break; 8237 case 'l': 8238 if (memBEGINs(name_start, 5, "cntr")) /* cntrl */ 8239 class_number = ANYOF_CNTRL; 8240 break; 8241 case 'm': 8242 if (memBEGINs(name_start, 5, "alnu")) /* alnum */ 8243 class_number = ANYOF_ALPHANUMERIC; 8244 break; 8245 case 'r': 8246 if (memBEGINs(name_start, 5, "lowe")) /* lower */ 8247 class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER; 8248 else if (memBEGINs(name_start, 5, "uppe")) /* upper */ 8249 class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER; 8250 break; 8251 case 't': 8252 if (memBEGINs(name_start, 5, "digi")) /* digit */ 8253 class_number = ANYOF_DIGIT; 8254 else if (memBEGINs(name_start, 5, "prin")) /* print */ 8255 class_number = ANYOF_PRINT; 8256 else if (memBEGINs(name_start, 5, "punc")) /* punct */ 8257 class_number = ANYOF_PUNCT; 8258 break; 8259 } 8260 break; 8261 case 6: 8262 if (memEQs(name_start, 6, "xdigit")) 8263 class_number = ANYOF_XDIGIT; 8264 break; 8265 } 8266 8267 /* If the name exactly matches a posix class name the class number will 8268 * here be set to it, and the input almost certainly was meant to be a 8269 * posix class, so we can skip further checking. If instead the syntax 8270 * is exactly correct, but the name isn't one of the legal ones, we 8271 * will return that as an error below. But if neither of these apply, 8272 * it could be that no posix class was intended at all, or that one 8273 * was, but there was a typo. We tease these apart by doing fuzzy 8274 * matching on the name */ 8275 if (class_number == OOB_NAMEDCLASS && found_problem) { 8276 const UV posix_names[][6] = { 8277 { 'a', 'l', 'n', 'u', 'm' }, 8278 { 'a', 'l', 'p', 'h', 'a' }, 8279 { 'a', 's', 'c', 'i', 'i' }, 8280 { 'b', 'l', 'a', 'n', 'k' }, 8281 { 'c', 'n', 't', 'r', 'l' }, 8282 { 'd', 'i', 'g', 'i', 't' }, 8283 { 'g', 'r', 'a', 'p', 'h' }, 8284 { 'l', 'o', 'w', 'e', 'r' }, 8285 { 'p', 'r', 'i', 'n', 't' }, 8286 { 'p', 'u', 'n', 'c', 't' }, 8287 { 's', 'p', 'a', 'c', 'e' }, 8288 { 'u', 'p', 'p', 'e', 'r' }, 8289 { 'w', 'o', 'r', 'd' }, 8290 { 'x', 'd', 'i', 'g', 'i', 't' } 8291 }; 8292 /* The names of the above all have added NULs to make them the same 8293 * size, so we need to also have the real lengths */ 8294 const UV posix_name_lengths[] = { 8295 sizeof("alnum") - 1, 8296 sizeof("alpha") - 1, 8297 sizeof("ascii") - 1, 8298 sizeof("blank") - 1, 8299 sizeof("cntrl") - 1, 8300 sizeof("digit") - 1, 8301 sizeof("graph") - 1, 8302 sizeof("lower") - 1, 8303 sizeof("print") - 1, 8304 sizeof("punct") - 1, 8305 sizeof("space") - 1, 8306 sizeof("upper") - 1, 8307 sizeof("word") - 1, 8308 sizeof("xdigit")- 1 8309 }; 8310 unsigned int i; 8311 int temp_max = max_distance; /* Use a temporary, so if we 8312 reparse, we haven't changed the 8313 outer one */ 8314 8315 /* Use a smaller max edit distance if we are missing one of the 8316 * delimiters */ 8317 if ( has_opening_bracket + has_opening_colon < 2 8318 || has_terminating_bracket + has_terminating_colon < 2) 8319 { 8320 temp_max--; 8321 } 8322 8323 /* See if the input name is close to a legal one */ 8324 for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) { 8325 8326 /* Short circuit call if the lengths are too far apart to be 8327 * able to match */ 8328 if (abs( (int) (name_len - posix_name_lengths[i])) 8329 > temp_max) 8330 { 8331 continue; 8332 } 8333 8334 if (edit_distance(input_text, 8335 posix_names[i], 8336 name_len, 8337 posix_name_lengths[i], 8338 temp_max 8339 ) 8340 > -1) 8341 { /* If it is close, it probably was intended to be a class */ 8342 goto probably_meant_to_be; 8343 } 8344 } 8345 8346 /* Here the input name is not close enough to a valid class name 8347 * for us to consider it to be intended to be a posix class. If 8348 * we haven't already done so, and the parse found a character that 8349 * could have been terminators for the name, but which we absorbed 8350 * as typos during the first pass, repeat the parse, signalling it 8351 * to stop at that character */ 8352 if (possible_end && possible_end != (char *) -1) { 8353 possible_end = (char *) -1; 8354 p = name_start; 8355 goto parse_name; 8356 } 8357 8358 /* Here neither pass found a close-enough class name */ 8359 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS); 8360 } 8361 8362 probably_meant_to_be: 8363 8364 /* Here we think that a posix specification was intended. Update any 8365 * parse pointer */ 8366 if (updated_parse_ptr) { 8367 *updated_parse_ptr = (char *) p; 8368 } 8369 8370 /* If a posix class name was intended but incorrectly specified, we 8371 * output or return the warnings */ 8372 if (found_problem) { 8373 8374 /* We set flags for these issues in the parse loop above instead of 8375 * adding them to the list of warnings, because we can parse it 8376 * twice, and we only want one warning instance */ 8377 if (has_upper) { 8378 ADD_POSIX_WARNING(p, "the name must be all lowercase letters"); 8379 } 8380 if (has_blank) { 8381 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING); 8382 } 8383 if (has_semi_colon) { 8384 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING); 8385 } 8386 else if (! has_terminating_colon) { 8387 ADD_POSIX_WARNING(p, "there is no terminating ':'"); 8388 } 8389 if (! has_terminating_bracket) { 8390 ADD_POSIX_WARNING(p, "there is no terminating ']'"); 8391 } 8392 8393 if ( posix_warnings 8394 && RExC_warn_text 8395 && av_count(RExC_warn_text) > 0) 8396 { 8397 *posix_warnings = RExC_warn_text; 8398 } 8399 } 8400 else if (class_number != OOB_NAMEDCLASS) { 8401 /* If it is a known class, return the class. The class number 8402 * #defines are structured so each complement is +1 to the normal 8403 * one */ 8404 CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement); 8405 } 8406 else if (! check_only) { 8407 8408 /* Here, it is an unrecognized class. This is an error (unless the 8409 * call is to check only, which we've already handled above) */ 8410 const char * const complement_string = (complement) 8411 ? "^" 8412 : ""; 8413 RExC_parse_set((char *) p); 8414 vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown", 8415 complement_string, 8416 UTF8fARG(UTF, RExC_parse - name_start - 2, name_start)); 8417 } 8418 } 8419 8420 return OOB_NAMEDCLASS; 8421 } 8422 #undef ADD_POSIX_WARNING 8423 8424 STATIC unsigned int 8425 S_regex_set_precedence(const U8 my_operator) { 8426 8427 /* Returns the precedence in the (?[...]) construct of the input operator, 8428 * specified by its character representation. The precedence follows 8429 * general Perl rules, but it extends this so that ')' and ']' have (low) 8430 * precedence even though they aren't really operators */ 8431 8432 switch (my_operator) { 8433 case '!': 8434 return 5; 8435 case '&': 8436 return 4; 8437 case '^': 8438 case '|': 8439 case '+': 8440 case '-': 8441 return 3; 8442 case ')': 8443 return 2; 8444 case ']': 8445 return 1; 8446 } 8447 8448 NOT_REACHED; /* NOTREACHED */ 8449 return 0; /* Silence compiler warning */ 8450 } 8451 8452 STATIC regnode_offset 8453 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, 8454 I32 *flagp, U32 depth) 8455 { 8456 /* Handle the (?[...]) construct to do set operations */ 8457 8458 U8 curchar; /* Current character being parsed */ 8459 UV start, end; /* End points of code point ranges */ 8460 SV* final = NULL; /* The end result inversion list */ 8461 SV* result_string; /* 'final' stringified */ 8462 AV* stack; /* stack of operators and operands not yet 8463 resolved */ 8464 AV* fence_stack = NULL; /* A stack containing the positions in 8465 'stack' of where the undealt-with left 8466 parens would be if they were actually 8467 put there */ 8468 /* The 'volatile' is a workaround for an optimiser bug 8469 * in Solaris Studio 12.3. See RT #127455 */ 8470 volatile IV fence = 0; /* Position of where most recent undealt- 8471 with left paren in stack is; -1 if none. 8472 */ 8473 STRLEN len; /* Temporary */ 8474 regnode_offset node; /* Temporary, and final regnode returned by 8475 this function */ 8476 const bool save_fold = FOLD; /* Temporary */ 8477 char *save_end, *save_parse; /* Temporaries */ 8478 const bool in_locale = LOC; /* we turn off /l during processing */ 8479 8480 DECLARE_AND_GET_RE_DEBUG_FLAGS; 8481 8482 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS; 8483 8484 DEBUG_PARSE("xcls"); 8485 8486 if (in_locale) { 8487 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); 8488 } 8489 8490 /* The use of this operator implies /u. This is required so that the 8491 * compile time values are valid in all runtime cases */ 8492 REQUIRE_UNI_RULES(flagp, 0); 8493 8494 /* Everything in this construct is a metacharacter. Operands begin with 8495 * either a '\' (for an escape sequence), or a '[' for a bracketed 8496 * character class. Any other character should be an operator, or 8497 * parenthesis for grouping. Both types of operands are handled by calling 8498 * regclass() to parse them. It is called with a parameter to indicate to 8499 * return the computed inversion list. The parsing here is implemented via 8500 * a stack. Each entry on the stack is a single character representing one 8501 * of the operators; or else a pointer to an operand inversion list. */ 8502 8503 #define IS_OPERATOR(a) SvIOK(a) 8504 #define IS_OPERAND(a) (! IS_OPERATOR(a)) 8505 8506 /* The stack is kept in Łukasiewicz order. (That's pronounced similar 8507 * to luke-a-shave-itch (or -itz), but people who didn't want to bother 8508 * with pronouncing it called it Reverse Polish instead, but now that YOU 8509 * know how to pronounce it you can use the correct term, thus giving due 8510 * credit to the person who invented it, and impressing your geek friends. 8511 * Wikipedia says that the pronunciation of "Ł" has been changing so that 8512 * it is now more like an English initial W (as in wonk) than an L.) 8513 * 8514 * This means that, for example, 'a | b & c' is stored on the stack as 8515 * 8516 * c [4] 8517 * b [3] 8518 * & [2] 8519 * a [1] 8520 * | [0] 8521 * 8522 * where the numbers in brackets give the stack [array] element number. 8523 * In this implementation, parentheses are not stored on the stack. 8524 * Instead a '(' creates a "fence" so that the part of the stack below the 8525 * fence is invisible except to the corresponding ')' (this allows us to 8526 * replace testing for parens, by using instead subtraction of the fence 8527 * position). As new operands are processed they are pushed onto the stack 8528 * (except as noted in the next paragraph). New operators of higher 8529 * precedence than the current final one are inserted on the stack before 8530 * the lhs operand (so that when the rhs is pushed next, everything will be 8531 * in the correct positions shown above. When an operator of equal or 8532 * lower precedence is encountered in parsing, all the stacked operations 8533 * of equal or higher precedence are evaluated, leaving the result as the 8534 * top entry on the stack. This makes higher precedence operations 8535 * evaluate before lower precedence ones, and causes operations of equal 8536 * precedence to left associate. 8537 * 8538 * The only unary operator '!' is immediately pushed onto the stack when 8539 * encountered. When an operand is encountered, if the top of the stack is 8540 * a '!", the complement is immediately performed, and the '!' popped. The 8541 * resulting value is treated as a new operand, and the logic in the 8542 * previous paragraph is executed. Thus in the expression 8543 * [a] + ! [b] 8544 * the stack looks like 8545 * 8546 * ! 8547 * a 8548 * + 8549 * 8550 * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack 8551 * becomes 8552 * 8553 * !b 8554 * a 8555 * + 8556 * 8557 * A ')' is treated as an operator with lower precedence than all the 8558 * aforementioned ones, which causes all operations on the stack above the 8559 * corresponding '(' to be evaluated down to a single resultant operand. 8560 * Then the fence for the '(' is removed, and the operand goes through the 8561 * algorithm above, without the fence. 8562 * 8563 * A separate stack is kept of the fence positions, so that the position of 8564 * the latest so-far unbalanced '(' is at the top of it. 8565 * 8566 * The ']' ending the construct is treated as the lowest operator of all, 8567 * so that everything gets evaluated down to a single operand, which is the 8568 * result */ 8569 8570 stack = (AV*)newSV_type_mortal(SVt_PVAV); 8571 fence_stack = (AV*)newSV_type_mortal(SVt_PVAV); 8572 8573 while (RExC_parse < RExC_end) { 8574 I32 top_index; /* Index of top-most element in 'stack' */ 8575 SV** top_ptr; /* Pointer to top 'stack' element */ 8576 SV* current = NULL; /* To contain the current inversion list 8577 operand */ 8578 SV* only_to_avoid_leaks; 8579 8580 skip_to_be_ignored_text(pRExC_state, &RExC_parse, 8581 TRUE /* Force /x */ ); 8582 if (RExC_parse >= RExC_end) { /* Fail */ 8583 break; 8584 } 8585 8586 curchar = UCHARAT(RExC_parse); 8587 8588 redo_curchar: 8589 8590 #ifdef ENABLE_REGEX_SETS_DEBUGGING 8591 /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */ 8592 DEBUG_U(dump_regex_sets_structures(pRExC_state, 8593 stack, fence, fence_stack)); 8594 #endif 8595 8596 top_index = av_tindex_skip_len_mg(stack); 8597 8598 switch (curchar) { 8599 SV** stacked_ptr; /* Ptr to something already on 'stack' */ 8600 char stacked_operator; /* The topmost operator on the 'stack'. */ 8601 SV* lhs; /* Operand to the left of the operator */ 8602 SV* rhs; /* Operand to the right of the operator */ 8603 SV* fence_ptr; /* Pointer to top element of the fence 8604 stack */ 8605 case '(': 8606 8607 if ( RExC_parse < RExC_end - 2 8608 && UCHARAT(RExC_parse + 1) == '?' 8609 && strchr("^" STD_PAT_MODS, *(RExC_parse + 2))) 8610 { 8611 const regnode_offset orig_emit = RExC_emit; 8612 SV * resultant_invlist; 8613 8614 /* Here it could be an embedded '(?flags:(?[...])'. 8615 * This happens when we have some thing like 8616 * 8617 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/; 8618 * ... 8619 * qr/(?[ \p{Digit} & $thai_or_lao ])/; 8620 * 8621 * Here we would be handling the interpolated 8622 * '$thai_or_lao'. We handle this by a recursive call to 8623 * reg which returns the inversion list the 8624 * interpolated expression evaluates to. Actually, the 8625 * return is a special regnode containing a pointer to that 8626 * inversion list. If the return isn't that regnode alone, 8627 * we know that this wasn't such an interpolation, which is 8628 * an error: we need to get a single inversion list back 8629 * from the recursion */ 8630 8631 RExC_parse_inc_by(1); 8632 RExC_sets_depth++; 8633 8634 node = reg(pRExC_state, 2, flagp, depth+1); 8635 RETURN_FAIL_ON_RESTART(*flagp, flagp); 8636 8637 if ( OP(REGNODE_p(node)) != REGEX_SET 8638 /* If more than a single node returned, the nested 8639 * parens evaluated to more than just a (?[...]), 8640 * which isn't legal */ 8641 || RExC_emit != orig_emit 8642 + NODE_STEP_REGNODE 8643 + REGNODE_ARG_LEN(REGEX_SET)) 8644 { 8645 vFAIL("Expecting interpolated extended charclass"); 8646 } 8647 resultant_invlist = (SV *) ARGp(REGNODE_p(node)); 8648 current = invlist_clone(resultant_invlist, NULL); 8649 SvREFCNT_dec(resultant_invlist); 8650 8651 RExC_sets_depth--; 8652 RExC_emit = orig_emit; 8653 goto handle_operand; 8654 } 8655 8656 /* A regular '('. Look behind for illegal syntax */ 8657 if (top_index - fence >= 0) { 8658 /* If the top entry on the stack is an operator, it had 8659 * better be a '!', otherwise the entry below the top 8660 * operand should be an operator */ 8661 if ( ! (top_ptr = av_fetch(stack, top_index, FALSE)) 8662 || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!') 8663 || ( IS_OPERAND(*top_ptr) 8664 && ( top_index - fence < 1 8665 || ! (stacked_ptr = av_fetch(stack, 8666 top_index - 1, 8667 FALSE)) 8668 || ! IS_OPERATOR(*stacked_ptr)))) 8669 { 8670 RExC_parse_inc_by(1); 8671 vFAIL("Unexpected '(' with no preceding operator"); 8672 } 8673 } 8674 8675 /* Stack the position of this undealt-with left paren */ 8676 av_push_simple(fence_stack, newSViv(fence)); 8677 fence = top_index + 1; 8678 break; 8679 8680 case '\\': 8681 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if 8682 * multi-char folds are allowed. */ 8683 if (!regclass(pRExC_state, flagp, depth+1, 8684 TRUE, /* means parse just the next thing */ 8685 FALSE, /* don't allow multi-char folds */ 8686 FALSE, /* don't silence non-portable warnings. */ 8687 TRUE, /* strict */ 8688 FALSE, /* Require return to be an ANYOF */ 8689 ¤t)) 8690 { 8691 RETURN_FAIL_ON_RESTART(*flagp, flagp); 8692 goto regclass_failed; 8693 } 8694 8695 assert(current); 8696 8697 /* regclass() will return with parsing just the \ sequence, 8698 * leaving the parse pointer at the next thing to parse */ 8699 RExC_parse--; 8700 goto handle_operand; 8701 8702 case '[': /* Is a bracketed character class */ 8703 { 8704 /* See if this is a [:posix:] class. */ 8705 bool is_posix_class = (OOB_NAMEDCLASS 8706 < handle_possible_posix(pRExC_state, 8707 RExC_parse + 1, 8708 NULL, 8709 NULL, 8710 TRUE /* checking only */)); 8711 /* If it is a posix class, leave the parse pointer at the '[' 8712 * to fool regclass() into thinking it is part of a 8713 * '[[:posix:]]'. */ 8714 if (! is_posix_class) { 8715 RExC_parse_inc_by(1); 8716 } 8717 8718 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if 8719 * multi-char folds are allowed. */ 8720 if (!regclass(pRExC_state, flagp, depth+1, 8721 is_posix_class, /* parse the whole char 8722 class only if not a 8723 posix class */ 8724 FALSE, /* don't allow multi-char folds */ 8725 TRUE, /* silence non-portable warnings. */ 8726 TRUE, /* strict */ 8727 FALSE, /* Require return to be an ANYOF */ 8728 ¤t)) 8729 { 8730 RETURN_FAIL_ON_RESTART(*flagp, flagp); 8731 goto regclass_failed; 8732 } 8733 8734 assert(current); 8735 8736 /* function call leaves parse pointing to the ']', except if we 8737 * faked it */ 8738 if (is_posix_class) { 8739 RExC_parse--; 8740 } 8741 8742 goto handle_operand; 8743 } 8744 8745 case ']': 8746 if (top_index >= 1) { 8747 goto join_operators; 8748 } 8749 8750 /* Only a single operand on the stack: are done */ 8751 goto done; 8752 8753 case ')': 8754 if (av_tindex_skip_len_mg(fence_stack) < 0) { 8755 if (UCHARAT(RExC_parse - 1) == ']') { 8756 break; 8757 } 8758 RExC_parse_inc_by(1); 8759 vFAIL("Unexpected ')'"); 8760 } 8761 8762 /* If nothing after the fence, is missing an operand */ 8763 if (top_index - fence < 0) { 8764 RExC_parse_inc_by(1); 8765 goto bad_syntax; 8766 } 8767 /* If at least two things on the stack, treat this as an 8768 * operator */ 8769 if (top_index - fence >= 1) { 8770 goto join_operators; 8771 } 8772 8773 /* Here only a single thing on the fenced stack, and there is a 8774 * fence. Get rid of it */ 8775 fence_ptr = av_pop(fence_stack); 8776 assert(fence_ptr); 8777 fence = SvIV(fence_ptr); 8778 SvREFCNT_dec_NN(fence_ptr); 8779 fence_ptr = NULL; 8780 8781 if (fence < 0) { 8782 fence = 0; 8783 } 8784 8785 /* Having gotten rid of the fence, we pop the operand at the 8786 * stack top and process it as a newly encountered operand */ 8787 current = av_pop(stack); 8788 if (IS_OPERAND(current)) { 8789 goto handle_operand; 8790 } 8791 8792 RExC_parse_inc_by(1); 8793 goto bad_syntax; 8794 8795 case '&': 8796 case '|': 8797 case '+': 8798 case '-': 8799 case '^': 8800 8801 /* These binary operators should have a left operand already 8802 * parsed */ 8803 if ( top_index - fence < 0 8804 || top_index - fence == 1 8805 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE))) 8806 || ! IS_OPERAND(*top_ptr)) 8807 { 8808 goto unexpected_binary; 8809 } 8810 8811 /* If only the one operand is on the part of the stack visible 8812 * to us, we just place this operator in the proper position */ 8813 if (top_index - fence < 2) { 8814 8815 /* Place the operator before the operand */ 8816 8817 SV* lhs = av_pop(stack); 8818 av_push_simple(stack, newSVuv(curchar)); 8819 av_push_simple(stack, lhs); 8820 break; 8821 } 8822 8823 /* But if there is something else on the stack, we need to 8824 * process it before this new operator if and only if the 8825 * stacked operation has equal or higher precedence than the 8826 * new one */ 8827 8828 join_operators: 8829 8830 /* The operator on the stack is supposed to be below both its 8831 * operands */ 8832 if ( ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE)) 8833 || IS_OPERAND(*stacked_ptr)) 8834 { 8835 /* But if not, it's legal and indicates we are completely 8836 * done if and only if we're currently processing a ']', 8837 * which should be the final thing in the expression */ 8838 if (curchar == ']') { 8839 goto done; 8840 } 8841 8842 unexpected_binary: 8843 RExC_parse_inc_by(1); 8844 vFAIL2("Unexpected binary operator '%c' with no " 8845 "preceding operand", curchar); 8846 } 8847 stacked_operator = (char) SvUV(*stacked_ptr); 8848 8849 if (regex_set_precedence(curchar) 8850 > regex_set_precedence(stacked_operator)) 8851 { 8852 /* Here, the new operator has higher precedence than the 8853 * stacked one. This means we need to add the new one to 8854 * the stack to await its rhs operand (and maybe more 8855 * stuff). We put it before the lhs operand, leaving 8856 * untouched the stacked operator and everything below it 8857 * */ 8858 lhs = av_pop(stack); 8859 assert(IS_OPERAND(lhs)); 8860 av_push_simple(stack, newSVuv(curchar)); 8861 av_push_simple(stack, lhs); 8862 break; 8863 } 8864 8865 /* Here, the new operator has equal or lower precedence than 8866 * what's already there. This means the operation already 8867 * there should be performed now, before the new one. */ 8868 8869 rhs = av_pop(stack); 8870 if (! IS_OPERAND(rhs)) { 8871 8872 /* This can happen when a ! is not followed by an operand, 8873 * like in /(?[\t &!])/ */ 8874 goto bad_syntax; 8875 } 8876 8877 lhs = av_pop(stack); 8878 8879 if (! IS_OPERAND(lhs)) { 8880 8881 /* This can happen when there is an empty (), like in 8882 * /(?[[0]+()+])/ */ 8883 goto bad_syntax; 8884 } 8885 8886 switch (stacked_operator) { 8887 case '&': 8888 _invlist_intersection(lhs, rhs, &rhs); 8889 break; 8890 8891 case '|': 8892 case '+': 8893 _invlist_union(lhs, rhs, &rhs); 8894 break; 8895 8896 case '-': 8897 _invlist_subtract(lhs, rhs, &rhs); 8898 break; 8899 8900 case '^': /* The union minus the intersection */ 8901 { 8902 SV* i = NULL; 8903 SV* u = NULL; 8904 8905 _invlist_union(lhs, rhs, &u); 8906 _invlist_intersection(lhs, rhs, &i); 8907 _invlist_subtract(u, i, &rhs); 8908 SvREFCNT_dec_NN(i); 8909 SvREFCNT_dec_NN(u); 8910 break; 8911 } 8912 } 8913 SvREFCNT_dec(lhs); 8914 8915 /* Here, the higher precedence operation has been done, and the 8916 * result is in 'rhs'. We overwrite the stacked operator with 8917 * the result. Then we redo this code to either push the new 8918 * operator onto the stack or perform any higher precedence 8919 * stacked operation */ 8920 only_to_avoid_leaks = av_pop(stack); 8921 SvREFCNT_dec(only_to_avoid_leaks); 8922 av_push_simple(stack, rhs); 8923 goto redo_curchar; 8924 8925 case '!': /* Highest priority, right associative */ 8926 8927 /* If what's already at the top of the stack is another '!", 8928 * they just cancel each other out */ 8929 if ( (top_ptr = av_fetch(stack, top_index, FALSE)) 8930 && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!')) 8931 { 8932 only_to_avoid_leaks = av_pop(stack); 8933 SvREFCNT_dec(only_to_avoid_leaks); 8934 } 8935 else { /* Otherwise, since it's right associative, just push 8936 onto the stack */ 8937 av_push_simple(stack, newSVuv(curchar)); 8938 } 8939 break; 8940 8941 default: 8942 RExC_parse_inc(); 8943 if (RExC_parse >= RExC_end) { 8944 break; 8945 } 8946 vFAIL("Unexpected character"); 8947 8948 handle_operand: 8949 8950 /* Here 'current' is the operand. If something is already on the 8951 * stack, we have to check if it is a !. But first, the code above 8952 * may have altered the stack in the time since we earlier set 8953 * 'top_index'. */ 8954 8955 top_index = av_tindex_skip_len_mg(stack); 8956 if (top_index - fence >= 0) { 8957 /* If the top entry on the stack is an operator, it had better 8958 * be a '!', otherwise the entry below the top operand should 8959 * be an operator */ 8960 top_ptr = av_fetch(stack, top_index, FALSE); 8961 assert(top_ptr); 8962 if (IS_OPERATOR(*top_ptr)) { 8963 8964 /* The only permissible operator at the top of the stack is 8965 * '!', which is applied immediately to this operand. */ 8966 curchar = (char) SvUV(*top_ptr); 8967 if (curchar != '!') { 8968 SvREFCNT_dec(current); 8969 vFAIL2("Unexpected binary operator '%c' with no " 8970 "preceding operand", curchar); 8971 } 8972 8973 _invlist_invert(current); 8974 8975 only_to_avoid_leaks = av_pop(stack); 8976 SvREFCNT_dec(only_to_avoid_leaks); 8977 8978 /* And we redo with the inverted operand. This allows 8979 * handling multiple ! in a row */ 8980 goto handle_operand; 8981 } 8982 /* Single operand is ok only for the non-binary ')' 8983 * operator */ 8984 else if ((top_index - fence == 0 && curchar != ')') 8985 || (top_index - fence > 0 8986 && (! (stacked_ptr = av_fetch(stack, 8987 top_index - 1, 8988 FALSE)) 8989 || IS_OPERAND(*stacked_ptr)))) 8990 { 8991 SvREFCNT_dec(current); 8992 vFAIL("Operand with no preceding operator"); 8993 } 8994 } 8995 8996 /* Here there was nothing on the stack or the top element was 8997 * another operand. Just add this new one */ 8998 av_push_simple(stack, current); 8999 9000 } /* End of switch on next parse token */ 9001 9002 RExC_parse_inc(); 9003 } /* End of loop parsing through the construct */ 9004 9005 vFAIL("Syntax error in (?[...])"); 9006 9007 done: 9008 9009 if (RExC_parse >= RExC_end || RExC_parse[1] != ')') { 9010 if (RExC_parse < RExC_end) { 9011 RExC_parse_inc_by(1); 9012 } 9013 9014 vFAIL("Unexpected ']' with no following ')' in (?[..."); 9015 } 9016 9017 if (av_tindex_skip_len_mg(fence_stack) >= 0) { 9018 vFAIL("Unmatched ("); 9019 } 9020 9021 if (av_tindex_skip_len_mg(stack) < 0 /* Was empty */ 9022 || ((final = av_pop(stack)) == NULL) 9023 || ! IS_OPERAND(final) 9024 || ! is_invlist(final) 9025 || av_tindex_skip_len_mg(stack) >= 0) /* More left on stack */ 9026 { 9027 bad_syntax: 9028 SvREFCNT_dec(final); 9029 vFAIL("Incomplete expression within '(?[ ])'"); 9030 } 9031 9032 /* Here, 'final' is the resultant inversion list from evaluating the 9033 * expression. Return it if so requested */ 9034 if (return_invlist) { 9035 *return_invlist = final; 9036 return END; 9037 } 9038 9039 if (RExC_sets_depth) { /* If within a recursive call, return in a special 9040 regnode */ 9041 RExC_parse_inc_by(1); 9042 node = regpnode(pRExC_state, REGEX_SET, final); 9043 } 9044 else { 9045 9046 /* Otherwise generate a resultant node, based on 'final'. regclass() 9047 * is expecting a string of ranges and individual code points */ 9048 invlist_iterinit(final); 9049 result_string = newSVpvs(""); 9050 while (invlist_iternext(final, &start, &end)) { 9051 if (start == end) { 9052 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start); 9053 } 9054 else { 9055 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%" 9056 UVXf "}", start, end); 9057 } 9058 } 9059 9060 /* About to generate an ANYOF (or similar) node from the inversion list 9061 * we have calculated */ 9062 save_parse = RExC_parse; 9063 RExC_parse_set(SvPV(result_string, len)); 9064 save_end = RExC_end; 9065 RExC_end = RExC_parse + len; 9066 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE; 9067 9068 /* We turn off folding around the call, as the class we have 9069 * constructed already has all folding taken into consideration, and we 9070 * don't want regclass() to add to that */ 9071 RExC_flags &= ~RXf_PMf_FOLD; 9072 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if multi-char 9073 * folds are allowed. */ 9074 node = regclass(pRExC_state, flagp, depth+1, 9075 FALSE, /* means parse the whole char class */ 9076 FALSE, /* don't allow multi-char folds */ 9077 TRUE, /* silence non-portable warnings. The above may 9078 very well have generated non-portable code 9079 points, but they're valid on this machine */ 9080 FALSE, /* similarly, no need for strict */ 9081 9082 /* We can optimize into something besides an ANYOF, 9083 * except under /l, which needs to be ANYOF because of 9084 * runtime checks for locale sanity, etc */ 9085 ! in_locale, 9086 NULL 9087 ); 9088 9089 RESTORE_WARNINGS; 9090 RExC_parse_set(save_parse + 1); 9091 RExC_end = save_end; 9092 SvREFCNT_dec_NN(final); 9093 SvREFCNT_dec_NN(result_string); 9094 9095 if (save_fold) { 9096 RExC_flags |= RXf_PMf_FOLD; 9097 } 9098 9099 if (!node) { 9100 RETURN_FAIL_ON_RESTART(*flagp, flagp); 9101 goto regclass_failed; 9102 } 9103 9104 /* Fix up the node type if we are in locale. (We have pretended we are 9105 * under /u for the purposes of regclass(), as this construct will only 9106 * work under UTF-8 locales. But now we change the opcode to be ANYOFL 9107 * (so as to cause any warnings about bad locales to be output in 9108 * regexec.c), and add the flag that indicates to check if not in a 9109 * UTF-8 locale. The reason we above forbid optimization into 9110 * something other than an ANYOF node is simply to minimize the number 9111 * of code changes in regexec.c. Otherwise we would have to create new 9112 * EXACTish node types and deal with them. This decision could be 9113 * revisited should this construct become popular. 9114 * 9115 * (One might think we could look at the resulting ANYOF node and 9116 * suppress the flag if everything is above 255, as those would be 9117 * UTF-8 only, but this isn't true, as the components that led to that 9118 * result could have been locale-affected, and just happen to cancel 9119 * each other out under UTF-8 locales.) */ 9120 if (in_locale) { 9121 set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET); 9122 9123 assert(OP(REGNODE_p(node)) == ANYOF); 9124 9125 OP(REGNODE_p(node)) = ANYOFL; 9126 ANYOF_FLAGS(REGNODE_p(node)) |= ANYOFL_UTF8_LOCALE_REQD; 9127 } 9128 } 9129 9130 nextchar(pRExC_state); 9131 return node; 9132 9133 regclass_failed: 9134 FAIL2("panic: regclass returned failure to handle_sets, " "flags=%#" UVxf, 9135 (UV) *flagp); 9136 } 9137 9138 #ifdef ENABLE_REGEX_SETS_DEBUGGING 9139 9140 STATIC void 9141 S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state, 9142 AV * stack, const IV fence, AV * fence_stack) 9143 { /* Dumps the stacks in handle_regex_sets() */ 9144 9145 const SSize_t stack_top = av_tindex_skip_len_mg(stack); 9146 const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack); 9147 SSize_t i; 9148 9149 PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES; 9150 9151 PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse); 9152 9153 if (stack_top < 0) { 9154 PerlIO_printf(Perl_debug_log, "Nothing on stack\n"); 9155 } 9156 else { 9157 PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence); 9158 for (i = stack_top; i >= 0; i--) { 9159 SV ** element_ptr = av_fetch(stack, i, FALSE); 9160 if (! element_ptr) { 9161 } 9162 9163 if (IS_OPERATOR(*element_ptr)) { 9164 PerlIO_printf(Perl_debug_log, "[%d]: %c\n", 9165 (int) i, (int) SvIV(*element_ptr)); 9166 } 9167 else { 9168 PerlIO_printf(Perl_debug_log, "[%d] ", (int) i); 9169 sv_dump(*element_ptr); 9170 } 9171 } 9172 } 9173 9174 if (fence_stack_top < 0) { 9175 PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n"); 9176 } 9177 else { 9178 PerlIO_printf(Perl_debug_log, "Fence_stack: \n"); 9179 for (i = fence_stack_top; i >= 0; i--) { 9180 SV ** element_ptr = av_fetch_simple(fence_stack, i, FALSE); 9181 if (! element_ptr) { 9182 } 9183 9184 PerlIO_printf(Perl_debug_log, "[%d]: %d\n", 9185 (int) i, (int) SvIV(*element_ptr)); 9186 } 9187 } 9188 } 9189 9190 #endif 9191 9192 #undef IS_OPERATOR 9193 #undef IS_OPERAND 9194 9195 #ifdef PERL_RE_BUILD_AUX 9196 void 9197 Perl_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist) 9198 { 9199 /* This adds the Latin1/above-Latin1 folding rules. 9200 * 9201 * This should be called only for a Latin1-range code points, cp, which is 9202 * known to be involved in a simple fold with other code points above 9203 * Latin1. It would give false results if /aa has been specified. 9204 * Multi-char folds are outside the scope of this, and must be handled 9205 * specially. */ 9206 9207 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS; 9208 9209 assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp)); 9210 9211 /* The rules that are valid for all Unicode versions are hard-coded in */ 9212 switch (cp) { 9213 case 'k': 9214 case 'K': 9215 *invlist = 9216 add_cp_to_invlist(*invlist, KELVIN_SIGN); 9217 break; 9218 case 's': 9219 case 'S': 9220 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S); 9221 break; 9222 case MICRO_SIGN: 9223 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU); 9224 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU); 9225 break; 9226 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: 9227 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: 9228 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN); 9229 break; 9230 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: 9231 *invlist = add_cp_to_invlist(*invlist, 9232 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); 9233 break; 9234 9235 default: /* Other code points are checked against the data for the 9236 current Unicode version */ 9237 { 9238 Size_t folds_count; 9239 U32 first_fold; 9240 const U32 * remaining_folds; 9241 UV folded_cp; 9242 9243 if (isASCII(cp)) { 9244 folded_cp = toFOLD(cp); 9245 } 9246 else { 9247 U8 dummy_fold[UTF8_MAXBYTES_CASE+1]; 9248 Size_t dummy_len; 9249 folded_cp = _to_fold_latin1(cp, dummy_fold, &dummy_len, 0); 9250 } 9251 9252 if (folded_cp > 255) { 9253 *invlist = add_cp_to_invlist(*invlist, folded_cp); 9254 } 9255 9256 folds_count = _inverse_folds(folded_cp, &first_fold, 9257 &remaining_folds); 9258 if (folds_count == 0) { 9259 9260 /* Use deprecated warning to increase the chances of this being 9261 * output */ 9262 ckWARN2reg_d(RExC_parse, 9263 "Perl folding rules are not up-to-date for 0x%02X;" 9264 " please use the perlbug utility to report;", cp); 9265 } 9266 else { 9267 unsigned int i; 9268 9269 if (first_fold > 255) { 9270 *invlist = add_cp_to_invlist(*invlist, first_fold); 9271 } 9272 for (i = 0; i < folds_count - 1; i++) { 9273 if (remaining_folds[i] > 255) { 9274 *invlist = add_cp_to_invlist(*invlist, 9275 remaining_folds[i]); 9276 } 9277 } 9278 } 9279 break; 9280 } 9281 } 9282 } 9283 #endif /* PERL_RE_BUILD_AUX */ 9284 9285 9286 STATIC void 9287 S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings) 9288 { 9289 /* Output the elements of the array given by '*posix_warnings' as REGEXP 9290 * warnings. */ 9291 9292 SV * msg; 9293 const bool first_is_fatal = ckDEAD(packWARN(WARN_REGEXP)); 9294 9295 PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS; 9296 9297 if (! TO_OUTPUT_WARNINGS(RExC_parse)) { 9298 CLEAR_POSIX_WARNINGS(); 9299 return; 9300 } 9301 9302 while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) { 9303 if (first_is_fatal) { /* Avoid leaking this */ 9304 av_undef(posix_warnings); /* This isn't necessary if the 9305 array is mortal, but is a 9306 fail-safe */ 9307 (void) sv_2mortal(msg); 9308 } 9309 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg)); 9310 SvREFCNT_dec_NN(msg); 9311 } 9312 9313 UPDATE_WARNINGS_LOC(RExC_parse); 9314 } 9315 9316 PERL_STATIC_INLINE Size_t 9317 S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max) 9318 { 9319 const U8 * const start = s1; 9320 const U8 * const send = start + max; 9321 9322 PERL_ARGS_ASSERT_FIND_FIRST_DIFFERING_BYTE_POS; 9323 9324 while (s1 < send && *s1 == *s2) { 9325 s1++; s2++; 9326 } 9327 9328 return s1 - start; 9329 } 9330 9331 STATIC AV * 9332 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count) 9333 { 9334 /* This adds the string scalar <multi_string> to the array 9335 * <multi_char_matches>. <multi_string> is known to have exactly 9336 * <cp_count> code points in it. This is used when constructing a 9337 * bracketed character class and we find something that needs to match more 9338 * than a single character. 9339 * 9340 * <multi_char_matches> is actually an array of arrays. Each top-level 9341 * element is an array that contains all the strings known so far that are 9342 * the same length. And that length (in number of code points) is the same 9343 * as the index of the top-level array. Hence, the [2] element is an 9344 * array, each element thereof is a string containing TWO code points; 9345 * while element [3] is for strings of THREE characters, and so on. Since 9346 * this is for multi-char strings there can never be a [0] nor [1] element. 9347 * 9348 * When we rewrite the character class below, we will do so such that the 9349 * longest strings are written first, so that it prefers the longest 9350 * matching strings first. This is done even if it turns out that any 9351 * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom 9352 * Christiansen has agreed that this is ok. This makes the test for the 9353 * ligature 'ffi' come before the test for 'ff', for example */ 9354 9355 AV* this_array; 9356 AV** this_array_ptr; 9357 9358 PERL_ARGS_ASSERT_ADD_MULTI_MATCH; 9359 9360 if (! multi_char_matches) { 9361 multi_char_matches = newAV(); 9362 } 9363 9364 if (av_exists(multi_char_matches, cp_count)) { 9365 this_array_ptr = (AV**) av_fetch_simple(multi_char_matches, cp_count, FALSE); 9366 this_array = *this_array_ptr; 9367 } 9368 else { 9369 this_array = newAV(); 9370 av_store_simple(multi_char_matches, cp_count, 9371 (SV*) this_array); 9372 } 9373 av_push_simple(this_array, multi_string); 9374 9375 return multi_char_matches; 9376 } 9377 9378 /* The names of properties whose definitions are not known at compile time are 9379 * stored in this SV, after a constant heading. So if the length has been 9380 * changed since initialization, then there is a run-time definition. */ 9381 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \ 9382 (SvCUR(listsv) != initial_listsv_len) 9383 9384 /* There is a restricted set of white space characters that are legal when 9385 * ignoring white space in a bracketed character class. This generates the 9386 * code to skip them. 9387 * 9388 * There is a line below that uses the same white space criteria but is outside 9389 * this macro. Both here and there must use the same definition */ 9390 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p, stop_p) \ 9391 STMT_START { \ 9392 if (do_skip) { \ 9393 while (p < stop_p && isBLANK_A(UCHARAT(p))) \ 9394 { \ 9395 p++; \ 9396 } \ 9397 } \ 9398 } STMT_END 9399 9400 STATIC regnode_offset 9401 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, 9402 const bool stop_at_1, /* Just parse the next thing, don't 9403 look for a full character class */ 9404 bool allow_mutiple_chars, 9405 const bool silence_non_portable, /* Don't output warnings 9406 about too large 9407 characters */ 9408 const bool strict, 9409 bool optimizable, /* ? Allow a non-ANYOF return 9410 node */ 9411 SV** ret_invlist /* Return an inversion list, not a node */ 9412 ) 9413 { 9414 /* parse a bracketed class specification. Most of these will produce an 9415 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an 9416 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex 9417 * under /i with multi-character folds: it will be rewritten following the 9418 * paradigm of this example, where the <multi-fold>s are characters which 9419 * fold to multiple character sequences: 9420 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i 9421 * gets effectively rewritten as: 9422 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i 9423 * reg() gets called (recursively) on the rewritten version, and this 9424 * function will return what it constructs. (Actually the <multi-fold>s 9425 * aren't physically removed from the [abcdefghi], it's just that they are 9426 * ignored in the recursion by means of a flag: 9427 * <RExC_in_multi_char_class>.) 9428 * 9429 * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS 9430 * characters, with the corresponding bit set if that character is in the 9431 * list. For characters above this, an inversion list is used. There 9432 * are extra bits for \w, etc. in locale ANYOFs, as what these match is not 9433 * determinable at compile time 9434 * 9435 * On success, returns the offset at which any next node should be placed 9436 * into the regex engine program being compiled. 9437 * 9438 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs 9439 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to 9440 * UTF-8 9441 */ 9442 9443 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE; 9444 IV range = 0; 9445 UV value = OOB_UNICODE, save_value = OOB_UNICODE; 9446 regnode_offset ret = -1; /* Initialized to an illegal value */ 9447 STRLEN numlen; 9448 int namedclass = OOB_NAMEDCLASS; 9449 char *rangebegin = NULL; 9450 SV *listsv = NULL; /* List of \p{user-defined} whose definitions 9451 aren't available at the time this was called */ 9452 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more 9453 than just initialized. */ 9454 SV* properties = NULL; /* Code points that match \p{} \P{} */ 9455 SV* posixes = NULL; /* Code points that match classes like [:word:], 9456 extended beyond the Latin1 range. These have to 9457 be kept separate from other code points for much 9458 of this function because their handling is 9459 different under /i, and for most classes under 9460 /d as well */ 9461 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept 9462 separate for a while from the non-complemented 9463 versions because of complications with /d 9464 matching */ 9465 SV* simple_posixes = NULL; /* But under some conditions, the classes can be 9466 treated more simply than the general case, 9467 leading to less compilation and execution 9468 work */ 9469 UV element_count = 0; /* Number of distinct elements in the class. 9470 Optimizations may be possible if this is tiny */ 9471 AV * multi_char_matches = NULL; /* Code points that fold to more than one 9472 character; used under /i */ 9473 UV n; 9474 char * stop_ptr = RExC_end; /* where to stop parsing */ 9475 9476 /* ignore unescaped whitespace? */ 9477 const bool skip_white = cBOOL( ret_invlist 9478 || (RExC_flags & RXf_PMf_EXTENDED_MORE)); 9479 9480 /* inversion list of code points this node matches only when the target 9481 * string is in UTF-8. These are all non-ASCII, < 256. (Because is under 9482 * /d) */ 9483 SV* upper_latin1_only_utf8_matches = NULL; 9484 9485 /* Inversion list of code points this node matches regardless of things 9486 * like locale, folding, utf8ness of the target string */ 9487 SV* cp_list = NULL; 9488 9489 /* Like cp_list, but code points on this list need to be checked for things 9490 * that fold to/from them under /i */ 9491 SV* cp_foldable_list = NULL; 9492 9493 /* Like cp_list, but code points on this list are valid only when the 9494 * runtime locale is UTF-8 */ 9495 SV* only_utf8_locale_list = NULL; 9496 9497 /* In a range, if one of the endpoints is non-character-set portable, 9498 * meaning that it hard-codes a code point that may mean a different 9499 * character in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a 9500 * mnemonic '\t' which each mean the same character no matter which 9501 * character set the platform is on. */ 9502 unsigned int non_portable_endpoint = 0; 9503 9504 /* Is the range unicode? which means on a platform that isn't 1-1 native 9505 * to Unicode (i.e. non-ASCII), each code point in it should be considered 9506 * to be a Unicode value. */ 9507 bool unicode_range = FALSE; 9508 bool invert = FALSE; /* Is this class to be complemented */ 9509 9510 bool warn_super = ALWAYS_WARN_SUPER; 9511 9512 const char * orig_parse = RExC_parse; 9513 9514 /* This variable is used to mark where the end in the input is of something 9515 * that looks like a POSIX construct but isn't. During the parse, when 9516 * something looks like it could be such a construct is encountered, it is 9517 * checked for being one, but not if we've already checked this area of the 9518 * input. Only after this position is reached do we check again */ 9519 char *not_posix_region_end = RExC_parse - 1; 9520 9521 AV* posix_warnings = NULL; 9522 const bool do_posix_warnings = ckWARN(WARN_REGEXP); 9523 U8 op = ANYOF; /* The returned node-type, initialized to the expected 9524 type. */ 9525 U8 anyof_flags = 0; /* flag bits if the node is an ANYOF-type */ 9526 U32 posixl = 0; /* bit field of posix classes matched under /l */ 9527 9528 9529 /* Flags as to what things aren't knowable until runtime. (Note that these are 9530 * mutually exclusive.) */ 9531 #define HAS_USER_DEFINED_PROPERTY 0x01 /* /u any user-defined properties that 9532 haven't been defined as of yet */ 9533 #define HAS_D_RUNTIME_DEPENDENCY 0x02 /* /d if the target being matched is 9534 UTF-8 or not */ 9535 #define HAS_L_RUNTIME_DEPENDENCY 0x04 /* /l what the posix classes match and 9536 what gets folded */ 9537 U32 has_runtime_dependency = 0; /* OR of the above flags */ 9538 9539 DECLARE_AND_GET_RE_DEBUG_FLAGS; 9540 9541 PERL_ARGS_ASSERT_REGCLASS; 9542 #ifndef DEBUGGING 9543 PERL_UNUSED_ARG(depth); 9544 #endif 9545 9546 assert(! (ret_invlist && allow_mutiple_chars)); 9547 9548 /* If wants an inversion list returned, we can't optimize to something 9549 * else. */ 9550 if (ret_invlist) { 9551 optimizable = FALSE; 9552 } 9553 9554 DEBUG_PARSE("clas"); 9555 9556 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \ 9557 || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \ 9558 && UNICODE_DOT_DOT_VERSION == 0) 9559 allow_mutiple_chars = FALSE; 9560 #endif 9561 9562 /* We include the /i status at the beginning of this so that we can 9563 * know it at runtime */ 9564 listsv = sv_2mortal(Perl_newSVpvf(aTHX_ "#%d\n", cBOOL(FOLD))); 9565 initial_listsv_len = SvCUR(listsv); 9566 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */ 9567 9568 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end); 9569 9570 assert(RExC_parse <= RExC_end); 9571 9572 if (UCHARAT(RExC_parse) == '^') { /* Complement the class */ 9573 RExC_parse_inc_by(1); 9574 invert = TRUE; 9575 allow_mutiple_chars = FALSE; 9576 MARK_NAUGHTY(1); 9577 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end); 9578 } 9579 9580 /* Check that they didn't say [:posix:] instead of [[:posix:]] */ 9581 if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) { 9582 int maybe_class = handle_possible_posix(pRExC_state, 9583 RExC_parse, 9584 ¬_posix_region_end, 9585 NULL, 9586 TRUE /* checking only */); 9587 if (maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) { 9588 ckWARN4reg(not_posix_region_end, 9589 "POSIX syntax [%c %c] belongs inside character classes%s", 9590 *RExC_parse, *RExC_parse, 9591 (maybe_class == OOB_NAMEDCLASS) 9592 ? ((POSIXCC_NOTYET(*RExC_parse)) 9593 ? " (but this one isn't implemented)" 9594 : " (but this one isn't fully valid)") 9595 : "" 9596 ); 9597 } 9598 } 9599 9600 /* If the caller wants us to just parse a single element, accomplish this 9601 * by faking the loop ending condition */ 9602 if (stop_at_1 && RExC_end > RExC_parse) { 9603 stop_ptr = RExC_parse + 1; 9604 } 9605 9606 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */ 9607 if (UCHARAT(RExC_parse) == ']') 9608 goto charclassloop; 9609 9610 while (1) { 9611 9612 if ( posix_warnings 9613 && av_tindex_skip_len_mg(posix_warnings) >= 0 9614 && RExC_parse > not_posix_region_end) 9615 { 9616 /* Warnings about posix class issues are considered tentative until 9617 * we are far enough along in the parse that we can no longer 9618 * change our mind, at which point we output them. This is done 9619 * each time through the loop so that a later class won't zap them 9620 * before they have been dealt with. */ 9621 output_posix_warnings(pRExC_state, posix_warnings); 9622 } 9623 9624 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end); 9625 9626 if (RExC_parse >= stop_ptr) { 9627 break; 9628 } 9629 9630 if (UCHARAT(RExC_parse) == ']') { 9631 break; 9632 } 9633 9634 charclassloop: 9635 9636 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */ 9637 save_value = value; 9638 save_prevvalue = prevvalue; 9639 9640 if (!range) { 9641 rangebegin = RExC_parse; 9642 element_count++; 9643 non_portable_endpoint = 0; 9644 } 9645 if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) { 9646 value = utf8n_to_uvchr((U8*)RExC_parse, 9647 RExC_end - RExC_parse, 9648 &numlen, UTF8_ALLOW_DEFAULT); 9649 RExC_parse_inc_by(numlen); 9650 } 9651 else { 9652 value = UCHARAT(RExC_parse); 9653 RExC_parse_inc_by(1); 9654 } 9655 9656 if (value == '[') { 9657 char * posix_class_end; 9658 namedclass = handle_possible_posix(pRExC_state, 9659 RExC_parse, 9660 &posix_class_end, 9661 do_posix_warnings ? &posix_warnings : NULL, 9662 FALSE /* die if error */); 9663 if (namedclass > OOB_NAMEDCLASS) { 9664 9665 /* If there was an earlier attempt to parse this particular 9666 * posix class, and it failed, it was a false alarm, as this 9667 * successful one proves */ 9668 if ( posix_warnings 9669 && av_tindex_skip_len_mg(posix_warnings) >= 0 9670 && not_posix_region_end >= RExC_parse 9671 && not_posix_region_end <= posix_class_end) 9672 { 9673 av_undef(posix_warnings); 9674 } 9675 9676 RExC_parse_set(posix_class_end); 9677 } 9678 else if (namedclass == OOB_NAMEDCLASS) { 9679 not_posix_region_end = posix_class_end; 9680 } 9681 else { 9682 namedclass = OOB_NAMEDCLASS; 9683 } 9684 } 9685 else if ( RExC_parse - 1 > not_posix_region_end 9686 && MAYBE_POSIXCC(value)) 9687 { 9688 (void) handle_possible_posix( 9689 pRExC_state, 9690 RExC_parse - 1, /* -1 because parse has already been 9691 advanced */ 9692 ¬_posix_region_end, 9693 do_posix_warnings ? &posix_warnings : NULL, 9694 TRUE /* checking only */); 9695 } 9696 else if ( strict && ! skip_white 9697 && ( generic_isCC_(value, CC_VERTSPACE_) 9698 || is_VERTWS_cp_high(value))) 9699 { 9700 vFAIL("Literal vertical space in [] is illegal except under /x"); 9701 } 9702 else if (value == '\\') { 9703 /* Is a backslash; get the code point of the char after it */ 9704 9705 if (RExC_parse >= RExC_end) { 9706 vFAIL("Unmatched ["); 9707 } 9708 9709 if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) { 9710 value = utf8n_to_uvchr((U8*)RExC_parse, 9711 RExC_end - RExC_parse, 9712 &numlen, UTF8_ALLOW_DEFAULT); 9713 RExC_parse_inc_by(numlen); 9714 } 9715 else { 9716 value = UCHARAT(RExC_parse); 9717 RExC_parse_inc_by(1); 9718 } 9719 9720 /* Some compilers cannot handle switching on 64-bit integer 9721 * values, therefore value cannot be an UV. Yes, this will 9722 * be a problem later if we want switch on Unicode. 9723 * A similar issue a little bit later when switching on 9724 * namedclass. --jhi */ 9725 9726 /* If the \ is escaping white space when white space is being 9727 * skipped, it means that that white space is wanted literally, and 9728 * is already in 'value'. Otherwise, need to translate the escape 9729 * into what it signifies. */ 9730 if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) { 9731 const char * message; 9732 U32 packed_warn; 9733 U8 grok_c_char; 9734 9735 case 'w': namedclass = ANYOF_WORDCHAR; break; 9736 case 'W': namedclass = ANYOF_NWORDCHAR; break; 9737 case 's': namedclass = ANYOF_SPACE; break; 9738 case 'S': namedclass = ANYOF_NSPACE; break; 9739 case 'd': namedclass = ANYOF_DIGIT; break; 9740 case 'D': namedclass = ANYOF_NDIGIT; break; 9741 case 'v': namedclass = ANYOF_VERTWS; break; 9742 case 'V': namedclass = ANYOF_NVERTWS; break; 9743 case 'h': namedclass = ANYOF_HORIZWS; break; 9744 case 'H': namedclass = ANYOF_NHORIZWS; break; 9745 case 'N': /* Handle \N{NAME} in class */ 9746 { 9747 const char * const backslash_N_beg = RExC_parse - 2; 9748 int cp_count; 9749 9750 if (! grok_bslash_N(pRExC_state, 9751 NULL, /* No regnode */ 9752 &value, /* Yes single value */ 9753 &cp_count, /* Multiple code pt count */ 9754 flagp, 9755 strict, 9756 depth) 9757 ) { 9758 9759 if (*flagp & NEED_UTF8) 9760 FAIL("panic: grok_bslash_N set NEED_UTF8"); 9761 9762 RETURN_FAIL_ON_RESTART_FLAGP(flagp); 9763 9764 if (cp_count < 0) { 9765 vFAIL("\\N in a character class must be a named character: \\N{...}"); 9766 } 9767 else if (cp_count == 0) { 9768 ckWARNreg(RExC_parse, 9769 "Ignoring zero length \\N{} in character class"); 9770 } 9771 else { /* cp_count > 1 */ 9772 assert(cp_count > 1); 9773 if (! RExC_in_multi_char_class) { 9774 if ( ! allow_mutiple_chars 9775 || invert 9776 || range 9777 || *RExC_parse == '-') 9778 { 9779 if (strict) { 9780 RExC_parse--; 9781 vFAIL("\\N{} here is restricted to one character"); 9782 } 9783 ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class"); 9784 break; /* <value> contains the first code 9785 point. Drop out of the switch to 9786 process it */ 9787 } 9788 else { 9789 SV * multi_char_N = newSVpvn(backslash_N_beg, 9790 RExC_parse - backslash_N_beg); 9791 multi_char_matches 9792 = add_multi_match(multi_char_matches, 9793 multi_char_N, 9794 cp_count); 9795 } 9796 } 9797 } /* End of cp_count != 1 */ 9798 9799 /* This element should not be processed further in this 9800 * class */ 9801 element_count--; 9802 value = save_value; 9803 prevvalue = save_prevvalue; 9804 continue; /* Back to top of loop to get next char */ 9805 } 9806 9807 /* Here, is a single code point, and <value> contains it */ 9808 unicode_range = TRUE; /* \N{} are Unicode */ 9809 } 9810 break; 9811 case 'p': 9812 case 'P': 9813 { 9814 char *e; 9815 9816 if (RExC_pm_flags & PMf_WILDCARD) { 9817 RExC_parse_inc_by(1); 9818 /* diag_listed_as: Use of %s is not allowed in Unicode 9819 property wildcard subpatterns in regex; marked by <-- 9820 HERE in m/%s/ */ 9821 vFAIL3("Use of '\\%c%c' is not allowed in Unicode property" 9822 " wildcard subpatterns", (char) value, *(RExC_parse - 1)); 9823 } 9824 9825 /* \p means they want Unicode semantics */ 9826 REQUIRE_UNI_RULES(flagp, 0); 9827 9828 if (RExC_parse >= RExC_end) 9829 vFAIL2("Empty \\%c", (U8)value); 9830 if (*RExC_parse == '{') { 9831 const U8 c = (U8)value; 9832 e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse); 9833 if (!e) { 9834 RExC_parse_inc_by(1); 9835 vFAIL2("Missing right brace on \\%c{}", c); 9836 } 9837 9838 RExC_parse_inc_by(1); 9839 9840 /* White space is allowed adjacent to the braces and after 9841 * any '^', even when not under /x */ 9842 while (isSPACE(*RExC_parse)) { 9843 RExC_parse_inc_by(1); 9844 } 9845 9846 if (UCHARAT(RExC_parse) == '^') { 9847 9848 /* toggle. (The rhs xor gets the single bit that 9849 * differs between P and p; the other xor inverts just 9850 * that bit) */ 9851 value ^= 'P' ^ 'p'; 9852 9853 RExC_parse_inc_by(1); 9854 while (isSPACE(*RExC_parse)) { 9855 RExC_parse_inc_by(1); 9856 } 9857 } 9858 9859 if (e == RExC_parse) 9860 vFAIL2("Empty \\%c{}", c); 9861 9862 n = e - RExC_parse; 9863 while (isSPACE(*(RExC_parse + n - 1))) 9864 n--; 9865 9866 } /* The \p isn't immediately followed by a '{' */ 9867 else if (! isALPHA(*RExC_parse)) { 9868 RExC_parse_inc_safe(); 9869 vFAIL2("Character following \\%c must be '{' or a " 9870 "single-character Unicode property name", 9871 (U8) value); 9872 } 9873 else { 9874 e = RExC_parse; 9875 n = 1; 9876 } 9877 { 9878 char* name = RExC_parse; 9879 9880 /* Any message returned about expanding the definition */ 9881 SV* msg = newSVpvs_flags("", SVs_TEMP); 9882 9883 /* If set TRUE, the property is user-defined as opposed to 9884 * official Unicode */ 9885 bool user_defined = FALSE; 9886 AV * strings = NULL; 9887 9888 SV * prop_definition = parse_uniprop_string( 9889 name, n, UTF, FOLD, 9890 FALSE, /* This is compile-time */ 9891 9892 /* We can't defer this defn when 9893 * the full result is required in 9894 * this call */ 9895 ! cBOOL(ret_invlist), 9896 9897 &strings, 9898 &user_defined, 9899 msg, 9900 0 /* Base level */ 9901 ); 9902 if (SvCUR(msg)) { /* Assumes any error causes a msg */ 9903 assert(prop_definition == NULL); 9904 RExC_parse_set(e + 1); 9905 if (SvUTF8(msg)) { /* msg being UTF-8 makes the whole 9906 thing so, or else the display is 9907 mojibake */ 9908 RExC_utf8 = TRUE; 9909 } 9910 /* diag_listed_as: Can't find Unicode property definition "%s" in regex; marked by <-- HERE in m/%s/ */ 9911 vFAIL2utf8f("%" UTF8f, UTF8fARG(SvUTF8(msg), 9912 SvCUR(msg), SvPVX(msg))); 9913 } 9914 9915 assert(prop_definition || strings); 9916 9917 if (strings) { 9918 if (ret_invlist) { 9919 if (! prop_definition) { 9920 RExC_parse_set(e + 1); 9921 vFAIL("Unicode string properties are not implemented in (?[...])"); 9922 } 9923 else { 9924 ckWARNreg(e + 1, 9925 "Using just the single character results" 9926 " returned by \\p{} in (?[...])"); 9927 } 9928 } 9929 else if (! RExC_in_multi_char_class) { 9930 if (invert ^ (value == 'P')) { 9931 RExC_parse_set(e + 1); 9932 vFAIL("Inverting a character class which contains" 9933 " a multi-character sequence is illegal"); 9934 } 9935 9936 /* For each multi-character string ... */ 9937 while (av_count(strings) > 0) { 9938 /* ... Each entry is itself an array of code 9939 * points. */ 9940 AV * this_string = (AV *) av_shift( strings); 9941 STRLEN cp_count = av_count(this_string); 9942 SV * final = newSV(cp_count ? cp_count * 4 : 1); 9943 SvPVCLEAR_FRESH(final); 9944 9945 /* Create another string of sequences of \x{...} */ 9946 while (av_count(this_string) > 0) { 9947 SV * character = av_shift(this_string); 9948 UV cp = SvUV(character); 9949 9950 if (cp > 255) { 9951 REQUIRE_UTF8(flagp); 9952 } 9953 Perl_sv_catpvf(aTHX_ final, "\\x{%" UVXf "}", 9954 cp); 9955 SvREFCNT_dec_NN(character); 9956 } 9957 SvREFCNT_dec_NN(this_string); 9958 9959 /* And add that to the list of such things */ 9960 multi_char_matches 9961 = add_multi_match(multi_char_matches, 9962 final, 9963 cp_count); 9964 } 9965 } 9966 SvREFCNT_dec_NN(strings); 9967 } 9968 9969 if (! prop_definition) { /* If we got only a string, 9970 this iteration didn't really 9971 find a character */ 9972 element_count--; 9973 } 9974 else if (! is_invlist(prop_definition)) { 9975 9976 /* Here, the definition isn't known, so we have gotten 9977 * returned a string that will be evaluated if and when 9978 * encountered at runtime. We add it to the list of 9979 * such properties, along with whether it should be 9980 * complemented or not */ 9981 if (value == 'P') { 9982 sv_catpvs(listsv, "!"); 9983 } 9984 else { 9985 sv_catpvs(listsv, "+"); 9986 } 9987 sv_catsv(listsv, prop_definition); 9988 9989 has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY; 9990 9991 /* We don't know yet what this matches, so have to flag 9992 * it */ 9993 anyof_flags |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES; 9994 } 9995 else { 9996 assert (prop_definition && is_invlist(prop_definition)); 9997 9998 /* Here we do have the complete property definition 9999 * 10000 * Temporary workaround for [GH #16520]. For this 10001 * precise input that is in the .t that is failing, 10002 * load utf8.pm, which is what the test wants, so that 10003 * that .t passes */ 10004 if ( memEQs(RExC_start, e + 1 - RExC_start, 10005 "foo\\p{Alnum}") 10006 && ! hv_common(GvHVn(PL_incgv), 10007 NULL, 10008 "utf8.pm", sizeof("utf8.pm") - 1, 10009 0, HV_FETCH_ISEXISTS, NULL, 0)) 10010 { 10011 require_pv("utf8.pm"); 10012 } 10013 10014 if (! user_defined && 10015 /* We warn on matching an above-Unicode code point 10016 * if the match would return true, except don't 10017 * warn for \p{All}, which has exactly one element 10018 * = 0 */ 10019 (_invlist_contains_cp(prop_definition, 0x110000) 10020 && (! (_invlist_len(prop_definition) == 1 10021 && *invlist_array(prop_definition) == 0)))) 10022 { 10023 warn_super = TRUE; 10024 } 10025 10026 /* Invert if asking for the complement */ 10027 if (value == 'P') { 10028 _invlist_union_complement_2nd(properties, 10029 prop_definition, 10030 &properties); 10031 } 10032 else { 10033 _invlist_union(properties, prop_definition, &properties); 10034 } 10035 } 10036 } 10037 10038 RExC_parse_set(e + 1); 10039 namedclass = ANYOF_UNIPROP; /* no official name, but it's 10040 named */ 10041 } 10042 break; 10043 case 'n': value = '\n'; break; 10044 case 'r': value = '\r'; break; 10045 case 't': value = '\t'; break; 10046 case 'f': value = '\f'; break; 10047 case 'b': value = '\b'; break; 10048 case 'e': value = ESC_NATIVE; break; 10049 case 'a': value = '\a'; break; 10050 case 'o': 10051 RExC_parse--; /* function expects to be pointed at the 'o' */ 10052 if (! grok_bslash_o(&RExC_parse, 10053 RExC_end, 10054 &value, 10055 &message, 10056 &packed_warn, 10057 strict, 10058 cBOOL(range), /* MAX_UV allowed for range 10059 upper limit */ 10060 UTF)) 10061 { 10062 vFAIL(message); 10063 } 10064 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) { 10065 warn_non_literal_string(RExC_parse, packed_warn, message); 10066 } 10067 10068 if (value < 256) { 10069 non_portable_endpoint++; 10070 } 10071 break; 10072 case 'x': 10073 RExC_parse--; /* function expects to be pointed at the 'x' */ 10074 if (! grok_bslash_x(&RExC_parse, 10075 RExC_end, 10076 &value, 10077 &message, 10078 &packed_warn, 10079 strict, 10080 cBOOL(range), /* MAX_UV allowed for range 10081 upper limit */ 10082 UTF)) 10083 { 10084 vFAIL(message); 10085 } 10086 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) { 10087 warn_non_literal_string(RExC_parse, packed_warn, message); 10088 } 10089 10090 if (value < 256) { 10091 non_portable_endpoint++; 10092 } 10093 break; 10094 case 'c': 10095 if (! grok_bslash_c(*RExC_parse, &grok_c_char, &message, 10096 &packed_warn)) 10097 { 10098 /* going to die anyway; point to exact spot of 10099 * failure */ 10100 RExC_parse_inc_safe(); 10101 vFAIL(message); 10102 } 10103 10104 value = grok_c_char; 10105 RExC_parse_inc_by(1); 10106 if (message && TO_OUTPUT_WARNINGS(RExC_parse)) { 10107 warn_non_literal_string(RExC_parse, packed_warn, message); 10108 } 10109 10110 non_portable_endpoint++; 10111 break; 10112 case '0': case '1': case '2': case '3': case '4': 10113 case '5': case '6': case '7': 10114 { 10115 /* Take 1-3 octal digits */ 10116 I32 flags = PERL_SCAN_SILENT_ILLDIGIT 10117 | PERL_SCAN_NOTIFY_ILLDIGIT; 10118 numlen = (strict) ? 4 : 3; 10119 value = grok_oct(--RExC_parse, &numlen, &flags, NULL); 10120 RExC_parse_inc_by(numlen); 10121 if (numlen != 3) { 10122 if (strict) { 10123 RExC_parse_inc_safe(); 10124 vFAIL("Need exactly 3 octal digits"); 10125 } 10126 else if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT) 10127 && RExC_parse < RExC_end 10128 && isDIGIT(*RExC_parse) 10129 && ckWARN(WARN_REGEXP)) 10130 { 10131 reg_warn_non_literal_string( 10132 RExC_parse + 1, 10133 form_alien_digit_msg(8, numlen, RExC_parse, 10134 RExC_end, UTF, FALSE)); 10135 } 10136 } 10137 if (value < 256) { 10138 non_portable_endpoint++; 10139 } 10140 break; 10141 } 10142 default: 10143 /* Allow \_ to not give an error */ 10144 if (isWORDCHAR(value) && value != '_') { 10145 if (strict) { 10146 vFAIL2("Unrecognized escape \\%c in character class", 10147 (int)value); 10148 } 10149 else { 10150 ckWARN2reg(RExC_parse, 10151 "Unrecognized escape \\%c in character class passed through", 10152 (int)value); 10153 } 10154 } 10155 break; 10156 } /* End of switch on char following backslash */ 10157 } /* end of handling backslash escape sequences */ 10158 10159 /* Here, we have the current token in 'value' */ 10160 10161 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ 10162 U8 classnum; 10163 10164 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a 10165 * literal, as is the character that began the false range, i.e. 10166 * the 'a' in the examples */ 10167 if (range) { 10168 const int w = (RExC_parse >= rangebegin) 10169 ? RExC_parse - rangebegin 10170 : 0; 10171 if (strict) { 10172 vFAIL2utf8f( 10173 "False [] range \"%" UTF8f "\"", 10174 UTF8fARG(UTF, w, rangebegin)); 10175 } 10176 else { 10177 ckWARN2reg(RExC_parse, 10178 "False [] range \"%" UTF8f "\"", 10179 UTF8fARG(UTF, w, rangebegin)); 10180 cp_list = add_cp_to_invlist(cp_list, '-'); 10181 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, 10182 prevvalue); 10183 } 10184 10185 range = 0; /* this was not a true range */ 10186 element_count += 2; /* So counts for three values */ 10187 } 10188 10189 classnum = namedclass_to_classnum(namedclass); 10190 10191 if (LOC && namedclass < ANYOF_POSIXL_MAX 10192 #ifndef HAS_ISASCII 10193 && classnum != CC_ASCII_ 10194 #endif 10195 ) { 10196 SV* scratch_list = NULL; 10197 10198 /* What the Posix classes (like \w, [:space:]) match isn't 10199 * generally knowable under locale until actual match time. A 10200 * special node is used for these which has extra space for a 10201 * bitmap, with a bit reserved for each named class that is to 10202 * be matched against. (This isn't needed for \p{} and 10203 * pseudo-classes, as they are not affected by locale, and 10204 * hence are dealt with separately.) However, if a named class 10205 * and its complement are both present, then it matches 10206 * everything, and there is no runtime dependency. Odd numbers 10207 * are the complements of the next lower number, so xor works. 10208 * (Note that something like [\w\D] should match everything, 10209 * because \d should be a proper subset of \w. But rather than 10210 * trust that the locale is well behaved, we leave this to 10211 * runtime to sort out) */ 10212 if (POSIXL_TEST(posixl, namedclass ^ 1)) { 10213 cp_list = _add_range_to_invlist(cp_list, 0, UV_MAX); 10214 POSIXL_ZERO(posixl); 10215 has_runtime_dependency &= ~HAS_L_RUNTIME_DEPENDENCY; 10216 anyof_flags &= ~ANYOF_MATCHES_POSIXL; 10217 continue; /* We could ignore the rest of the class, but 10218 best to parse it for any errors */ 10219 } 10220 else { /* Here, isn't the complement of any already parsed 10221 class */ 10222 POSIXL_SET(posixl, namedclass); 10223 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY; 10224 anyof_flags |= ANYOF_MATCHES_POSIXL; 10225 10226 /* The above-Latin1 characters are not subject to locale 10227 * rules. Just add them to the unconditionally-matched 10228 * list */ 10229 10230 /* Get the list of the above-Latin1 code points this 10231 * matches */ 10232 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1, 10233 PL_XPosix_ptrs[classnum], 10234 10235 /* Odd numbers are complements, 10236 * like NDIGIT, NASCII, ... */ 10237 namedclass % 2 != 0, 10238 &scratch_list); 10239 /* Checking if 'cp_list' is NULL first saves an extra 10240 * clone. Its reference count will be decremented at the 10241 * next union, etc, or if this is the only instance, at the 10242 * end of the routine */ 10243 if (! cp_list) { 10244 cp_list = scratch_list; 10245 } 10246 else { 10247 _invlist_union(cp_list, scratch_list, &cp_list); 10248 SvREFCNT_dec_NN(scratch_list); 10249 } 10250 continue; /* Go get next character */ 10251 } 10252 } 10253 else { 10254 10255 /* Here, is not /l, or is a POSIX class for which /l doesn't 10256 * matter (or is a Unicode property, which is skipped here). */ 10257 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */ 10258 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */ 10259 10260 /* Here, should be \h, \H, \v, or \V. None of /d, /i 10261 * nor /l make a difference in what these match, 10262 * therefore we just add what they match to cp_list. */ 10263 if (classnum != CC_VERTSPACE_) { 10264 assert( namedclass == ANYOF_HORIZWS 10265 || namedclass == ANYOF_NHORIZWS); 10266 10267 /* It turns out that \h is just a synonym for 10268 * XPosixBlank */ 10269 classnum = CC_BLANK_; 10270 } 10271 10272 _invlist_union_maybe_complement_2nd( 10273 cp_list, 10274 PL_XPosix_ptrs[classnum], 10275 namedclass % 2 != 0, /* Complement if odd 10276 (NHORIZWS, NVERTWS) 10277 */ 10278 &cp_list); 10279 } 10280 } 10281 else if ( AT_LEAST_UNI_SEMANTICS 10282 || classnum == CC_ASCII_ 10283 || (DEPENDS_SEMANTICS && ( classnum == CC_DIGIT_ 10284 || classnum == CC_XDIGIT_))) 10285 { 10286 /* We usually have to worry about /d affecting what POSIX 10287 * classes match, with special code needed because we won't 10288 * know until runtime what all matches. But there is no 10289 * extra work needed under /u and /a; and [:ascii:] is 10290 * unaffected by /d; and :digit: and :xdigit: don't have 10291 * runtime differences under /d. So we can special case 10292 * these, and avoid some extra work below, and at runtime. 10293 * */ 10294 _invlist_union_maybe_complement_2nd( 10295 simple_posixes, 10296 ((AT_LEAST_ASCII_RESTRICTED) 10297 ? PL_Posix_ptrs[classnum] 10298 : PL_XPosix_ptrs[classnum]), 10299 namedclass % 2 != 0, 10300 &simple_posixes); 10301 } 10302 else { /* Garden variety class. If is NUPPER, NALPHA, ... 10303 complement and use nposixes */ 10304 SV** posixes_ptr = namedclass % 2 == 0 10305 ? &posixes 10306 : &nposixes; 10307 _invlist_union_maybe_complement_2nd( 10308 *posixes_ptr, 10309 PL_XPosix_ptrs[classnum], 10310 namedclass % 2 != 0, 10311 posixes_ptr); 10312 } 10313 } 10314 } /* end of namedclass \blah */ 10315 10316 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end); 10317 10318 /* If 'range' is set, 'value' is the ending of a range--check its 10319 * validity. (If value isn't a single code point in the case of a 10320 * range, we should have figured that out above in the code that 10321 * catches false ranges). Later, we will handle each individual code 10322 * point in the range. If 'range' isn't set, this could be the 10323 * beginning of a range, so check for that by looking ahead to see if 10324 * the next real character to be processed is the range indicator--the 10325 * minus sign */ 10326 10327 if (range) { 10328 #ifdef EBCDIC 10329 /* For unicode ranges, we have to test that the Unicode as opposed 10330 * to the native values are not decreasing. (Above 255, there is 10331 * no difference between native and Unicode) */ 10332 if (unicode_range && prevvalue < 255 && value < 255) { 10333 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) { 10334 goto backwards_range; 10335 } 10336 } 10337 else 10338 #endif 10339 if (prevvalue > value) /* b-a */ { 10340 int w; 10341 #ifdef EBCDIC 10342 backwards_range: 10343 #endif 10344 w = RExC_parse - rangebegin; 10345 vFAIL2utf8f( 10346 "Invalid [] range \"%" UTF8f "\"", 10347 UTF8fARG(UTF, w, rangebegin)); 10348 NOT_REACHED; /* NOTREACHED */ 10349 } 10350 } 10351 else { 10352 prevvalue = value; /* save the beginning of the potential range */ 10353 if (! stop_at_1 /* Can't be a range if parsing just one thing */ 10354 && *RExC_parse == '-') 10355 { 10356 char* next_char_ptr = RExC_parse + 1; 10357 10358 /* Get the next real char after the '-' */ 10359 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr, RExC_end); 10360 10361 /* If the '-' is at the end of the class (just before the ']', 10362 * it is a literal minus; otherwise it is a range */ 10363 if (next_char_ptr < RExC_end && *next_char_ptr != ']') { 10364 RExC_parse_set(next_char_ptr); 10365 10366 /* a bad range like \w-, [:word:]- ? */ 10367 if (namedclass > OOB_NAMEDCLASS) { 10368 if (strict || ckWARN(WARN_REGEXP)) { 10369 const int w = RExC_parse >= rangebegin 10370 ? RExC_parse - rangebegin 10371 : 0; 10372 if (strict) { 10373 vFAIL4("False [] range \"%*.*s\"", 10374 w, w, rangebegin); 10375 } 10376 else { 10377 vWARN4(RExC_parse, 10378 "False [] range \"%*.*s\"", 10379 w, w, rangebegin); 10380 } 10381 } 10382 cp_list = add_cp_to_invlist(cp_list, '-'); 10383 element_count++; 10384 } else 10385 range = 1; /* yeah, it's a range! */ 10386 continue; /* but do it the next time */ 10387 } 10388 } 10389 } 10390 10391 if (namedclass > OOB_NAMEDCLASS) { 10392 continue; 10393 } 10394 10395 /* Here, we have a single value this time through the loop, and 10396 * <prevvalue> is the beginning of the range, if any; or <value> if 10397 * not. */ 10398 10399 /* non-Latin1 code point implies unicode semantics. */ 10400 if (value > 255) { 10401 if (value > MAX_LEGAL_CP && ( value != UV_MAX 10402 || prevvalue > MAX_LEGAL_CP)) 10403 { 10404 vFAIL(form_cp_too_large_msg(16, NULL, 0, value)); 10405 } 10406 REQUIRE_UNI_RULES(flagp, 0); 10407 if ( ! silence_non_portable 10408 && UNICODE_IS_PERL_EXTENDED(value) 10409 && TO_OUTPUT_WARNINGS(RExC_parse)) 10410 { 10411 ckWARN2_non_literal_string(RExC_parse, 10412 packWARN(WARN_PORTABLE), 10413 PL_extended_cp_format, 10414 value); 10415 } 10416 } 10417 10418 /* Ready to process either the single value, or the completed range. 10419 * For single-valued non-inverted ranges, we consider the possibility 10420 * of multi-char folds. (We made a conscious decision to not do this 10421 * for the other cases because it can often lead to non-intuitive 10422 * results. For example, you have the peculiar case that: 10423 * "s s" =~ /^[^\xDF]+$/i => Y 10424 * "ss" =~ /^[^\xDF]+$/i => N 10425 * 10426 * See [perl #89750] */ 10427 if (FOLD && allow_mutiple_chars && value == prevvalue) { 10428 if ( value == LATIN_SMALL_LETTER_SHARP_S 10429 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold, 10430 value))) 10431 { 10432 /* Here <value> is indeed a multi-char fold. Get what it is */ 10433 10434 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; 10435 STRLEN foldlen; 10436 10437 UV folded = _to_uni_fold_flags( 10438 value, 10439 foldbuf, 10440 &foldlen, 10441 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED 10442 ? FOLD_FLAGS_NOMIX_ASCII 10443 : 0) 10444 ); 10445 10446 /* Here, <folded> should be the first character of the 10447 * multi-char fold of <value>, with <foldbuf> containing the 10448 * whole thing. But, if this fold is not allowed (because of 10449 * the flags), <fold> will be the same as <value>, and should 10450 * be processed like any other character, so skip the special 10451 * handling */ 10452 if (folded != value) { 10453 10454 /* Skip if we are recursed, currently parsing the class 10455 * again. Otherwise add this character to the list of 10456 * multi-char folds. */ 10457 if (! RExC_in_multi_char_class) { 10458 STRLEN cp_count = utf8_length(foldbuf, 10459 foldbuf + foldlen); 10460 SV* multi_fold = newSVpvs_flags("", SVs_TEMP); 10461 10462 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value); 10463 10464 multi_char_matches 10465 = add_multi_match(multi_char_matches, 10466 multi_fold, 10467 cp_count); 10468 10469 } 10470 10471 /* This element should not be processed further in this 10472 * class */ 10473 element_count--; 10474 value = save_value; 10475 prevvalue = save_prevvalue; 10476 continue; 10477 } 10478 } 10479 } 10480 10481 if (strict && ckWARN(WARN_REGEXP)) { 10482 if (range) { 10483 10484 /* If the range starts above 255, everything is portable and 10485 * likely to be so for any forseeable character set, so don't 10486 * warn. */ 10487 if (unicode_range && non_portable_endpoint && prevvalue < 256) { 10488 vWARN(RExC_parse, "Both or neither range ends should be Unicode"); 10489 } 10490 else if (prevvalue != value) { 10491 10492 /* Under strict, ranges that stop and/or end in an ASCII 10493 * printable should have each end point be a portable value 10494 * for it (preferably like 'A', but we don't warn if it is 10495 * a (portable) Unicode name or code point), and the range 10496 * must be all digits or all letters of the same case. 10497 * Otherwise, the range is non-portable and unclear as to 10498 * what it contains */ 10499 if ( (isPRINT_A(prevvalue) || isPRINT_A(value)) 10500 && ( non_portable_endpoint 10501 || ! ( (isDIGIT_A(prevvalue) && isDIGIT_A(value)) 10502 || (isLOWER_A(prevvalue) && isLOWER_A(value)) 10503 || (isUPPER_A(prevvalue) && isUPPER_A(value)) 10504 ))) { 10505 vWARN(RExC_parse, "Ranges of ASCII printables should" 10506 " be some subset of \"0-9\"," 10507 " \"A-Z\", or \"a-z\""); 10508 } 10509 else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) { 10510 SSize_t index_start; 10511 SSize_t index_final; 10512 10513 /* But the nature of Unicode and languages mean we 10514 * can't do the same checks for above-ASCII ranges, 10515 * except in the case of digit ones. These should 10516 * contain only digits from the same group of 10. The 10517 * ASCII case is handled just above. Hence here, the 10518 * range could be a range of digits. First some 10519 * unlikely special cases. Grandfather in that a range 10520 * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad 10521 * if its starting value is one of the 10 digits prior 10522 * to it. This is because it is an alternate way of 10523 * writing 19D1, and some people may expect it to be in 10524 * that group. But it is bad, because it won't give 10525 * the expected results. In Unicode 5.2 it was 10526 * considered to be in that group (of 11, hence), but 10527 * this was fixed in the next version */ 10528 10529 if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) { 10530 goto warn_bad_digit_range; 10531 } 10532 else if (UNLIKELY( prevvalue >= 0x1D7CE 10533 && value <= 0x1D7FF)) 10534 { 10535 /* This is the only other case currently in Unicode 10536 * where the algorithm below fails. The code 10537 * points just above are the end points of a single 10538 * range containing only decimal digits. It is 5 10539 * different series of 0-9. All other ranges of 10540 * digits currently in Unicode are just a single 10541 * series. (And mktables will notify us if a later 10542 * Unicode version breaks this.) 10543 * 10544 * If the range being checked is at most 9 long, 10545 * and the digit values represented are in 10546 * numerical order, they are from the same series. 10547 * */ 10548 if ( value - prevvalue > 9 10549 || ((( value - 0x1D7CE) % 10) 10550 <= (prevvalue - 0x1D7CE) % 10)) 10551 { 10552 goto warn_bad_digit_range; 10553 } 10554 } 10555 else { 10556 10557 /* For all other ranges of digits in Unicode, the 10558 * algorithm is just to check if both end points 10559 * are in the same series, which is the same range. 10560 * */ 10561 index_start = _invlist_search( 10562 PL_XPosix_ptrs[CC_DIGIT_], 10563 prevvalue); 10564 10565 /* Warn if the range starts and ends with a digit, 10566 * and they are not in the same group of 10. */ 10567 if ( index_start >= 0 10568 && ELEMENT_RANGE_MATCHES_INVLIST(index_start) 10569 && (index_final = 10570 _invlist_search(PL_XPosix_ptrs[CC_DIGIT_], 10571 value)) != index_start 10572 && index_final >= 0 10573 && ELEMENT_RANGE_MATCHES_INVLIST(index_final)) 10574 { 10575 warn_bad_digit_range: 10576 vWARN(RExC_parse, "Ranges of digits should be" 10577 " from the same group of" 10578 " 10"); 10579 } 10580 } 10581 } 10582 } 10583 } 10584 if ((! range || prevvalue == value) && non_portable_endpoint) { 10585 if (isPRINT_A(value)) { 10586 char literal[3]; 10587 unsigned d = 0; 10588 if (isBACKSLASHED_PUNCT(value)) { 10589 literal[d++] = '\\'; 10590 } 10591 literal[d++] = (char) value; 10592 literal[d++] = '\0'; 10593 10594 vWARN4(RExC_parse, 10595 "\"%.*s\" is more clearly written simply as \"%s\"", 10596 (int) (RExC_parse - rangebegin), 10597 rangebegin, 10598 literal 10599 ); 10600 } 10601 else if (isMNEMONIC_CNTRL(value)) { 10602 vWARN4(RExC_parse, 10603 "\"%.*s\" is more clearly written simply as \"%s\"", 10604 (int) (RExC_parse - rangebegin), 10605 rangebegin, 10606 cntrl_to_mnemonic((U8) value) 10607 ); 10608 } 10609 } 10610 } 10611 10612 /* Deal with this element of the class */ 10613 10614 #ifndef EBCDIC 10615 cp_foldable_list = _add_range_to_invlist(cp_foldable_list, 10616 prevvalue, value); 10617 #else 10618 /* On non-ASCII platforms, for ranges that span all of 0..255, and ones 10619 * that don't require special handling, we can just add the range like 10620 * we do for ASCII platforms */ 10621 if ((UNLIKELY(prevvalue == 0) && value >= 255) 10622 || ! (prevvalue < 256 10623 && (unicode_range 10624 || (! non_portable_endpoint 10625 && ((isLOWER_A(prevvalue) && isLOWER_A(value)) 10626 || (isUPPER_A(prevvalue) 10627 && isUPPER_A(value))))))) 10628 { 10629 cp_foldable_list = _add_range_to_invlist(cp_foldable_list, 10630 prevvalue, value); 10631 } 10632 else { 10633 /* Here, requires special handling. This can be because it is a 10634 * range whose code points are considered to be Unicode, and so 10635 * must be individually translated into native, or because its a 10636 * subrange of 'A-Z' or 'a-z' which each aren't contiguous in 10637 * EBCDIC, but we have defined them to include only the "expected" 10638 * upper or lower case ASCII alphabetics. Subranges above 255 are 10639 * the same in native and Unicode, so can be added as a range */ 10640 U8 start = NATIVE_TO_LATIN1(prevvalue); 10641 unsigned j; 10642 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255; 10643 for (j = start; j <= end; j++) { 10644 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j)); 10645 } 10646 if (value > 255) { 10647 cp_foldable_list = _add_range_to_invlist(cp_foldable_list, 10648 256, value); 10649 } 10650 } 10651 #endif 10652 10653 range = 0; /* this range (if it was one) is done now */ 10654 } /* End of loop through all the text within the brackets */ 10655 10656 if ( posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) { 10657 output_posix_warnings(pRExC_state, posix_warnings); 10658 } 10659 10660 /* If anything in the class expands to more than one character, we have to 10661 * deal with them by building up a substitute parse string, and recursively 10662 * calling reg() on it, instead of proceeding */ 10663 if (multi_char_matches) { 10664 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP); 10665 I32 cp_count; 10666 STRLEN len; 10667 char *save_end = RExC_end; 10668 char *save_parse = RExC_parse; 10669 char *save_start = RExC_start; 10670 Size_t constructed_prefix_len = 0; /* This gives the length of the 10671 constructed portion of the 10672 substitute parse. */ 10673 bool first_time = TRUE; /* First multi-char occurrence doesn't get 10674 a "|" */ 10675 I32 reg_flags; 10676 10677 assert(! invert); 10678 /* Only one level of recursion allowed */ 10679 assert(RExC_copy_start_in_constructed == RExC_precomp); 10680 10681 #if 0 /* Have decided not to deal with multi-char folds in inverted classes, 10682 because too confusing */ 10683 if (invert) { 10684 sv_catpvs(substitute_parse, "(?:"); 10685 } 10686 #endif 10687 10688 /* Look at the longest strings first */ 10689 for (cp_count = av_tindex_skip_len_mg(multi_char_matches); 10690 cp_count > 0; 10691 cp_count--) 10692 { 10693 10694 if (av_exists(multi_char_matches, cp_count)) { 10695 AV** this_array_ptr; 10696 SV* this_sequence; 10697 10698 this_array_ptr = (AV**) av_fetch_simple(multi_char_matches, 10699 cp_count, FALSE); 10700 while ((this_sequence = av_pop(*this_array_ptr)) != 10701 &PL_sv_undef) 10702 { 10703 if (! first_time) { 10704 sv_catpvs(substitute_parse, "|"); 10705 } 10706 first_time = FALSE; 10707 10708 sv_catpv(substitute_parse, SvPVX(this_sequence)); 10709 } 10710 } 10711 } 10712 10713 /* If the character class contains anything else besides these 10714 * multi-character strings, have to include it in recursive parsing */ 10715 if (element_count) { 10716 bool has_l_bracket = orig_parse > RExC_start && *(orig_parse - 1) == '['; 10717 10718 sv_catpvs(substitute_parse, "|"); 10719 if (has_l_bracket) { /* Add an [ if the original had one */ 10720 sv_catpvs(substitute_parse, "["); 10721 } 10722 constructed_prefix_len = SvCUR(substitute_parse); 10723 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse); 10724 10725 /* Put in a closing ']' to match any opening one, but not if going 10726 * off the end, as otherwise we are adding something that really 10727 * isn't there */ 10728 if (has_l_bracket && RExC_parse < RExC_end) { 10729 sv_catpvs(substitute_parse, "]"); 10730 } 10731 } 10732 10733 sv_catpvs(substitute_parse, ")"); 10734 #if 0 10735 if (invert) { 10736 /* This is a way to get the parse to skip forward a whole named 10737 * sequence instead of matching the 2nd character when it fails the 10738 * first */ 10739 sv_catpvs(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)"); 10740 } 10741 #endif 10742 10743 /* Set up the data structure so that any errors will be properly 10744 * reported. See the comments at the definition of 10745 * REPORT_LOCATION_ARGS for details */ 10746 RExC_copy_start_in_input = (char *) orig_parse; 10747 RExC_start = SvPV(substitute_parse, len); 10748 RExC_parse_set( RExC_start ); 10749 RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len; 10750 RExC_end = RExC_parse + len; 10751 RExC_in_multi_char_class = 1; 10752 10753 ret = reg(pRExC_state, 1, ®_flags, depth+1); 10754 10755 *flagp |= reg_flags & (HASWIDTH|SIMPLE|POSTPONED|RESTART_PARSE|NEED_UTF8); 10756 10757 /* And restore so can parse the rest of the pattern */ 10758 RExC_parse_set(save_parse); 10759 RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start; 10760 RExC_end = save_end; 10761 RExC_in_multi_char_class = 0; 10762 SvREFCNT_dec_NN(multi_char_matches); 10763 SvREFCNT_dec(properties); 10764 SvREFCNT_dec(cp_list); 10765 SvREFCNT_dec(simple_posixes); 10766 SvREFCNT_dec(posixes); 10767 SvREFCNT_dec(nposixes); 10768 SvREFCNT_dec(cp_foldable_list); 10769 return ret; 10770 } 10771 10772 /* If folding, we calculate all characters that could fold to or from the 10773 * ones already on the list */ 10774 if (cp_foldable_list) { 10775 if (FOLD) { 10776 UV start, end; /* End points of code point ranges */ 10777 10778 SV* fold_intersection = NULL; 10779 SV** use_list; 10780 10781 /* Our calculated list will be for Unicode rules. For locale 10782 * matching, we have to keep a separate list that is consulted at 10783 * runtime only when the locale indicates Unicode rules (and we 10784 * don't include potential matches in the ASCII/Latin1 range, as 10785 * any code point could fold to any other, based on the run-time 10786 * locale). For non-locale, we just use the general list */ 10787 if (LOC) { 10788 use_list = &only_utf8_locale_list; 10789 } 10790 else { 10791 use_list = &cp_list; 10792 } 10793 10794 /* Only the characters in this class that participate in folds need 10795 * be checked. Get the intersection of this class and all the 10796 * possible characters that are foldable. This can quickly narrow 10797 * down a large class */ 10798 _invlist_intersection(PL_in_some_fold, cp_foldable_list, 10799 &fold_intersection); 10800 10801 /* Now look at the foldable characters in this class individually */ 10802 invlist_iterinit(fold_intersection); 10803 while (invlist_iternext(fold_intersection, &start, &end)) { 10804 UV j; 10805 UV folded; 10806 10807 /* Look at every character in the range */ 10808 for (j = start; j <= end; j++) { 10809 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; 10810 STRLEN foldlen; 10811 unsigned int k; 10812 Size_t folds_count; 10813 U32 first_fold; 10814 const U32 * remaining_folds; 10815 10816 if (j < 256) { 10817 10818 /* Under /l, we don't know what code points below 256 10819 * fold to, except we do know the MICRO SIGN folds to 10820 * an above-255 character if the locale is UTF-8, so we 10821 * add it to the special list (in *use_list) Otherwise 10822 * we know now what things can match, though some folds 10823 * are valid under /d only if the target is UTF-8. 10824 * Those go in a separate list */ 10825 if ( IS_IN_SOME_FOLD_L1(j) 10826 && ! (LOC && j != MICRO_SIGN)) 10827 { 10828 10829 /* ASCII is always matched; non-ASCII is matched 10830 * only under Unicode rules (which could happen 10831 * under /l if the locale is a UTF-8 one */ 10832 if (isASCII(j) || ! DEPENDS_SEMANTICS) { 10833 *use_list = add_cp_to_invlist(*use_list, 10834 PL_fold_latin1[j]); 10835 } 10836 else if (j != PL_fold_latin1[j]) { 10837 upper_latin1_only_utf8_matches 10838 = add_cp_to_invlist( 10839 upper_latin1_only_utf8_matches, 10840 PL_fold_latin1[j]); 10841 } 10842 } 10843 10844 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j) 10845 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED)) 10846 { 10847 add_above_Latin1_folds(pRExC_state, 10848 (U8) j, 10849 use_list); 10850 } 10851 continue; 10852 } 10853 10854 /* Here is an above Latin1 character. We don't have the 10855 * rules hard-coded for it. First, get its fold. This is 10856 * the simple fold, as the multi-character folds have been 10857 * handled earlier and separated out */ 10858 folded = _to_uni_fold_flags(j, foldbuf, &foldlen, 10859 (ASCII_FOLD_RESTRICTED) 10860 ? FOLD_FLAGS_NOMIX_ASCII 10861 : 0); 10862 10863 /* Single character fold of above Latin1. Add everything 10864 * in its fold closure to the list that this node should 10865 * match. */ 10866 folds_count = _inverse_folds(folded, &first_fold, 10867 &remaining_folds); 10868 for (k = 0; k <= folds_count; k++) { 10869 UV c = (k == 0) /* First time through use itself */ 10870 ? folded 10871 : (k == 1) /* 2nd time use, the first fold */ 10872 ? first_fold 10873 10874 /* Then the remaining ones */ 10875 : remaining_folds[k-2]; 10876 10877 /* /aa doesn't allow folds between ASCII and non- */ 10878 if (( ASCII_FOLD_RESTRICTED 10879 && (isASCII(c) != isASCII(j)))) 10880 { 10881 continue; 10882 } 10883 10884 /* Folds under /l which cross the 255/256 boundary are 10885 * added to a separate list. (These are valid only 10886 * when the locale is UTF-8.) */ 10887 if (c < 256 && LOC) { 10888 *use_list = add_cp_to_invlist(*use_list, c); 10889 continue; 10890 } 10891 10892 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS) 10893 { 10894 cp_list = add_cp_to_invlist(cp_list, c); 10895 } 10896 else { 10897 /* Similarly folds involving non-ascii Latin1 10898 * characters under /d are added to their list */ 10899 upper_latin1_only_utf8_matches 10900 = add_cp_to_invlist( 10901 upper_latin1_only_utf8_matches, 10902 c); 10903 } 10904 } 10905 } 10906 } 10907 SvREFCNT_dec_NN(fold_intersection); 10908 } 10909 10910 /* Now that we have finished adding all the folds, there is no reason 10911 * to keep the foldable list separate */ 10912 _invlist_union(cp_list, cp_foldable_list, &cp_list); 10913 SvREFCNT_dec_NN(cp_foldable_list); 10914 } 10915 10916 /* And combine the result (if any) with any inversion lists from posix 10917 * classes. The lists are kept separate up to now because we don't want to 10918 * fold the classes */ 10919 if (simple_posixes) { /* These are the classes known to be unaffected by 10920 /a, /aa, and /d */ 10921 if (cp_list) { 10922 _invlist_union(cp_list, simple_posixes, &cp_list); 10923 SvREFCNT_dec_NN(simple_posixes); 10924 } 10925 else { 10926 cp_list = simple_posixes; 10927 } 10928 } 10929 if (posixes || nposixes) { 10930 if (! DEPENDS_SEMANTICS) { 10931 10932 /* For everything but /d, we can just add the current 'posixes' and 10933 * 'nposixes' to the main list */ 10934 if (posixes) { 10935 if (cp_list) { 10936 _invlist_union(cp_list, posixes, &cp_list); 10937 SvREFCNT_dec_NN(posixes); 10938 } 10939 else { 10940 cp_list = posixes; 10941 } 10942 } 10943 if (nposixes) { 10944 if (cp_list) { 10945 _invlist_union(cp_list, nposixes, &cp_list); 10946 SvREFCNT_dec_NN(nposixes); 10947 } 10948 else { 10949 cp_list = nposixes; 10950 } 10951 } 10952 } 10953 else { 10954 /* Under /d, things like \w match upper Latin1 characters only if 10955 * the target string is in UTF-8. But things like \W match all the 10956 * upper Latin1 characters if the target string is not in UTF-8. 10957 * 10958 * Handle the case with something like \W separately */ 10959 if (nposixes) { 10960 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL); 10961 10962 /* A complemented posix class matches all upper Latin1 10963 * characters if not in UTF-8. And it matches just certain 10964 * ones when in UTF-8. That means those certain ones are 10965 * matched regardless, so can just be added to the 10966 * unconditional list */ 10967 if (cp_list) { 10968 _invlist_union(cp_list, nposixes, &cp_list); 10969 SvREFCNT_dec_NN(nposixes); 10970 nposixes = NULL; 10971 } 10972 else { 10973 cp_list = nposixes; 10974 } 10975 10976 /* Likewise for 'posixes' */ 10977 _invlist_union(posixes, cp_list, &cp_list); 10978 SvREFCNT_dec(posixes); 10979 10980 /* Likewise for anything else in the range that matched only 10981 * under UTF-8 */ 10982 if (upper_latin1_only_utf8_matches) { 10983 _invlist_union(cp_list, 10984 upper_latin1_only_utf8_matches, 10985 &cp_list); 10986 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches); 10987 upper_latin1_only_utf8_matches = NULL; 10988 } 10989 10990 /* If we don't match all the upper Latin1 characters regardless 10991 * of UTF-8ness, we have to set a flag to match the rest when 10992 * not in UTF-8 */ 10993 _invlist_subtract(only_non_utf8_list, cp_list, 10994 &only_non_utf8_list); 10995 if (_invlist_len(only_non_utf8_list) != 0) { 10996 anyof_flags |= ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared; 10997 } 10998 SvREFCNT_dec_NN(only_non_utf8_list); 10999 } 11000 else { 11001 /* Here there were no complemented posix classes. That means 11002 * the upper Latin1 characters in 'posixes' match only when the 11003 * target string is in UTF-8. So we have to add them to the 11004 * list of those types of code points, while adding the 11005 * remainder to the unconditional list. 11006 * 11007 * First calculate what they are */ 11008 SV* nonascii_but_latin1_properties = NULL; 11009 _invlist_intersection(posixes, PL_UpperLatin1, 11010 &nonascii_but_latin1_properties); 11011 11012 /* And add them to the final list of such characters. */ 11013 _invlist_union(upper_latin1_only_utf8_matches, 11014 nonascii_but_latin1_properties, 11015 &upper_latin1_only_utf8_matches); 11016 11017 /* Remove them from what now becomes the unconditional list */ 11018 _invlist_subtract(posixes, nonascii_but_latin1_properties, 11019 &posixes); 11020 11021 /* And add those unconditional ones to the final list */ 11022 if (cp_list) { 11023 _invlist_union(cp_list, posixes, &cp_list); 11024 SvREFCNT_dec_NN(posixes); 11025 posixes = NULL; 11026 } 11027 else { 11028 cp_list = posixes; 11029 } 11030 11031 SvREFCNT_dec(nonascii_but_latin1_properties); 11032 11033 /* Get rid of any characters from the conditional list that we 11034 * now know are matched unconditionally, which may make that 11035 * list empty */ 11036 _invlist_subtract(upper_latin1_only_utf8_matches, 11037 cp_list, 11038 &upper_latin1_only_utf8_matches); 11039 if (_invlist_len(upper_latin1_only_utf8_matches) == 0) { 11040 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches); 11041 upper_latin1_only_utf8_matches = NULL; 11042 } 11043 } 11044 } 11045 } 11046 11047 /* And combine the result (if any) with any inversion list from properties. 11048 * The lists are kept separate up to now so that we can distinguish the two 11049 * in regards to matching above-Unicode. A run-time warning is generated 11050 * if a Unicode property is matched against a non-Unicode code point. But, 11051 * we allow user-defined properties to match anything, without any warning, 11052 * and we also suppress the warning if there is a portion of the character 11053 * class that isn't a Unicode property, and which matches above Unicode, \W 11054 * or [\x{110000}] for example. 11055 * (Note that in this case, unlike the Posix one above, there is no 11056 * <upper_latin1_only_utf8_matches>, because having a Unicode property 11057 * forces Unicode semantics */ 11058 if (properties) { 11059 if (cp_list) { 11060 11061 /* If it matters to the final outcome, see if a non-property 11062 * component of the class matches above Unicode. If so, the 11063 * warning gets suppressed. This is true even if just a single 11064 * such code point is specified, as, though not strictly correct if 11065 * another such code point is matched against, the fact that they 11066 * are using above-Unicode code points indicates they should know 11067 * the issues involved */ 11068 if (warn_super) { 11069 warn_super = ! (invert 11070 ^ (UNICODE_IS_SUPER(invlist_highest(cp_list)))); 11071 } 11072 11073 _invlist_union(properties, cp_list, &cp_list); 11074 SvREFCNT_dec_NN(properties); 11075 } 11076 else { 11077 cp_list = properties; 11078 } 11079 11080 if (warn_super) { 11081 anyof_flags |= ANYOF_WARN_SUPER__shared; 11082 11083 /* Because an ANYOF node is the only one that warns, this node 11084 * can't be optimized into something else */ 11085 optimizable = FALSE; 11086 } 11087 } 11088 11089 /* Here, we have calculated what code points should be in the character 11090 * class. 11091 * 11092 * Now we can see about various optimizations. Fold calculation (which we 11093 * did above) needs to take place before inversion. Otherwise /[^k]/i 11094 * would invert to include K, which under /i would match k, which it 11095 * shouldn't. Therefore we can't invert folded locale now, as it won't be 11096 * folded until runtime */ 11097 11098 /* If we didn't do folding, it's because some information isn't available 11099 * until runtime; set the run-time fold flag for these We know to set the 11100 * flag if we have a non-NULL list for UTF-8 locales, or the class matches 11101 * at least one 0-255 range code point */ 11102 if (LOC && FOLD) { 11103 11104 /* Some things on the list might be unconditionally included because of 11105 * other components. Remove them, and clean up the list if it goes to 11106 * 0 elements */ 11107 if (only_utf8_locale_list && cp_list) { 11108 _invlist_subtract(only_utf8_locale_list, cp_list, 11109 &only_utf8_locale_list); 11110 11111 if (_invlist_len(only_utf8_locale_list) == 0) { 11112 SvREFCNT_dec_NN(only_utf8_locale_list); 11113 only_utf8_locale_list = NULL; 11114 } 11115 } 11116 if ( only_utf8_locale_list 11117 || ( cp_list 11118 && ( _invlist_contains_cp(cp_list, 11119 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) 11120 || _invlist_contains_cp(cp_list, 11121 LATIN_SMALL_LETTER_DOTLESS_I)))) 11122 { 11123 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY; 11124 anyof_flags |= ANYOFL_FOLD|ANYOF_HAS_EXTRA_RUNTIME_MATCHES; 11125 } 11126 else if (cp_list && invlist_lowest(cp_list) < 256) { 11127 /* If nothing is below 256, has no locale dependency; otherwise it 11128 * does */ 11129 anyof_flags |= ANYOFL_FOLD; 11130 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY; 11131 11132 /* In a Turkish locale these could match, notify the run-time code 11133 * to check for that */ 11134 if ( _invlist_contains_cp(cp_list, 'I') 11135 || _invlist_contains_cp(cp_list, 'i')) 11136 { 11137 anyof_flags |= ANYOFL_FOLD|ANYOF_HAS_EXTRA_RUNTIME_MATCHES; 11138 } 11139 } 11140 } 11141 else if ( DEPENDS_SEMANTICS 11142 && ( upper_latin1_only_utf8_matches 11143 || ( anyof_flags 11144 & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared))) 11145 { 11146 RExC_seen_d_op = TRUE; 11147 has_runtime_dependency |= HAS_D_RUNTIME_DEPENDENCY; 11148 } 11149 11150 /* Optimize inverted patterns (e.g. [^a-z]) when everything is known at 11151 * compile time. */ 11152 if ( cp_list 11153 && invert 11154 && ! has_runtime_dependency) 11155 { 11156 _invlist_invert(cp_list); 11157 11158 /* Clear the invert flag since have just done it here */ 11159 invert = FALSE; 11160 } 11161 11162 /* All possible optimizations below still have these characteristics. 11163 * (Multi-char folds aren't SIMPLE, but they don't get this far in this 11164 * routine) */ 11165 *flagp |= HASWIDTH|SIMPLE; 11166 11167 if (ret_invlist) { 11168 *ret_invlist = cp_list; 11169 11170 return (cp_list) ? RExC_emit : 0; 11171 } 11172 11173 if (anyof_flags & ANYOF_LOCALE_FLAGS) { 11174 RExC_contains_locale = 1; 11175 } 11176 11177 if (optimizable) { 11178 11179 /* Some character classes are equivalent to other nodes. Such nodes 11180 * take up less room, and some nodes require fewer operations to 11181 * execute, than ANYOF nodes. EXACTish nodes may be joinable with 11182 * adjacent nodes to improve efficiency. */ 11183 op = optimize_regclass(pRExC_state, cp_list, 11184 only_utf8_locale_list, 11185 upper_latin1_only_utf8_matches, 11186 has_runtime_dependency, 11187 posixl, 11188 &anyof_flags, &invert, &ret, flagp); 11189 RETURN_FAIL_ON_RESTART_FLAGP(flagp); 11190 11191 /* If optimized to something else and emitted, clean up and return */ 11192 if (ret >= 0) { 11193 SvREFCNT_dec(cp_list); 11194 SvREFCNT_dec(only_utf8_locale_list); 11195 SvREFCNT_dec(upper_latin1_only_utf8_matches); 11196 return ret; 11197 } 11198 11199 /* If no optimization was found, an END was returned and we will now 11200 * emit an ANYOF */ 11201 if (op == END) { 11202 op = ANYOF; 11203 } 11204 } 11205 11206 /* Here are going to emit an ANYOF; set the particular type */ 11207 if (op == ANYOF) { 11208 if (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY) { 11209 op = ANYOFD; 11210 } 11211 else if (posixl) { 11212 op = ANYOFPOSIXL; 11213 } 11214 else if (LOC) { 11215 op = ANYOFL; 11216 } 11217 } 11218 11219 ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op)); 11220 FILL_NODE(ret, op); /* We set the argument later */ 11221 RExC_emit += NODE_STEP_REGNODE + REGNODE_ARG_LEN(op); 11222 ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags; 11223 11224 /* Here, <cp_list> contains all the code points we can determine at 11225 * compile time that match under all conditions. Go through it, and 11226 * for things that belong in the bitmap, put them there, and delete from 11227 * <cp_list>. While we are at it, see if everything above 255 is in the 11228 * list, and if so, set a flag to speed up execution */ 11229 11230 populate_anyof_bitmap_from_invlist(REGNODE_p(ret), &cp_list); 11231 11232 if (posixl) { 11233 ANYOF_POSIXL_SET_TO_BITMAP(REGNODE_p(ret), posixl); 11234 } 11235 11236 if (invert) { 11237 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_INVERT; 11238 } 11239 11240 /* Here, the bitmap has been populated with all the Latin1 code points that 11241 * always match. Can now add to the overall list those that match only 11242 * when the target string is UTF-8 (<upper_latin1_only_utf8_matches>). 11243 * */ 11244 if (upper_latin1_only_utf8_matches) { 11245 if (cp_list) { 11246 _invlist_union(cp_list, 11247 upper_latin1_only_utf8_matches, 11248 &cp_list); 11249 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches); 11250 } 11251 else { 11252 cp_list = upper_latin1_only_utf8_matches; 11253 } 11254 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES; 11255 } 11256 11257 set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list, 11258 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) 11259 ? listsv 11260 : NULL, 11261 only_utf8_locale_list); 11262 11263 SvREFCNT_dec(cp_list); 11264 SvREFCNT_dec(only_utf8_locale_list); 11265 return ret; 11266 } 11267 11268 STATIC U8 11269 S_optimize_regclass(pTHX_ 11270 RExC_state_t *pRExC_state, 11271 SV * cp_list, 11272 SV* only_utf8_locale_list, 11273 SV* upper_latin1_only_utf8_matches, 11274 const U32 has_runtime_dependency, 11275 const U32 posixl, 11276 U8 * anyof_flags, 11277 bool * invert, 11278 regnode_offset * ret, 11279 I32 *flagp 11280 ) 11281 { 11282 /* This function exists just to make S_regclass() smaller. It extracts out 11283 * the code that looks for potential optimizations away from a full generic 11284 * ANYOF node. The parameter names are the same as the corresponding 11285 * variables in S_regclass. 11286 * 11287 * It returns the new op (the impossible END one if no optimization found) 11288 * and sets *ret to any created regnode. If the new op is sufficiently 11289 * like plain ANYOF, it leaves *ret unchanged for allocation in S_regclass. 11290 * 11291 * Certain of the parameters may be updated as a result of the changes 11292 * herein */ 11293 11294 U8 op = END; /* The returned node-type, initialized to an impossible 11295 one. */ 11296 UV value = 0; 11297 PERL_UINT_FAST8_T i; 11298 UV partial_cp_count = 0; 11299 UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */ 11300 UV end[MAX_FOLD_FROMS+1] = { 0 }; 11301 bool single_range = FALSE; 11302 UV lowest_cp = 0, highest_cp = 0; 11303 11304 PERL_ARGS_ASSERT_OPTIMIZE_REGCLASS; 11305 11306 if (cp_list) { /* Count the code points in enough ranges that we would see 11307 all the ones possible in any fold in this version of 11308 Unicode */ 11309 11310 invlist_iterinit(cp_list); 11311 for (i = 0; i <= MAX_FOLD_FROMS; i++) { 11312 if (! invlist_iternext(cp_list, &start[i], &end[i])) { 11313 break; 11314 } 11315 partial_cp_count += end[i] - start[i] + 1; 11316 } 11317 11318 if (i == 1) { 11319 single_range = TRUE; 11320 } 11321 invlist_iterfinish(cp_list); 11322 11323 /* If we know at compile time that this matches every possible code 11324 * point, any run-time dependencies don't matter */ 11325 if (start[0] == 0 && end[0] == UV_MAX) { 11326 if (*invert) { 11327 goto return_OPFAIL; 11328 } 11329 else { 11330 goto return_SANY; 11331 } 11332 } 11333 11334 /* Use a clearer mnemonic for below */ 11335 lowest_cp = start[0]; 11336 11337 highest_cp = invlist_highest(cp_list); 11338 } 11339 11340 /* Similarly, for /l posix classes, if both a class and its complement 11341 * match, any run-time dependencies don't matter */ 11342 if (posixl) { 11343 int namedclass; 11344 for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX; namedclass += 2) { 11345 if ( POSIXL_TEST(posixl, namedclass) /* class */ 11346 && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */ 11347 { 11348 if (*invert) { 11349 goto return_OPFAIL; 11350 } 11351 goto return_SANY; 11352 } 11353 } 11354 11355 /* For well-behaved locales, some classes are subsets of others, so 11356 * complementing the subset and including the non-complemented superset 11357 * should match everything, like [\D[:alnum:]], and 11358 * [[:^alpha:][:alnum:]], but some implementations of locales are 11359 * buggy, and khw thinks its a bad idea to have optimization change 11360 * behavior, even if it avoids an OS bug in a given case */ 11361 11362 #define isSINGLE_BIT_SET(n) isPOWER_OF_2(n) 11363 11364 /* If is a single posix /l class, can optimize to just that op. Such a 11365 * node will not match anything in the Latin1 range, as that is not 11366 * determinable until runtime, but will match whatever the class does 11367 * outside that range. (Note that some classes won't match anything 11368 * outside the range, like [:ascii:]) */ 11369 if ( isSINGLE_BIT_SET(posixl) 11370 && (partial_cp_count == 0 || lowest_cp > 255)) 11371 { 11372 U8 classnum; 11373 SV * class_above_latin1 = NULL; 11374 bool already_inverted; 11375 bool are_equivalent; 11376 11377 11378 namedclass = single_1bit_pos32(posixl); 11379 classnum = namedclass_to_classnum(namedclass); 11380 11381 /* The named classes are such that the inverted number is one 11382 * larger than the non-inverted one */ 11383 already_inverted = namedclass - classnum_to_namedclass(classnum); 11384 11385 /* Create an inversion list of the official property, inverted if 11386 * the constructed node list is inverted, and restricted to only 11387 * the above latin1 code points, which are the only ones known at 11388 * compile time */ 11389 _invlist_intersection_maybe_complement_2nd( 11390 PL_AboveLatin1, 11391 PL_XPosix_ptrs[classnum], 11392 already_inverted, 11393 &class_above_latin1); 11394 are_equivalent = _invlistEQ(class_above_latin1, cp_list, FALSE); 11395 SvREFCNT_dec_NN(class_above_latin1); 11396 11397 if (are_equivalent) { 11398 11399 /* Resolve the run-time inversion flag with this possibly 11400 * inverted class */ 11401 *invert = *invert ^ already_inverted; 11402 11403 op = POSIXL + *invert * (NPOSIXL - POSIXL); 11404 *ret = reg_node(pRExC_state, op); 11405 FLAGS(REGNODE_p(*ret)) = classnum; 11406 return op; 11407 } 11408 } 11409 } 11410 11411 /* khw can't think of any other possible transformation involving these. */ 11412 if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) { 11413 return END; 11414 } 11415 11416 if (! has_runtime_dependency) { 11417 11418 /* If the list is empty, nothing matches. This happens, for example, 11419 * when a Unicode property that doesn't match anything is the only 11420 * element in the character class (perluniprops.pod notes such 11421 * properties). */ 11422 if (partial_cp_count == 0) { 11423 if (*invert) { 11424 goto return_SANY; 11425 } 11426 else { 11427 goto return_OPFAIL; 11428 } 11429 } 11430 11431 /* If matches everything but \n */ 11432 if ( start[0] == 0 && end[0] == '\n' - 1 11433 && start[1] == '\n' + 1 && end[1] == UV_MAX) 11434 { 11435 assert (! *invert); 11436 op = REG_ANY; 11437 *ret = reg_node(pRExC_state, op); 11438 MARK_NAUGHTY(1); 11439 return op; 11440 } 11441 } 11442 11443 /* Next see if can optimize classes that contain just a few code points 11444 * into an EXACTish node. The reason to do this is to let the optimizer 11445 * join this node with adjacent EXACTish ones, and ANYOF nodes require 11446 * runtime conversion to code point from UTF-8, which we'd like to avoid. 11447 * 11448 * An EXACTFish node can be generated even if not under /i, and vice versa. 11449 * But care must be taken. An EXACTFish node has to be such that it only 11450 * matches precisely the code points in the class, but we want to generate 11451 * the least restrictive one that does that, to increase the odds of being 11452 * able to join with an adjacent node. For example, if the class contains 11453 * [kK], we have to make it an EXACTFAA node to prevent the KELVIN SIGN 11454 * from matching. Whether we are under /i or not is irrelevant in this 11455 * case. Less obvious is the pattern qr/[\x{02BC}]n/i. U+02BC is MODIFIER 11456 * LETTER APOSTROPHE. That is supposed to match the single character U+0149 11457 * LATIN SMALL LETTER N PRECEDED BY APOSTROPHE. And so even though there 11458 * is no simple fold that includes \X{02BC}, there is a multi-char fold 11459 * that does, and so the node generated for it must be an EXACTFish one. 11460 * On the other hand qr/:/i should generate a plain EXACT node since the 11461 * colon participates in no fold whatsoever, and having it be EXACT tells 11462 * the optimizer the target string cannot match unless it has a colon in 11463 * it. */ 11464 if ( ! posixl 11465 && ! *invert 11466 11467 /* Only try if there are no more code points in the class than in 11468 * the max possible fold */ 11469 && inRANGE(partial_cp_count, 1, MAX_FOLD_FROMS + 1)) 11470 { 11471 /* We can always make a single code point class into an EXACTish node. 11472 * */ 11473 if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches) { 11474 if (LOC) { 11475 11476 /* Here is /l: Use EXACTL, except if there is a fold not known 11477 * until runtime so shows as only a single code point here. 11478 * For code points above 255, we know which can cause problems 11479 * by having a potential fold to the Latin1 range. */ 11480 if ( ! FOLD 11481 || ( lowest_cp > 255 11482 && ! is_PROBLEMATIC_LOCALE_FOLD_cp(lowest_cp))) 11483 { 11484 op = EXACTL; 11485 } 11486 else { 11487 op = EXACTFL; 11488 } 11489 } 11490 else if (! FOLD) { /* Not /l and not /i */ 11491 op = (lowest_cp < 256) ? EXACT : EXACT_REQ8; 11492 } 11493 else if (lowest_cp < 256) { /* /i, not /l, and the code point is 11494 small */ 11495 11496 /* Under /i, it gets a little tricky. A code point that 11497 * doesn't participate in a fold should be an EXACT node. We 11498 * know this one isn't the result of a simple fold, or there'd 11499 * be more than one code point in the list, but it could be 11500 * part of a multi-character fold. In that case we better not 11501 * create an EXACT node, as we would wrongly be telling the 11502 * optimizer that this code point must be in the target string, 11503 * and that is wrong. This is because if the sequence around 11504 * this code point forms a multi-char fold, what needs to be in 11505 * the string could be the code point that folds to the 11506 * sequence. 11507 * 11508 * This handles the case of below-255 code points, as we have 11509 * an easy look up for those. The next clause handles the 11510 * above-256 one */ 11511 op = IS_IN_SOME_FOLD_L1(lowest_cp) 11512 ? EXACTFU 11513 : EXACT; 11514 } 11515 else { /* /i, larger code point. Since we are under /i, and have 11516 just this code point, we know that it can't fold to 11517 something else, so PL_InMultiCharFold applies to it */ 11518 op = (_invlist_contains_cp(PL_InMultiCharFold, lowest_cp)) 11519 ? EXACTFU_REQ8 11520 : EXACT_REQ8; 11521 } 11522 11523 value = lowest_cp; 11524 } 11525 else if ( ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY) 11526 && _invlist_contains_cp(PL_in_some_fold, lowest_cp)) 11527 { 11528 /* Here, the only runtime dependency, if any, is from /d, and the 11529 * class matches more than one code point, and the lowest code 11530 * point participates in some fold. It might be that the other 11531 * code points are /i equivalent to this one, and hence they would 11532 * be representable by an EXACTFish node. Above, we eliminated 11533 * classes that contain too many code points to be EXACTFish, with 11534 * the test for MAX_FOLD_FROMS 11535 * 11536 * First, special case the ASCII fold pairs, like 'B' and 'b'. We 11537 * do this because we have EXACTFAA at our disposal for the ASCII 11538 * range */ 11539 if (partial_cp_count == 2 && isASCII(lowest_cp)) { 11540 11541 /* The only ASCII characters that participate in folds are 11542 * alphabetics */ 11543 assert(isALPHA(lowest_cp)); 11544 if ( end[0] == start[0] /* First range is a single 11545 character, so 2nd exists */ 11546 && isALPHA_FOLD_EQ(start[0], start[1])) 11547 { 11548 /* Here, is part of an ASCII fold pair */ 11549 11550 if ( ASCII_FOLD_RESTRICTED 11551 || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(lowest_cp)) 11552 { 11553 /* If the second clause just above was true, it means 11554 * we can't be under /i, or else the list would have 11555 * included more than this fold pair. Therefore we 11556 * have to exclude the possibility of whatever else it 11557 * is that folds to these, by using EXACTFAA */ 11558 op = EXACTFAA; 11559 } 11560 else if (HAS_NONLATIN1_FOLD_CLOSURE(lowest_cp)) { 11561 11562 /* Here, there's no simple fold that lowest_cp is part 11563 * of, but there is a multi-character one. If we are 11564 * not under /i, we want to exclude that possibility; 11565 * if under /i, we want to include it */ 11566 op = (FOLD) ? EXACTFU : EXACTFAA; 11567 } 11568 else { 11569 11570 /* Here, the only possible fold lowest_cp participates in 11571 * is with start[1]. /i or not isn't relevant */ 11572 op = EXACTFU; 11573 } 11574 11575 value = toFOLD(lowest_cp); 11576 } 11577 } 11578 else if ( ! upper_latin1_only_utf8_matches 11579 || ( _invlist_len(upper_latin1_only_utf8_matches) == 2 11580 && PL_fold_latin1[ 11581 invlist_highest(upper_latin1_only_utf8_matches)] 11582 == lowest_cp)) 11583 { 11584 /* Here, the smallest character is non-ascii or there are more 11585 * than 2 code points matched by this node. Also, we either 11586 * don't have /d UTF-8 dependent matches, or if we do, they 11587 * look like they could be a single character that is the fold 11588 * of the lowest one is in the always-match list. This test 11589 * quickly excludes most of the false positives when there are 11590 * /d UTF-8 depdendent matches. These are like LATIN CAPITAL 11591 * LETTER A WITH GRAVE matching LATIN SMALL LETTER A WITH GRAVE 11592 * iff the target string is UTF-8. (We don't have to worry 11593 * above about exceeding the array bounds of PL_fold_latin1[] 11594 * because any code point in 'upper_latin1_only_utf8_matches' 11595 * is below 256.) 11596 * 11597 * EXACTFAA would apply only to pairs (hence exactly 2 code 11598 * points) in the ASCII range, so we can't use it here to 11599 * artificially restrict the fold domain, so we check if the 11600 * class does or does not match some EXACTFish node. Further, 11601 * if we aren't under /i, and and the folded-to character is 11602 * part of a multi-character fold, we can't do this 11603 * optimization, as the sequence around it could be that 11604 * multi-character fold, and we don't here know the context, so 11605 * we have to assume it is that multi-char fold, to prevent 11606 * potential bugs. 11607 * 11608 * To do the general case, we first find the fold of the lowest 11609 * code point (which may be higher than that lowest unfolded 11610 * one), then find everything that folds to it. (The data 11611 * structure we have only maps from the folded code points, so 11612 * we have to do the earlier step.) */ 11613 11614 Size_t foldlen; 11615 U8 foldbuf[UTF8_MAXBYTES_CASE]; 11616 UV folded = _to_uni_fold_flags(lowest_cp, foldbuf, &foldlen, 0); 11617 U32 first_fold; 11618 const U32 * remaining_folds; 11619 Size_t folds_to_this_cp_count = _inverse_folds( 11620 folded, 11621 &first_fold, 11622 &remaining_folds); 11623 Size_t folds_count = folds_to_this_cp_count + 1; 11624 SV * fold_list = _new_invlist(folds_count); 11625 unsigned int i; 11626 11627 /* If there are UTF-8 dependent matches, create a temporary 11628 * list of what this node matches, including them. */ 11629 SV * all_cp_list = NULL; 11630 SV ** use_this_list = &cp_list; 11631 11632 if (upper_latin1_only_utf8_matches) { 11633 all_cp_list = _new_invlist(0); 11634 use_this_list = &all_cp_list; 11635 _invlist_union(cp_list, 11636 upper_latin1_only_utf8_matches, 11637 use_this_list); 11638 } 11639 11640 /* Having gotten everything that participates in the fold 11641 * containing the lowest code point, we turn that into an 11642 * inversion list, making sure everything is included. */ 11643 fold_list = add_cp_to_invlist(fold_list, lowest_cp); 11644 fold_list = add_cp_to_invlist(fold_list, folded); 11645 if (folds_to_this_cp_count > 0) { 11646 fold_list = add_cp_to_invlist(fold_list, first_fold); 11647 for (i = 0; i + 1 < folds_to_this_cp_count; i++) { 11648 fold_list = add_cp_to_invlist(fold_list, 11649 remaining_folds[i]); 11650 } 11651 } 11652 11653 /* If the fold list is identical to what's in this ANYOF node, 11654 * the node can be represented by an EXACTFish one instead */ 11655 if (_invlistEQ(*use_this_list, fold_list, 11656 0 /* Don't complement */ ) 11657 ) { 11658 11659 /* But, we have to be careful, as mentioned above. Just 11660 * the right sequence of characters could match this if it 11661 * is part of a multi-character fold. That IS what we want 11662 * if we are under /i. But it ISN'T what we want if not 11663 * under /i, as it could match when it shouldn't. So, when 11664 * we aren't under /i and this character participates in a 11665 * multi-char fold, we don't optimize into an EXACTFish 11666 * node. So, for each case below we have to check if we 11667 * are folding, and if not, if it is not part of a 11668 * multi-char fold. */ 11669 if (lowest_cp > 255) { /* Highish code point */ 11670 if (FOLD || ! _invlist_contains_cp( 11671 PL_InMultiCharFold, folded)) 11672 { 11673 op = (LOC) 11674 ? EXACTFLU8 11675 : (ASCII_FOLD_RESTRICTED) 11676 ? EXACTFAA 11677 : EXACTFU_REQ8; 11678 value = folded; 11679 } 11680 } /* Below, the lowest code point < 256 */ 11681 else if ( FOLD 11682 && folded == 's' 11683 && DEPENDS_SEMANTICS) 11684 { /* An EXACTF node containing a single character 's', 11685 can be an EXACTFU if it doesn't get joined with an 11686 adjacent 's' */ 11687 op = EXACTFU_S_EDGE; 11688 value = folded; 11689 } 11690 else if ( FOLD 11691 || ! HAS_NONLATIN1_FOLD_CLOSURE(lowest_cp)) 11692 { 11693 if (upper_latin1_only_utf8_matches) { 11694 op = EXACTF; 11695 11696 /* We can't use the fold, as that only matches 11697 * under UTF-8 */ 11698 value = lowest_cp; 11699 } 11700 else if ( UNLIKELY(lowest_cp == MICRO_SIGN) 11701 && ! UTF) 11702 { /* EXACTFUP is a special node for this character */ 11703 op = (ASCII_FOLD_RESTRICTED) 11704 ? EXACTFAA 11705 : EXACTFUP; 11706 value = MICRO_SIGN; 11707 } 11708 else if ( ASCII_FOLD_RESTRICTED 11709 && ! isASCII(lowest_cp)) 11710 { /* For ASCII under /iaa, we can use EXACTFU below 11711 */ 11712 op = EXACTFAA; 11713 value = folded; 11714 } 11715 else { 11716 op = EXACTFU; 11717 value = folded; 11718 } 11719 } 11720 } 11721 11722 SvREFCNT_dec_NN(fold_list); 11723 SvREFCNT_dec(all_cp_list); 11724 } 11725 } 11726 11727 if (op != END) { 11728 U8 len; 11729 11730 /* Here, we have calculated what EXACTish node to use. Have to 11731 * convert to UTF-8 if not already there */ 11732 if (value > 255) { 11733 if (! UTF) { 11734 SvREFCNT_dec(cp_list); 11735 REQUIRE_UTF8(flagp); 11736 } 11737 11738 /* This is a kludge to the special casing issues with this 11739 * ligature under /aa. FB05 should fold to FB06, but the call 11740 * above to _to_uni_fold_flags() didn't find this, as it didn't 11741 * use the /aa restriction in order to not miss other folds 11742 * that would be affected. This is the only instance likely to 11743 * ever be a problem in all of Unicode. So special case it. */ 11744 if ( value == LATIN_SMALL_LIGATURE_LONG_S_T 11745 && ASCII_FOLD_RESTRICTED) 11746 { 11747 value = LATIN_SMALL_LIGATURE_ST; 11748 } 11749 } 11750 11751 len = (UTF) ? UVCHR_SKIP(value) : 1; 11752 11753 *ret = REGNODE_GUTS(pRExC_state, op, len); 11754 FILL_NODE(*ret, op); 11755 RExC_emit += NODE_STEP_REGNODE + STR_SZ(len); 11756 setSTR_LEN(REGNODE_p(*ret), len); 11757 if (len == 1) { 11758 *STRINGs(REGNODE_p(*ret)) = (U8) value; 11759 } 11760 else { 11761 uvchr_to_utf8((U8 *) STRINGs(REGNODE_p(*ret)), value); 11762 } 11763 11764 return op; 11765 } 11766 } 11767 11768 if (! has_runtime_dependency) { 11769 11770 /* See if this can be turned into an ANYOFM node. Think about the bit 11771 * patterns in two different bytes. In some positions, the bits in 11772 * each will be 1; and in other positions both will be 0; and in some 11773 * positions the bit will be 1 in one byte, and 0 in the other. Let 11774 * 'n' be the number of positions where the bits differ. We create a 11775 * mask which has exactly 'n' 0 bits, each in a position where the two 11776 * bytes differ. Now take the set of all bytes that when ANDed with 11777 * the mask yield the same result. That set has 2**n elements, and is 11778 * representable by just two 8 bit numbers: the result and the mask. 11779 * Importantly, matching the set can be vectorized by creating a word 11780 * full of the result bytes, and a word full of the mask bytes, 11781 * yielding a significant speed up. Here, see if this node matches 11782 * such a set. As a concrete example consider [01], and the byte 11783 * representing '0' which is 0x30 on ASCII machines. It has the bits 11784 * 0011 0000. Take the mask 1111 1110. If we AND 0x31 and 0x30 with 11785 * that mask we get 0x30. Any other bytes ANDed yield something else. 11786 * So [01], which is a common usage, is optimizable into ANYOFM, and 11787 * can benefit from the speed up. We can only do this on UTF-8 11788 * invariant bytes, because they have the same bit patterns under UTF-8 11789 * as not. */ 11790 PERL_UINT_FAST8_T inverted = 0; 11791 11792 /* Highest possible UTF-8 invariant is 7F on ASCII platforms; FF on 11793 * EBCDIC */ 11794 const PERL_UINT_FAST8_T max_permissible 11795 = nBIT_UMAX(7 + ONE_IF_EBCDIC_ZERO_IF_NOT); 11796 11797 /* If doesn't fit the criteria for ANYOFM, invert and try again. If 11798 * that works we will instead later generate an NANYOFM, and invert 11799 * back when through */ 11800 if (highest_cp > max_permissible) { 11801 _invlist_invert(cp_list); 11802 inverted = 1; 11803 } 11804 11805 if (invlist_highest(cp_list) <= max_permissible) { 11806 UV this_start, this_end; 11807 UV lowest_cp = UV_MAX; /* init'ed to suppress compiler warn */ 11808 U8 bits_differing = 0; 11809 Size_t full_cp_count = 0; 11810 bool first_time = TRUE; 11811 11812 /* Go through the bytes and find the bit positions that differ */ 11813 invlist_iterinit(cp_list); 11814 while (invlist_iternext(cp_list, &this_start, &this_end)) { 11815 unsigned int i = this_start; 11816 11817 if (first_time) { 11818 if (! UVCHR_IS_INVARIANT(i)) { 11819 goto done_anyofm; 11820 } 11821 11822 first_time = FALSE; 11823 lowest_cp = this_start; 11824 11825 /* We have set up the code point to compare with. Don't 11826 * compare it with itself */ 11827 i++; 11828 } 11829 11830 /* Find the bit positions that differ from the lowest code 11831 * point in the node. Keep track of all such positions by 11832 * OR'ing */ 11833 for (; i <= this_end; i++) { 11834 if (! UVCHR_IS_INVARIANT(i)) { 11835 goto done_anyofm; 11836 } 11837 11838 bits_differing |= i ^ lowest_cp; 11839 } 11840 11841 full_cp_count += this_end - this_start + 1; 11842 } 11843 11844 /* At the end of the loop, we count how many bits differ from the 11845 * bits in lowest code point, call the count 'd'. If the set we 11846 * found contains 2**d elements, it is the closure of all code 11847 * points that differ only in those bit positions. To convince 11848 * yourself of that, first note that the number in the closure must 11849 * be a power of 2, which we test for. The only way we could have 11850 * that count and it be some differing set, is if we got some code 11851 * points that don't differ from the lowest code point in any 11852 * position, but do differ from each other in some other position. 11853 * That means one code point has a 1 in that position, and another 11854 * has a 0. But that would mean that one of them differs from the 11855 * lowest code point in that position, which possibility we've 11856 * already excluded. */ 11857 if ( (inverted || full_cp_count > 1) 11858 && full_cp_count == 1U << PL_bitcount[bits_differing]) 11859 { 11860 U8 ANYOFM_mask; 11861 11862 op = ANYOFM + inverted; 11863 11864 /* We need to make the bits that differ be 0's */ 11865 ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */ 11866 11867 /* The argument is the lowest code point */ 11868 *ret = reg1node(pRExC_state, op, lowest_cp); 11869 FLAGS(REGNODE_p(*ret)) = ANYOFM_mask; 11870 } 11871 11872 done_anyofm: 11873 invlist_iterfinish(cp_list); 11874 } 11875 11876 if (inverted) { 11877 _invlist_invert(cp_list); 11878 } 11879 11880 if (op != END) { 11881 return op; 11882 } 11883 11884 /* XXX We could create an ANYOFR_LOW node here if we saved above if all 11885 * were invariants, it wasn't inverted, and there is a single range. 11886 * This would be faster than some of the posix nodes we create below 11887 * like /\d/a, but would be twice the size. Without having actually 11888 * measured the gain, khw doesn't think the tradeoff is really worth it 11889 * */ 11890 } 11891 11892 if (! (*anyof_flags & ANYOF_LOCALE_FLAGS)) { 11893 PERL_UINT_FAST8_T type; 11894 SV * intersection = NULL; 11895 SV* d_invlist = NULL; 11896 11897 /* See if this matches any of the POSIX classes. The POSIXA and POSIXD 11898 * ones are about the same speed as ANYOF ops, but take less room; the 11899 * ones that have above-Latin1 code point matches are somewhat faster 11900 * than ANYOF. */ 11901 11902 for (type = POSIXA; type >= POSIXD; type--) { 11903 int posix_class; 11904 11905 if (type == POSIXL) { /* But not /l posix classes */ 11906 continue; 11907 } 11908 11909 for (posix_class = 0; 11910 posix_class <= HIGHEST_REGCOMP_DOT_H_SYNC_; 11911 posix_class++) 11912 { 11913 SV** our_code_points = &cp_list; 11914 SV** official_code_points; 11915 int try_inverted; 11916 11917 if (type == POSIXA) { 11918 official_code_points = &PL_Posix_ptrs[posix_class]; 11919 } 11920 else { 11921 official_code_points = &PL_XPosix_ptrs[posix_class]; 11922 } 11923 11924 /* Skip non-existent classes of this type. e.g. \v only has an 11925 * entry in PL_XPosix_ptrs */ 11926 if (! *official_code_points) { 11927 continue; 11928 } 11929 11930 /* Try both the regular class, and its inversion */ 11931 for (try_inverted = 0; try_inverted < 2; try_inverted++) { 11932 bool this_inverted = *invert ^ try_inverted; 11933 11934 if (type != POSIXD) { 11935 11936 /* This class that isn't /d can't match if we have /d 11937 * dependencies */ 11938 if (has_runtime_dependency 11939 & HAS_D_RUNTIME_DEPENDENCY) 11940 { 11941 continue; 11942 } 11943 } 11944 else /* is /d */ if (! this_inverted) { 11945 11946 /* /d classes don't match anything non-ASCII below 256 11947 * unconditionally (which cp_list contains) */ 11948 _invlist_intersection(cp_list, PL_UpperLatin1, 11949 &intersection); 11950 if (_invlist_len(intersection) != 0) { 11951 continue; 11952 } 11953 11954 SvREFCNT_dec(d_invlist); 11955 d_invlist = invlist_clone(cp_list, NULL); 11956 11957 /* But under UTF-8 it turns into using /u rules. Add 11958 * the things it matches under these conditions so that 11959 * we check below that these are identical to what the 11960 * tested class should match */ 11961 if (upper_latin1_only_utf8_matches) { 11962 _invlist_union( 11963 d_invlist, 11964 upper_latin1_only_utf8_matches, 11965 &d_invlist); 11966 } 11967 our_code_points = &d_invlist; 11968 } 11969 else { /* POSIXD, inverted. If this doesn't have this 11970 flag set, it isn't /d. */ 11971 if (! ( *anyof_flags 11972 & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared)) 11973 { 11974 continue; 11975 } 11976 11977 our_code_points = &cp_list; 11978 } 11979 11980 /* Here, have weeded out some things. We want to see if 11981 * the list of characters this node contains 11982 * ('*our_code_points') precisely matches those of the 11983 * class we are currently checking against 11984 * ('*official_code_points'). */ 11985 if (_invlistEQ(*our_code_points, 11986 *official_code_points, 11987 try_inverted)) 11988 { 11989 /* Here, they precisely match. Optimize this ANYOF 11990 * node into its equivalent POSIX one of the correct 11991 * type, possibly inverted. 11992 * 11993 * Some of these nodes match a single range of 11994 * characters (or [:alpha:] matches two parallel ranges 11995 * on ASCII platforms). The array lookup at execution 11996 * time could be replaced by a range check for such 11997 * nodes. But regnodes are a finite resource, and the 11998 * possible performance boost isn't large, so this 11999 * hasn't been done. An attempt to use just one node 12000 * (and its inverse) to encompass all such cases was 12001 * made in d62feba66bf43f35d092bb026694f927e9f94d38. 12002 * But the shifting/masking it used ended up being 12003 * slower than the array look up, so it was reverted */ 12004 op = (try_inverted) 12005 ? type + NPOSIXA - POSIXA 12006 : type; 12007 *ret = reg_node(pRExC_state, op); 12008 FLAGS(REGNODE_p(*ret)) = posix_class; 12009 SvREFCNT_dec(d_invlist); 12010 SvREFCNT_dec(intersection); 12011 return op; 12012 } 12013 } 12014 } 12015 } 12016 SvREFCNT_dec(d_invlist); 12017 SvREFCNT_dec(intersection); 12018 } 12019 12020 /* If it is a single contiguous range, ANYOFR is an efficient regnode, both 12021 * in size and speed. Currently, a 20 bit range base (smallest code point 12022 * in the range), and a 12 bit maximum delta are packed into a 32 bit word. 12023 * This allows for using it on all of the Unicode code points except for 12024 * the highest plane, which is only for private use code points. khw 12025 * doubts that a bigger delta is likely in real world applications */ 12026 if ( single_range 12027 && ! has_runtime_dependency 12028 && *anyof_flags == 0 12029 && start[0] < (1 << ANYOFR_BASE_BITS) 12030 && end[0] - start[0] 12031 < ((1U << (sizeof(ARG1u_LOC(NULL)) 12032 * CHARBITS - ANYOFR_BASE_BITS)))) 12033 12034 { 12035 U8 low_utf8[UTF8_MAXBYTES+1]; 12036 U8 high_utf8[UTF8_MAXBYTES+1]; 12037 12038 op = ANYOFR; 12039 *ret = reg1node(pRExC_state, op, 12040 (start[0] | (end[0] - start[0]) << ANYOFR_BASE_BITS)); 12041 12042 /* Place the lowest UTF-8 start byte in the flags field, so as to allow 12043 * efficient ruling out at run time of many possible inputs. */ 12044 (void) uvchr_to_utf8(low_utf8, start[0]); 12045 (void) uvchr_to_utf8(high_utf8, end[0]); 12046 12047 /* If all code points share the same first byte, this can be an 12048 * ANYOFRb. Otherwise store the lowest UTF-8 start byte which can 12049 * quickly rule out many inputs at run-time without having to compute 12050 * the code point from UTF-8. For EBCDIC, we use I8, as not doing that 12051 * transformation would not rule out nearly so many things */ 12052 if (low_utf8[0] == high_utf8[0]) { 12053 op = ANYOFRb; 12054 OP(REGNODE_p(*ret)) = op; 12055 ANYOF_FLAGS(REGNODE_p(*ret)) = low_utf8[0]; 12056 } 12057 else { 12058 ANYOF_FLAGS(REGNODE_p(*ret)) = NATIVE_UTF8_TO_I8(low_utf8[0]); 12059 } 12060 12061 return op; 12062 } 12063 12064 /* If didn't find an optimization and there is no need for a bitmap, 12065 * of the lowest code points, optimize to indicate that */ 12066 if ( lowest_cp >= NUM_ANYOF_CODE_POINTS 12067 && ! LOC 12068 && ! upper_latin1_only_utf8_matches 12069 && *anyof_flags == 0) 12070 { 12071 U8 low_utf8[UTF8_MAXBYTES+1]; 12072 UV highest_cp = invlist_highest(cp_list); 12073 12074 /* Currently the maximum allowed code point by the system is IV_MAX. 12075 * Higher ones are reserved for future internal use. This particular 12076 * regnode can be used for higher ones, but we can't calculate the code 12077 * point of those. IV_MAX suffices though, as it will be a large first 12078 * byte */ 12079 Size_t low_len = uvchr_to_utf8(low_utf8, MIN(lowest_cp, IV_MAX)) 12080 - low_utf8; 12081 12082 /* We store the lowest possible first byte of the UTF-8 representation, 12083 * using the flags field. This allows for quick ruling out of some 12084 * inputs without having to convert from UTF-8 to code point. For 12085 * EBCDIC, we use I8, as not doing that transformation would not rule 12086 * out nearly so many things */ 12087 *anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]); 12088 12089 op = ANYOFH; 12090 12091 /* If the first UTF-8 start byte for the highest code point in the 12092 * range is suitably small, we may be able to get an upper bound as 12093 * well */ 12094 if (highest_cp <= IV_MAX) { 12095 U8 high_utf8[UTF8_MAXBYTES+1]; 12096 Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp) - high_utf8; 12097 12098 /* If the lowest and highest are the same, we can get an exact 12099 * first byte instead of a just minimum or even a sequence of exact 12100 * leading bytes. We signal these with different regnodes */ 12101 if (low_utf8[0] == high_utf8[0]) { 12102 Size_t len = find_first_differing_byte_pos(low_utf8, 12103 high_utf8, 12104 MIN(low_len, high_len)); 12105 if (len == 1) { 12106 12107 /* No need to convert to I8 for EBCDIC as this is an exact 12108 * match */ 12109 *anyof_flags = low_utf8[0]; 12110 12111 if (high_len == 2) { 12112 /* If the elements matched all have a 2-byte UTF-8 12113 * representation, with the first byte being the same, 12114 * we can use a compact, fast regnode. capable of 12115 * matching any combination of continuation byte 12116 * patterns. 12117 * 12118 * (A similar regnode could be created for the Latin1 12119 * range; the complication being that it could match 12120 * non-UTF8 targets. The internal bitmap would serve 12121 * both cases; with some extra code in regexec.c) */ 12122 op = ANYOFHbbm; 12123 *ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op)); 12124 FILL_NODE(*ret, op); 12125 FIRST_BYTE((struct regnode_bbm *) REGNODE_p(*ret)) = low_utf8[0], 12126 12127 /* The 64 bit (or 32 on EBCCDIC) map can be looked up 12128 * directly based on the continuation byte, without 12129 * needing to convert to code point */ 12130 populate_bitmap_from_invlist( 12131 cp_list, 12132 12133 /* The base code point is from the start byte */ 12134 TWO_BYTE_UTF8_TO_NATIVE(low_utf8[0], 12135 UTF_CONTINUATION_MARK | 0), 12136 12137 ((struct regnode_bbm *) REGNODE_p(*ret))->bitmap, 12138 REGNODE_BBM_BITMAP_LEN); 12139 RExC_emit += NODE_STEP_REGNODE + REGNODE_ARG_LEN(op); 12140 return op; 12141 } 12142 else { 12143 op = ANYOFHb; 12144 } 12145 } 12146 else { 12147 op = ANYOFHs; 12148 *ret = REGNODE_GUTS(pRExC_state, op, 12149 REGNODE_ARG_LEN(op) + STR_SZ(len)); 12150 FILL_NODE(*ret, op); 12151 STR_LEN_U8((struct regnode_anyofhs *) REGNODE_p(*ret)) 12152 = len; 12153 Copy(low_utf8, /* Add the common bytes */ 12154 ((struct regnode_anyofhs *) REGNODE_p(*ret))->string, 12155 len, U8); 12156 RExC_emit = REGNODE_OFFSET(REGNODE_AFTER_varies(REGNODE_p(*ret))); 12157 set_ANYOF_arg(pRExC_state, REGNODE_p(*ret), cp_list, 12158 NULL, only_utf8_locale_list); 12159 return op; 12160 } 12161 } 12162 else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE) { 12163 12164 /* Here, the high byte is not the same as the low, but is small 12165 * enough that its reasonable to have a loose upper bound, 12166 * which is packed in with the strict lower bound. See 12167 * comments at the definition of MAX_ANYOF_HRx_BYTE. On EBCDIC 12168 * platforms, I8 is used. On ASCII platforms I8 is the same 12169 * thing as UTF-8 */ 12170 12171 U8 bits = 0; 12172 U8 max_range_diff = MAX_ANYOF_HRx_BYTE - *anyof_flags; 12173 U8 range_diff = NATIVE_UTF8_TO_I8(high_utf8[0]) 12174 - *anyof_flags; 12175 12176 if (range_diff <= max_range_diff / 8) { 12177 bits = 3; 12178 } 12179 else if (range_diff <= max_range_diff / 4) { 12180 bits = 2; 12181 } 12182 else if (range_diff <= max_range_diff / 2) { 12183 bits = 1; 12184 } 12185 *anyof_flags = (*anyof_flags - 0xC0) << 2 | bits; 12186 op = ANYOFHr; 12187 } 12188 } 12189 } 12190 12191 return op; 12192 12193 return_OPFAIL: 12194 op = OPFAIL; 12195 *flagp &= ~(SIMPLE|HASWIDTH); 12196 *ret = reg1node(pRExC_state, op, 0); 12197 return op; 12198 12199 return_SANY: 12200 op = SANY; 12201 *ret = reg_node(pRExC_state, op); 12202 MARK_NAUGHTY(1); 12203 return op; 12204 } 12205 12206 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION 12207 12208 #ifdef PERL_RE_BUILD_AUX 12209 void 12210 Perl_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, 12211 regnode* const node, 12212 SV* const cp_list, 12213 SV* const runtime_defns, 12214 SV* const only_utf8_locale_list) 12215 { 12216 /* Sets the arg field of an ANYOF-type node 'node', using information about 12217 * the node passed-in. If only the bitmap is needed to determine what 12218 * matches, the arg is set appropriately to either 12219 * 1) ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE 12220 * 2) ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE 12221 * 12222 * Otherwise, it sets the argument to the count returned by reg_add_data(), 12223 * having allocated and stored an array, av, as follows: 12224 * av[0] stores the inversion list defining this class as far as known at 12225 * this time, or PL_sv_undef if nothing definite is now known. 12226 * av[1] stores the inversion list of code points that match only if the 12227 * current locale is UTF-8, or if none, PL_sv_undef if there is an 12228 * av[2], or no entry otherwise. 12229 * av[2] stores the list of user-defined properties whose subroutine 12230 * definitions aren't known at this time, or no entry if none. */ 12231 12232 UV n; 12233 12234 PERL_ARGS_ASSERT_SET_ANYOF_ARG; 12235 12236 /* If this is set, the final disposition won't be known until runtime, so 12237 * we can't do any of the compile time optimizations */ 12238 if (! runtime_defns) { 12239 12240 /* On plain ANYOF nodes without the possibility of a runtime locale 12241 * making a difference, maybe there's no information to be gleaned 12242 * except for what's in the bitmap */ 12243 if (REGNODE_TYPE(OP(node)) == ANYOF && ! only_utf8_locale_list) { 12244 12245 /* There are two such cases: 12246 * 1) there is no list of code points matched outside the bitmap 12247 */ 12248 if (! cp_list) { 12249 ARG1u_SET(node, ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE); 12250 return; 12251 } 12252 12253 /* 2) the list indicates everything outside the bitmap matches */ 12254 if ( invlist_highest(cp_list) == UV_MAX 12255 && invlist_highest_range_start(cp_list) 12256 <= NUM_ANYOF_CODE_POINTS) 12257 { 12258 ARG1u_SET(node, ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE); 12259 return; 12260 } 12261 12262 /* In all other cases there are things outside the bitmap that we 12263 * may need to check at runtime. */ 12264 } 12265 12266 /* Here, we have resolved all the possible run-time matches, and they 12267 * are stored in one or both of two possible lists. (While some match 12268 * only under certain runtime circumstances, we know all the possible 12269 * ones for each such circumstance.) 12270 * 12271 * It may very well be that the pattern being compiled contains an 12272 * identical class, already encountered. Reusing that class here saves 12273 * space. Look through all classes so far encountered. */ 12274 U32 existing_items = RExC_rxi->data ? RExC_rxi->data->count : 0; 12275 for (unsigned int i = 0; i < existing_items; i++) { 12276 12277 /* Only look at auxiliary data of this type */ 12278 if (RExC_rxi->data->what[i] != 's') { 12279 continue; 12280 } 12281 12282 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[i]); 12283 AV * const av = MUTABLE_AV(SvRV(rv)); 12284 12285 /* If the already encountered class has data that won't be known 12286 * until runtime (stored in the final element of the array), we 12287 * can't share */ 12288 if (av_top_index(av) > ONLY_LOCALE_MATCHES_INDEX) { 12289 continue; 12290 } 12291 12292 SV ** stored_cp_list_ptr = av_fetch(av, INVLIST_INDEX, 12293 false /* no lvalue */); 12294 12295 /* The new and the existing one both have to have or both not 12296 * have this element, for this one to duplicate that one */ 12297 if (cBOOL(cp_list) != cBOOL(stored_cp_list_ptr)) { 12298 continue; 12299 } 12300 12301 /* If the inversion lists aren't equivalent, can't share */ 12302 if (cp_list && ! _invlistEQ(cp_list, 12303 *stored_cp_list_ptr, 12304 FALSE /* don't complement */)) 12305 { 12306 continue; 12307 } 12308 12309 /* Similarly for the other list */ 12310 SV ** stored_only_utf8_locale_list_ptr = av_fetch( 12311 av, 12312 ONLY_LOCALE_MATCHES_INDEX, 12313 false /* no lvalue */); 12314 if ( cBOOL(only_utf8_locale_list) 12315 != cBOOL(stored_only_utf8_locale_list_ptr)) 12316 { 12317 continue; 12318 } 12319 12320 if (only_utf8_locale_list && ! _invlistEQ( 12321 only_utf8_locale_list, 12322 *stored_only_utf8_locale_list_ptr, 12323 FALSE /* don't complement */)) 12324 { 12325 continue; 12326 } 12327 12328 /* Here, the existence and contents of both compile-time lists 12329 * are identical between the new and existing data. Re-use the 12330 * existing one */ 12331 ARG1u_SET(node, i); 12332 return; 12333 } /* end of loop through existing classes */ 12334 } 12335 12336 /* Here, we need to create a new auxiliary data element; either because 12337 * this doesn't duplicate an existing one, or we can't tell at this time if 12338 * it eventually will */ 12339 12340 AV * const av = newAV(); 12341 SV *rv; 12342 12343 if (cp_list) { 12344 av_store_simple(av, INVLIST_INDEX, SvREFCNT_inc_NN(cp_list)); 12345 } 12346 12347 /* (Note that if any of this changes, the size calculations in 12348 * S_optimize_regclass() might need to be updated.) */ 12349 12350 if (only_utf8_locale_list) { 12351 av_store_simple(av, ONLY_LOCALE_MATCHES_INDEX, 12352 SvREFCNT_inc_NN(only_utf8_locale_list)); 12353 } 12354 12355 if (runtime_defns) { 12356 av_store_simple(av, DEFERRED_USER_DEFINED_INDEX, 12357 SvREFCNT_inc_NN(runtime_defns)); 12358 } 12359 12360 rv = newRV_noinc(MUTABLE_SV(av)); 12361 n = reg_add_data(pRExC_state, STR_WITH_LEN("s")); 12362 RExC_rxi->data->data[n] = (void*)rv; 12363 ARG1u_SET(node, n); 12364 } 12365 #endif /* PERL_RE_BUILD_AUX */ 12366 12367 SV * 12368 12369 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) 12370 Perl_get_regclass_aux_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist) 12371 #else 12372 Perl_get_re_gclass_aux_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist) 12373 #endif 12374 12375 { 12376 /* For internal core use only. 12377 * Returns the inversion list for the input 'node' in the regex 'prog'. 12378 * If <doinit> is 'true', will attempt to create the inversion list if not 12379 * already done. If it is created, it will add to the normal inversion 12380 * list any that comes from user-defined properties. It croaks if this 12381 * is called before such a list is ready to be generated, that is when a 12382 * user-defined property has been declared, buyt still not yet defined. 12383 * If <listsvp> is non-null, will return the printable contents of the 12384 * property definition. This can be used to get debugging information 12385 * even before the inversion list exists, by calling this function with 12386 * 'doinit' set to false, in which case the components that will be used 12387 * to eventually create the inversion list are returned (in a printable 12388 * form). 12389 * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to 12390 * store an inversion list of code points that should match only if the 12391 * execution-time locale is a UTF-8 one. 12392 * If <output_invlist> is not NULL, it is where this routine is to store an 12393 * inversion list of the code points that would be instead returned in 12394 * <listsvp> if this were NULL. Thus, what gets output in <listsvp> 12395 * when this parameter is used, is just the non-code point data that 12396 * will go into creating the inversion list. This currently should be just 12397 * user-defined properties whose definitions were not known at compile 12398 * time. Using this parameter allows for easier manipulation of the 12399 * inversion list's data by the caller. It is illegal to call this 12400 * function with this parameter set, but not <listsvp> 12401 * 12402 * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note 12403 * that, in spite of this function's name, the inversion list it returns 12404 * may include the bitmap data as well */ 12405 12406 SV *si = NULL; /* Input initialization string */ 12407 SV* invlist = NULL; 12408 12409 RXi_GET_DECL_NULL(prog, progi); 12410 const struct reg_data * const data = prog ? progi->data : NULL; 12411 12412 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) 12413 PERL_ARGS_ASSERT_GET_REGCLASS_AUX_DATA; 12414 #else 12415 PERL_ARGS_ASSERT_GET_RE_GCLASS_AUX_DATA; 12416 #endif 12417 assert(! output_invlist || listsvp); 12418 12419 if (data && data->count) { 12420 const U32 n = ARG1u(node); 12421 12422 if (data->what[n] == 's') { 12423 SV * const rv = MUTABLE_SV(data->data[n]); 12424 AV * const av = MUTABLE_AV(SvRV(rv)); 12425 SV **const ary = AvARRAY(av); 12426 12427 invlist = ary[INVLIST_INDEX]; 12428 12429 if (av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) { 12430 *only_utf8_locale_ptr = ary[ONLY_LOCALE_MATCHES_INDEX]; 12431 } 12432 12433 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) { 12434 si = ary[DEFERRED_USER_DEFINED_INDEX]; 12435 } 12436 12437 if (doinit && (si || invlist)) { 12438 if (si) { 12439 bool user_defined; 12440 SV * msg = newSVpvs_flags("", SVs_TEMP); 12441 12442 SV * prop_definition = handle_user_defined_property( 12443 "", 0, FALSE, /* There is no \p{}, \P{} */ 12444 SvPVX_const(si)[1] - '0', /* /i or not has been 12445 stored here for just 12446 this occasion */ 12447 TRUE, /* run time */ 12448 FALSE, /* This call must find the defn */ 12449 si, /* The property definition */ 12450 &user_defined, 12451 msg, 12452 0 /* base level call */ 12453 ); 12454 12455 if (SvCUR(msg)) { 12456 assert(prop_definition == NULL); 12457 12458 Perl_croak(aTHX_ "%" UTF8f, 12459 UTF8fARG(SvUTF8(msg), SvCUR(msg), SvPVX(msg))); 12460 } 12461 12462 if (invlist) { 12463 _invlist_union(invlist, prop_definition, &invlist); 12464 SvREFCNT_dec_NN(prop_definition); 12465 } 12466 else { 12467 invlist = prop_definition; 12468 } 12469 12470 STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX); 12471 STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX); 12472 12473 ary[INVLIST_INDEX] = invlist; 12474 av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX]) 12475 ? ONLY_LOCALE_MATCHES_INDEX 12476 : INVLIST_INDEX); 12477 si = NULL; 12478 } 12479 } 12480 } 12481 } 12482 12483 /* If requested, return a printable version of what this ANYOF node matches 12484 * */ 12485 if (listsvp) { 12486 SV* matches_string = NULL; 12487 12488 /* This function can be called at compile-time, before everything gets 12489 * resolved, in which case we return the currently best available 12490 * information, which is the string that will eventually be used to do 12491 * that resolving, 'si' */ 12492 if (si) { 12493 /* Here, we only have 'si' (and possibly some passed-in data in 12494 * 'invlist', which is handled below) If the caller only wants 12495 * 'si', use that. */ 12496 if (! output_invlist) { 12497 matches_string = newSVsv(si); 12498 } 12499 else { 12500 /* But if the caller wants an inversion list of the node, we 12501 * need to parse 'si' and place as much as possible in the 12502 * desired output inversion list, making 'matches_string' only 12503 * contain the currently unresolvable things */ 12504 const char *si_string = SvPVX(si); 12505 STRLEN remaining = SvCUR(si); 12506 UV prev_cp = 0; 12507 U8 count = 0; 12508 12509 /* Ignore everything before and including the first new-line */ 12510 si_string = (const char *) memchr(si_string, '\n', SvCUR(si)); 12511 assert (si_string != NULL); 12512 si_string++; 12513 remaining = SvPVX(si) + SvCUR(si) - si_string; 12514 12515 while (remaining > 0) { 12516 12517 /* The data consists of just strings defining user-defined 12518 * property names, but in prior incarnations, and perhaps 12519 * somehow from pluggable regex engines, it could still 12520 * hold hex code point definitions, all of which should be 12521 * legal (or it wouldn't have gotten this far). Each 12522 * component of a range would be separated by a tab, and 12523 * each range by a new-line. If these are found, instead 12524 * add them to the inversion list */ 12525 I32 grok_flags = PERL_SCAN_SILENT_ILLDIGIT 12526 |PERL_SCAN_SILENT_NON_PORTABLE; 12527 STRLEN len = remaining; 12528 UV cp = grok_hex(si_string, &len, &grok_flags, NULL); 12529 12530 /* If the hex decode routine found something, it should go 12531 * up to the next \n */ 12532 if ( *(si_string + len) == '\n') { 12533 if (count) { /* 2nd code point on line */ 12534 *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp); 12535 } 12536 else { 12537 *output_invlist = add_cp_to_invlist(*output_invlist, cp); 12538 } 12539 count = 0; 12540 goto prepare_for_next_iteration; 12541 } 12542 12543 /* If the hex decode was instead for the lower range limit, 12544 * save it, and go parse the upper range limit */ 12545 if (*(si_string + len) == '\t') { 12546 assert(count == 0); 12547 12548 prev_cp = cp; 12549 count = 1; 12550 prepare_for_next_iteration: 12551 si_string += len + 1; 12552 remaining -= len + 1; 12553 continue; 12554 } 12555 12556 /* Here, didn't find a legal hex number. Just add the text 12557 * from here up to the next \n, omitting any trailing 12558 * markers. */ 12559 12560 remaining -= len; 12561 len = strcspn(si_string, 12562 DEFERRED_COULD_BE_OFFICIAL_MARKERs "\n"); 12563 remaining -= len; 12564 if (matches_string) { 12565 sv_catpvn(matches_string, si_string, len); 12566 } 12567 else { 12568 matches_string = newSVpvn(si_string, len); 12569 } 12570 sv_catpvs(matches_string, " "); 12571 12572 si_string += len; 12573 if ( remaining 12574 && UCHARAT(si_string) 12575 == DEFERRED_COULD_BE_OFFICIAL_MARKERc) 12576 { 12577 si_string++; 12578 remaining--; 12579 } 12580 if (remaining && UCHARAT(si_string) == '\n') { 12581 si_string++; 12582 remaining--; 12583 } 12584 } /* end of loop through the text */ 12585 12586 assert(matches_string); 12587 if (SvCUR(matches_string)) { /* Get rid of trailing blank */ 12588 SvCUR_set(matches_string, SvCUR(matches_string) - 1); 12589 } 12590 } /* end of has an 'si' */ 12591 } 12592 12593 /* Add the stuff that's already known */ 12594 if (invlist) { 12595 12596 /* Again, if the caller doesn't want the output inversion list, put 12597 * everything in 'matches-string' */ 12598 if (! output_invlist) { 12599 if ( ! matches_string) { 12600 matches_string = newSVpvs("\n"); 12601 } 12602 sv_catsv(matches_string, invlist_contents(invlist, 12603 TRUE /* traditional style */ 12604 )); 12605 } 12606 else if (! *output_invlist) { 12607 *output_invlist = invlist_clone(invlist, NULL); 12608 } 12609 else { 12610 _invlist_union(*output_invlist, invlist, output_invlist); 12611 } 12612 } 12613 12614 *listsvp = matches_string; 12615 } 12616 12617 return invlist; 12618 } 12619 12620 /* reg_skipcomment() 12621 12622 Absorbs an /x style # comment from the input stream, 12623 returning a pointer to the first character beyond the comment, or if the 12624 comment terminates the pattern without anything following it, this returns 12625 one past the final character of the pattern (in other words, RExC_end) and 12626 sets the REG_RUN_ON_COMMENT_SEEN flag. 12627 12628 Note it's the callers responsibility to ensure that we are 12629 actually in /x mode 12630 12631 */ 12632 12633 PERL_STATIC_INLINE char* 12634 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p) 12635 { 12636 PERL_ARGS_ASSERT_REG_SKIPCOMMENT; 12637 12638 assert(*p == '#'); 12639 12640 while (p < RExC_end) { 12641 if (*(++p) == '\n') { 12642 return p+1; 12643 } 12644 } 12645 12646 /* we ran off the end of the pattern without ending the comment, so we have 12647 * to add an \n when wrapping */ 12648 RExC_seen |= REG_RUN_ON_COMMENT_SEEN; 12649 return p; 12650 } 12651 12652 STATIC void 12653 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state, 12654 char ** p, 12655 const bool force_to_xmod 12656 ) 12657 { 12658 /* If the text at the current parse position '*p' is a '(?#...)' comment, 12659 * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p' 12660 * is /x whitespace, advance '*p' so that on exit it points to the first 12661 * byte past all such white space and comments */ 12662 12663 const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED); 12664 12665 PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT; 12666 12667 assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p)); 12668 12669 for (;;) { 12670 if (RExC_end - (*p) >= 3 12671 && *(*p) == '(' 12672 && *(*p + 1) == '?' 12673 && *(*p + 2) == '#') 12674 { 12675 while (*(*p) != ')') { 12676 if ((*p) == RExC_end) 12677 FAIL("Sequence (?#... not terminated"); 12678 (*p)++; 12679 } 12680 (*p)++; 12681 continue; 12682 } 12683 12684 if (use_xmod) { 12685 const char * save_p = *p; 12686 while ((*p) < RExC_end) { 12687 STRLEN len; 12688 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) { 12689 (*p) += len; 12690 } 12691 else if (*(*p) == '#') { 12692 (*p) = reg_skipcomment(pRExC_state, (*p)); 12693 } 12694 else { 12695 break; 12696 } 12697 } 12698 if (*p != save_p) { 12699 continue; 12700 } 12701 } 12702 12703 break; 12704 } 12705 12706 return; 12707 } 12708 12709 /* nextchar() 12710 12711 Advances the parse position by one byte, unless that byte is the beginning 12712 of a '(?#...)' style comment, or is /x whitespace and /x is in effect. In 12713 those two cases, the parse position is advanced beyond all such comments and 12714 white space. 12715 12716 This is the UTF, (?#...), and /x friendly way of saying RExC_parse_inc_by(1). 12717 */ 12718 12719 STATIC void 12720 S_nextchar(pTHX_ RExC_state_t *pRExC_state) 12721 { 12722 PERL_ARGS_ASSERT_NEXTCHAR; 12723 12724 if (RExC_parse < RExC_end) { 12725 assert( ! UTF 12726 || UTF8_IS_INVARIANT(*RExC_parse) 12727 || UTF8_IS_START(*RExC_parse)); 12728 12729 RExC_parse_inc_safe(); 12730 12731 skip_to_be_ignored_text(pRExC_state, &RExC_parse, 12732 FALSE /* Don't force /x */ ); 12733 } 12734 } 12735 12736 STATIC void 12737 S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size) 12738 { 12739 /* 'size' is the delta number of smallest regnode equivalents to add or 12740 * subtract from the current memory allocated to the regex engine being 12741 * constructed. */ 12742 12743 PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE; 12744 12745 RExC_size += size; 12746 12747 Renewc(RExC_rxi, 12748 sizeof(regexp_internal) + (RExC_size + 1) * sizeof(regnode), 12749 /* +1 for REG_MAGIC */ 12750 char, 12751 regexp_internal); 12752 if ( RExC_rxi == NULL ) 12753 FAIL("Regexp out of space"); 12754 RXi_SET(RExC_rx, RExC_rxi); 12755 12756 RExC_emit_start = RExC_rxi->program; 12757 if (size > 0) { 12758 Zero(REGNODE_p(RExC_emit), size, regnode); 12759 } 12760 } 12761 12762 STATIC regnode_offset 12763 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const STRLEN extra_size) 12764 { 12765 /* Allocate a regnode that is (1 + extra_size) times as big as the 12766 * smallest regnode worth of space, and also aligns and increments 12767 * RExC_size appropriately. 12768 * 12769 * It returns the regnode's offset into the regex engine program */ 12770 12771 const regnode_offset ret = RExC_emit; 12772 12773 PERL_ARGS_ASSERT_REGNODE_GUTS; 12774 12775 SIZE_ALIGN(RExC_size); 12776 change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size); 12777 NODE_ALIGN_FILL(REGNODE_p(ret)); 12778 return(ret); 12779 } 12780 12781 #ifdef DEBUGGING 12782 12783 STATIC regnode_offset 12784 S_regnode_guts_debug(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size) { 12785 PERL_ARGS_ASSERT_REGNODE_GUTS_DEBUG; 12786 assert(extra_size >= REGNODE_ARG_LEN(op) || REGNODE_TYPE(op) == ANYOF); 12787 return S_regnode_guts(aTHX_ pRExC_state, extra_size); 12788 } 12789 12790 #endif 12791 12792 12793 12794 /* 12795 - reg_node - emit a node 12796 */ 12797 STATIC regnode_offset /* Location. */ 12798 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) 12799 { 12800 const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op)); 12801 regnode_offset ptr = ret; 12802 12803 PERL_ARGS_ASSERT_REG_NODE; 12804 12805 assert(REGNODE_ARG_LEN(op) == 0); 12806 12807 FILL_ADVANCE_NODE(ptr, op); 12808 RExC_emit = ptr; 12809 return(ret); 12810 } 12811 12812 /* 12813 - reg1node - emit a node with an argument 12814 */ 12815 STATIC regnode_offset /* Location. */ 12816 S_reg1node(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) 12817 { 12818 const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op)); 12819 regnode_offset ptr = ret; 12820 12821 PERL_ARGS_ASSERT_REG1NODE; 12822 12823 /* ANYOF are special cased to allow non-length 1 args */ 12824 assert(REGNODE_ARG_LEN(op) == 1); 12825 12826 FILL_ADVANCE_NODE_ARG1u(ptr, op, arg); 12827 RExC_emit = ptr; 12828 return(ret); 12829 } 12830 12831 /* 12832 - regpnode - emit a temporary node with a SV* argument 12833 */ 12834 STATIC regnode_offset /* Location. */ 12835 S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, SV * arg) 12836 { 12837 const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op)); 12838 regnode_offset ptr = ret; 12839 12840 PERL_ARGS_ASSERT_REGPNODE; 12841 12842 FILL_ADVANCE_NODE_ARGp(ptr, op, arg); 12843 RExC_emit = ptr; 12844 return(ret); 12845 } 12846 12847 STATIC regnode_offset 12848 S_reg2node(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2) 12849 { 12850 /* emit a node with U32 and I32 arguments */ 12851 12852 const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op)); 12853 regnode_offset ptr = ret; 12854 12855 PERL_ARGS_ASSERT_REG2NODE; 12856 12857 assert(REGNODE_ARG_LEN(op) == 2); 12858 12859 FILL_ADVANCE_NODE_2ui_ARG(ptr, op, arg1, arg2); 12860 RExC_emit = ptr; 12861 return(ret); 12862 } 12863 12864 /* 12865 - reginsert - insert an operator in front of already-emitted operand 12866 * 12867 * That means that on exit 'operand' is the offset of the newly inserted 12868 * operator, and the original operand has been relocated. 12869 * 12870 * IMPORTANT NOTE - it is the *callers* responsibility to correctly 12871 * set up NEXT_OFF() of the inserted node if needed. Something like this: 12872 * 12873 * reginsert(pRExC, OPFAIL, orig_emit, depth+1); 12874 * NEXT_OFF(REGNODE_p(orig_emit)) = REGNODE_ARG_LEN(OPFAIL) + NODE_STEP_REGNODE; 12875 * 12876 * ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well. 12877 */ 12878 STATIC void 12879 S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op, 12880 const regnode_offset operand, const U32 depth) 12881 { 12882 regnode *src; 12883 regnode *dst; 12884 regnode *place; 12885 const int offset = REGNODE_ARG_LEN((U8)op); 12886 const int size = NODE_STEP_REGNODE + offset; 12887 DECLARE_AND_GET_RE_DEBUG_FLAGS; 12888 12889 PERL_ARGS_ASSERT_REGINSERT; 12890 PERL_UNUSED_CONTEXT; 12891 PERL_UNUSED_ARG(depth); 12892 DEBUG_PARSE_FMT("inst"," - %s", REGNODE_NAME(op)); 12893 assert(!RExC_study_started); /* I believe we should never use reginsert once we have started 12894 studying. If this is wrong then we need to adjust RExC_recurse 12895 below like we do with RExC_open_parens/RExC_close_parens. */ 12896 change_engine_size(pRExC_state, (Ptrdiff_t) size); 12897 src = REGNODE_p(RExC_emit); 12898 RExC_emit += size; 12899 dst = REGNODE_p(RExC_emit); 12900 12901 /* If we are in a "count the parentheses" pass, the numbers are unreliable, 12902 * and [perl #133871] shows this can lead to problems, so skip this 12903 * realignment of parens until a later pass when they are reliable */ 12904 if (! IN_PARENS_PASS && RExC_open_parens) { 12905 int paren; 12906 /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/ 12907 /* remember that RExC_npar is rex->nparens + 1, 12908 * iow it is 1 more than the number of parens seen in 12909 * the pattern so far. */ 12910 for ( paren=0 ; paren < RExC_npar ; paren++ ) { 12911 /* note, RExC_open_parens[0] is the start of the 12912 * regex, it can't move. RExC_close_parens[0] is the end 12913 * of the regex, it *can* move. */ 12914 if ( paren && RExC_open_parens[paren] >= operand ) { 12915 /*DEBUG_PARSE_FMT("open"," - %d", size);*/ 12916 RExC_open_parens[paren] += size; 12917 } else { 12918 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/ 12919 } 12920 if ( RExC_close_parens[paren] >= operand ) { 12921 /*DEBUG_PARSE_FMT("close"," - %d", size);*/ 12922 RExC_close_parens[paren] += size; 12923 } else { 12924 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/ 12925 } 12926 } 12927 } 12928 if (RExC_end_op) 12929 RExC_end_op += size; 12930 12931 while (src > REGNODE_p(operand)) { 12932 StructCopy(--src, --dst, regnode); 12933 } 12934 12935 place = REGNODE_p(operand); /* Op node, where operand used to be. */ 12936 src = place + 1; /* NOT REGNODE_AFTER! */ 12937 FLAGS(place) = 0; 12938 FILL_NODE(operand, op); 12939 12940 /* Zero out any arguments in the new node */ 12941 Zero(src, offset, regnode); 12942 } 12943 12944 /* 12945 - regtail - set the next-pointer at the end of a node chain of p to val. If 12946 that value won't fit in the space available, instead returns FALSE. 12947 (Except asserts if we can't fit in the largest space the regex 12948 engine is designed for.) 12949 - SEE ALSO: regtail_study 12950 */ 12951 STATIC bool 12952 S_regtail(pTHX_ RExC_state_t * pRExC_state, 12953 const regnode_offset p, 12954 const regnode_offset val, 12955 const U32 depth) 12956 { 12957 regnode_offset scan; 12958 DECLARE_AND_GET_RE_DEBUG_FLAGS; 12959 12960 PERL_ARGS_ASSERT_REGTAIL; 12961 #ifndef DEBUGGING 12962 PERL_UNUSED_ARG(depth); 12963 #endif 12964 12965 /* The final node in the chain is the first one with a nonzero next pointer 12966 * */ 12967 scan = (regnode_offset) p; 12968 for (;;) { 12969 regnode * const temp = regnext(REGNODE_p(scan)); 12970 DEBUG_PARSE_r({ 12971 DEBUG_PARSE_MSG((scan==p ? "tail" : "")); 12972 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state); 12973 Perl_re_printf( aTHX_ "~ %s (%zu) %s %s\n", 12974 SvPV_nolen_const(RExC_mysv), scan, 12975 (temp == NULL ? "->" : ""), 12976 (temp == NULL ? REGNODE_NAME(OP(REGNODE_p(val))) : "") 12977 ); 12978 }); 12979 if (temp == NULL) 12980 break; 12981 scan = REGNODE_OFFSET(temp); 12982 } 12983 12984 /* Populate this node's next pointer */ 12985 assert(val >= scan); 12986 if (REGNODE_OFF_BY_ARG(OP(REGNODE_p(scan)))) { 12987 assert((UV) (val - scan) <= U32_MAX); 12988 ARG1u_SET(REGNODE_p(scan), val - scan); 12989 } 12990 else { 12991 if (val - scan > U16_MAX) { 12992 /* Populate this with something that won't loop and will likely 12993 * lead to a crash if the caller ignores the failure return, and 12994 * execution continues */ 12995 NEXT_OFF(REGNODE_p(scan)) = U16_MAX; 12996 return FALSE; 12997 } 12998 NEXT_OFF(REGNODE_p(scan)) = val - scan; 12999 } 13000 13001 return TRUE; 13002 } 13003 13004 #ifdef DEBUGGING 13005 /* 13006 - regtail_study - set the next-pointer at the end of a node chain of p to val. 13007 - Look for optimizable sequences at the same time. 13008 - currently only looks for EXACT chains. 13009 13010 This is experimental code. The idea is to use this routine to perform 13011 in place optimizations on branches and groups as they are constructed, 13012 with the long term intention of removing optimization from study_chunk so 13013 that it is purely analytical. 13014 13015 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used 13016 to control which is which. 13017 13018 This used to return a value that was ignored. It was a problem that it is 13019 #ifdef'd to be another function that didn't return a value. khw has changed it 13020 so both currently return a pass/fail return. 13021 13022 */ 13023 /* TODO: All four parms should be const */ 13024 13025 STATIC bool 13026 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p, 13027 const regnode_offset val, U32 depth) 13028 { 13029 regnode_offset scan; 13030 U8 exact = PSEUDO; 13031 #ifdef EXPERIMENTAL_INPLACESCAN 13032 I32 min = 0; 13033 #endif 13034 DECLARE_AND_GET_RE_DEBUG_FLAGS; 13035 13036 PERL_ARGS_ASSERT_REGTAIL_STUDY; 13037 13038 13039 /* Find last node. */ 13040 13041 scan = p; 13042 for (;;) { 13043 regnode * const temp = regnext(REGNODE_p(scan)); 13044 #ifdef EXPERIMENTAL_INPLACESCAN 13045 if (REGNODE_TYPE(OP(REGNODE_p(scan))) == EXACT) { 13046 bool unfolded_multi_char; /* Unexamined in this routine */ 13047 if (join_exact(pRExC_state, scan, &min, 13048 &unfolded_multi_char, 1, REGNODE_p(val), depth+1)) 13049 return TRUE; /* Was return EXACT */ 13050 } 13051 #endif 13052 if ( exact ) { 13053 if (REGNODE_TYPE(OP(REGNODE_p(scan))) == EXACT) { 13054 if (exact == PSEUDO ) 13055 exact= OP(REGNODE_p(scan)); 13056 else if (exact != OP(REGNODE_p(scan)) ) 13057 exact= 0; 13058 } 13059 else if (OP(REGNODE_p(scan)) != NOTHING) { 13060 exact= 0; 13061 } 13062 } 13063 DEBUG_PARSE_r({ 13064 DEBUG_PARSE_MSG((scan==p ? "tsdy" : "")); 13065 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state); 13066 Perl_re_printf( aTHX_ "~ %s (%zu) -> %s\n", 13067 SvPV_nolen_const(RExC_mysv), 13068 scan, 13069 REGNODE_NAME(exact)); 13070 }); 13071 if (temp == NULL) 13072 break; 13073 scan = REGNODE_OFFSET(temp); 13074 } 13075 DEBUG_PARSE_r({ 13076 DEBUG_PARSE_MSG(""); 13077 regprop(RExC_rx, RExC_mysv, REGNODE_p(val), NULL, pRExC_state); 13078 Perl_re_printf( aTHX_ 13079 "~ attach to %s (%" IVdf ") offset to %" IVdf "\n", 13080 SvPV_nolen_const(RExC_mysv), 13081 (IV)val, 13082 (IV)(val - scan) 13083 ); 13084 }); 13085 if (REGNODE_OFF_BY_ARG(OP(REGNODE_p(scan)))) { 13086 assert((UV) (val - scan) <= U32_MAX); 13087 ARG1u_SET(REGNODE_p(scan), val - scan); 13088 } 13089 else { 13090 if (val - scan > U16_MAX) { 13091 /* Populate this with something that won't loop and will likely 13092 * lead to a crash if the caller ignores the failure return, and 13093 * execution continues */ 13094 NEXT_OFF(REGNODE_p(scan)) = U16_MAX; 13095 return FALSE; 13096 } 13097 NEXT_OFF(REGNODE_p(scan)) = val - scan; 13098 } 13099 13100 return TRUE; /* Was 'return exact' */ 13101 } 13102 #endif 13103 13104 13105 #ifdef PERL_RE_BUILD_AUX 13106 SV* 13107 Perl_get_ANYOFM_contents(pTHX_ const regnode * n) { 13108 13109 /* Returns an inversion list of all the code points matched by the 13110 * ANYOFM/NANYOFM node 'n' */ 13111 13112 SV * cp_list = _new_invlist(-1); 13113 const U8 lowest = (U8) ARG1u(n); 13114 unsigned int i; 13115 U8 count = 0; 13116 U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)]; 13117 13118 PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS; 13119 13120 /* Starting with the lowest code point, any code point that ANDed with the 13121 * mask yields the lowest code point is in the set */ 13122 for (i = lowest; i <= 0xFF; i++) { 13123 if ((i & FLAGS(n)) == ARG1u(n)) { 13124 cp_list = add_cp_to_invlist(cp_list, i); 13125 count++; 13126 13127 /* We know how many code points (a power of two) that are in the 13128 * set. No use looking once we've got that number */ 13129 if (count >= needed) break; 13130 } 13131 } 13132 13133 if (OP(n) == NANYOFM) { 13134 _invlist_invert(cp_list); 13135 } 13136 return cp_list; 13137 } 13138 13139 SV * 13140 Perl_get_ANYOFHbbm_contents(pTHX_ const regnode * n) { 13141 PERL_ARGS_ASSERT_GET_ANYOFHBBM_CONTENTS; 13142 13143 SV * cp_list = NULL; 13144 populate_invlist_from_bitmap( 13145 ((struct regnode_bbm *) n)->bitmap, 13146 REGNODE_BBM_BITMAP_LEN * CHARBITS, 13147 &cp_list, 13148 13149 /* The base cp is from the start byte plus a zero continuation */ 13150 TWO_BYTE_UTF8_TO_NATIVE(FIRST_BYTE((struct regnode_bbm *) n), 13151 UTF_CONTINUATION_MARK | 0)); 13152 return cp_list; 13153 } 13154 #endif /* PERL_RE_BUILD_AUX */ 13155 13156 13157 SV * 13158 Perl_re_intuit_string(pTHX_ REGEXP * const r) 13159 { /* Assume that RE_INTUIT is set */ 13160 /* Returns an SV containing a string that must appear in the target for it 13161 * to match, or NULL if nothing is known that must match. 13162 * 13163 * CAUTION: the SV can be freed during execution of the regex engine */ 13164 13165 struct regexp *const prog = ReANY(r); 13166 DECLARE_AND_GET_RE_DEBUG_FLAGS; 13167 13168 PERL_ARGS_ASSERT_RE_INTUIT_STRING; 13169 PERL_UNUSED_CONTEXT; 13170 13171 DEBUG_COMPILE_r( 13172 { 13173 if (prog->maxlen > 0 && (prog->check_utf8 || prog->check_substr)) { 13174 const char * const s = SvPV_nolen_const(RX_UTF8(r) 13175 ? prog->check_utf8 : prog->check_substr); 13176 13177 if (!PL_colorset) reginitcolors(); 13178 Perl_re_printf( aTHX_ 13179 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n", 13180 PL_colors[4], 13181 RX_UTF8(r) ? "utf8 " : "", 13182 PL_colors[5], PL_colors[0], 13183 s, 13184 PL_colors[1], 13185 (strlen(s) > PL_dump_re_max_len ? "..." : "")); 13186 } 13187 } ); 13188 13189 /* use UTF8 check substring if regexp pattern itself is in UTF8 */ 13190 return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr; 13191 } 13192 13193 /* 13194 pregfree() 13195 13196 handles refcounting and freeing the perl core regexp structure. When 13197 it is necessary to actually free the structure the first thing it 13198 does is call the 'free' method of the regexp_engine associated to 13199 the regexp, allowing the handling of the void *pprivate; member 13200 first. (This routine is not overridable by extensions, which is why 13201 the extensions free is called first.) 13202 13203 See regdupe and regdupe_internal if you change anything here. 13204 */ 13205 #ifndef PERL_IN_XSUB_RE 13206 void 13207 Perl_pregfree(pTHX_ REGEXP *r) 13208 { 13209 SvREFCNT_dec(r); 13210 } 13211 13212 void 13213 Perl_pregfree2(pTHX_ REGEXP *rx) 13214 { 13215 struct regexp *const r = ReANY(rx); 13216 DECLARE_AND_GET_RE_DEBUG_FLAGS; 13217 13218 PERL_ARGS_ASSERT_PREGFREE2; 13219 13220 if (! r) 13221 return; 13222 13223 if (r->mother_re) { 13224 ReREFCNT_dec(r->mother_re); 13225 } else { 13226 CALLREGFREE_PVT(rx); /* free the private data */ 13227 SvREFCNT_dec(RXp_PAREN_NAMES(r)); 13228 } 13229 if (r->substrs) { 13230 int i; 13231 for (i = 0; i < 2; i++) { 13232 SvREFCNT_dec(r->substrs->data[i].substr); 13233 SvREFCNT_dec(r->substrs->data[i].utf8_substr); 13234 } 13235 Safefree(r->substrs); 13236 } 13237 RX_MATCH_COPY_FREE(rx); 13238 #ifdef PERL_ANY_COW 13239 SvREFCNT_dec(r->saved_copy); 13240 #endif 13241 Safefree(RXp_OFFSp(r)); 13242 if (r->logical_to_parno) { 13243 Safefree(r->logical_to_parno); 13244 Safefree(r->parno_to_logical); 13245 Safefree(r->parno_to_logical_next); 13246 } 13247 13248 SvREFCNT_dec(r->qr_anoncv); 13249 if (r->recurse_locinput) 13250 Safefree(r->recurse_locinput); 13251 } 13252 13253 13254 /* reg_temp_copy() 13255 13256 Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV, 13257 except that dsv will be created if NULL. 13258 13259 This function is used in two main ways. First to implement 13260 $r = qr/....; $s = $$r; 13261 13262 Secondly, it is used as a hacky workaround to the structural issue of 13263 match results 13264 being stored in the regexp structure which is in turn stored in 13265 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern 13266 could be PL_curpm in multiple contexts, and could require multiple 13267 result sets being associated with the pattern simultaneously, such 13268 as when doing a recursive match with (??{$qr}) 13269 13270 The solution is to make a lightweight copy of the regexp structure 13271 when a qr// is returned from the code executed by (??{$qr}) this 13272 lightweight copy doesn't actually own any of its data except for 13273 the starp/end and the actual regexp structure itself. 13274 13275 */ 13276 13277 13278 REGEXP * 13279 Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv) 13280 { 13281 struct regexp *drx; 13282 struct regexp *const srx = ReANY(ssv); 13283 const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV; 13284 13285 PERL_ARGS_ASSERT_REG_TEMP_COPY; 13286 13287 if (!dsv) 13288 dsv = (REGEXP*) newSV_type(SVt_REGEXP); 13289 else { 13290 assert(SvTYPE(dsv) == SVt_REGEXP || (SvTYPE(dsv) == SVt_PVLV)); 13291 13292 /* our only valid caller, sv_setsv_flags(), should have done 13293 * a SV_CHECK_THINKFIRST_COW_DROP() by now */ 13294 assert(!SvOOK(dsv)); 13295 assert(!SvIsCOW(dsv)); 13296 assert(!SvROK(dsv)); 13297 13298 if (SvPVX_const(dsv)) { 13299 if (SvLEN(dsv)) 13300 Safefree(SvPVX(dsv)); 13301 SvPVX(dsv) = NULL; 13302 } 13303 SvLEN_set(dsv, 0); 13304 SvCUR_set(dsv, 0); 13305 SvOK_off((SV *)dsv); 13306 13307 if (islv) { 13308 /* For PVLVs, the head (sv_any) points to an XPVLV, while 13309 * the LV's xpvlenu_rx will point to a regexp body, which 13310 * we allocate here */ 13311 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP); 13312 assert(!SvPVX(dsv)); 13313 /* We "steal" the body from the newly allocated SV temp, changing 13314 * the pointer in its HEAD to NULL. We then change its type to 13315 * SVt_NULL so that when we immediately release its only reference, 13316 * no memory deallocation happens. 13317 * 13318 * The body will eventually be freed (from the PVLV) either in 13319 * Perl_sv_force_normal_flags() (if the PVLV is "downgraded" and 13320 * the regexp body needs to be removed) 13321 * or in Perl_sv_clear() (if the PVLV still holds the pointer until 13322 * the PVLV itself is deallocated). */ 13323 ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any; 13324 temp->sv_any = NULL; 13325 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL; 13326 SvREFCNT_dec_NN(temp); 13327 /* SvCUR still resides in the xpvlv struct, so the regexp copy- 13328 ing below will not set it. */ 13329 SvCUR_set(dsv, SvCUR(ssv)); 13330 } 13331 } 13332 /* This ensures that SvTHINKFIRST(sv) is true, and hence that 13333 sv_force_normal(sv) is called. */ 13334 SvFAKE_on(dsv); 13335 drx = ReANY(dsv); 13336 13337 SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8); 13338 SvPV_set(dsv, RX_WRAPPED(ssv)); 13339 /* We share the same string buffer as the original regexp, on which we 13340 hold a reference count, incremented when mother_re is set below. 13341 The string pointer is copied here, being part of the regexp struct. 13342 */ 13343 memcpy(&(drx->xpv_cur), &(srx->xpv_cur), 13344 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur)); 13345 13346 if (!islv) 13347 SvLEN_set(dsv, 0); 13348 if (RXp_OFFSp(srx)) { 13349 const I32 npar = srx->nparens+1; 13350 NewCopy(RXp_OFFSp(srx), RXp_OFFSp(drx), npar, regexp_paren_pair); 13351 } 13352 if (srx->substrs) { 13353 int i; 13354 Newx(drx->substrs, 1, struct reg_substr_data); 13355 StructCopy(srx->substrs, drx->substrs, struct reg_substr_data); 13356 13357 for (i = 0; i < 2; i++) { 13358 SvREFCNT_inc_void(drx->substrs->data[i].substr); 13359 SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr); 13360 } 13361 13362 /* check_substr and check_utf8, if non-NULL, point to either their 13363 anchored or float namesakes, and don't hold a second reference. */ 13364 } 13365 if (srx->logical_to_parno) { 13366 NewCopy(srx->logical_to_parno, 13367 drx->logical_to_parno, 13368 srx->nparens+1, I32); 13369 NewCopy(srx->parno_to_logical, 13370 drx->parno_to_logical, 13371 srx->nparens+1, I32); 13372 NewCopy(srx->parno_to_logical_next, 13373 drx->parno_to_logical_next, 13374 srx->nparens+1, I32); 13375 } else { 13376 drx->logical_to_parno = NULL; 13377 drx->parno_to_logical = NULL; 13378 drx->parno_to_logical_next = NULL; 13379 } 13380 drx->logical_nparens = srx->logical_nparens; 13381 13382 RX_MATCH_COPIED_off(dsv); 13383 #ifdef PERL_ANY_COW 13384 RXp_SAVED_COPY(drx) = NULL; 13385 #endif 13386 drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv); 13387 SvREFCNT_inc_void(drx->qr_anoncv); 13388 if (srx->recurse_locinput) 13389 Newx(drx->recurse_locinput, srx->nparens + 1, char *); 13390 13391 return dsv; 13392 } 13393 #endif 13394 13395 13396 /* regfree_internal() 13397 13398 Free the private data in a regexp. This is overloadable by 13399 extensions. Perl takes care of the regexp structure in pregfree(), 13400 this covers the *pprivate pointer which technically perl doesn't 13401 know about, however of course we have to handle the 13402 regexp_internal structure when no extension is in use. 13403 13404 Note this is called before freeing anything in the regexp 13405 structure. 13406 */ 13407 13408 void 13409 Perl_regfree_internal(pTHX_ REGEXP * const rx) 13410 { 13411 struct regexp *const r = ReANY(rx); 13412 RXi_GET_DECL(r, ri); 13413 DECLARE_AND_GET_RE_DEBUG_FLAGS; 13414 13415 PERL_ARGS_ASSERT_REGFREE_INTERNAL; 13416 13417 if (! ri) { 13418 return; 13419 } 13420 13421 DEBUG_COMPILE_r({ 13422 if (!PL_colorset) 13423 reginitcolors(); 13424 { 13425 SV *dsv= sv_newmortal(); 13426 RE_PV_QUOTED_DECL(s, RX_UTF8(rx), 13427 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len); 13428 Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n", 13429 PL_colors[4], PL_colors[5], s); 13430 } 13431 }); 13432 13433 if (ri->code_blocks) 13434 S_free_codeblocks(aTHX_ ri->code_blocks); 13435 13436 if (ri->data) { 13437 int n = ri->data->count; 13438 13439 while (--n >= 0) { 13440 /* If you add a ->what type here, update the comment in regcomp.h */ 13441 switch (ri->data->what[n]) { 13442 case 'a': 13443 case 'r': 13444 case 's': 13445 case 'S': 13446 case 'u': 13447 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n])); 13448 break; 13449 case 'f': 13450 Safefree(ri->data->data[n]); 13451 break; 13452 case 'l': 13453 case 'L': 13454 break; 13455 case 'T': 13456 { /* Aho Corasick add-on structure for a trie node. 13457 Used in stclass optimization only */ 13458 U32 refcount; 13459 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n]; 13460 OP_REFCNT_LOCK; 13461 refcount = --aho->refcount; 13462 OP_REFCNT_UNLOCK; 13463 if ( !refcount ) { 13464 PerlMemShared_free(aho->states); 13465 PerlMemShared_free(aho->fail); 13466 /* do this last!!!! */ 13467 PerlMemShared_free(ri->data->data[n]); 13468 /* we should only ever get called once, so 13469 * assert as much, and also guard the free 13470 * which /might/ happen twice. At the least 13471 * it will make code anlyzers happy and it 13472 * doesn't cost much. - Yves */ 13473 assert(ri->regstclass); 13474 if (ri->regstclass) { 13475 PerlMemShared_free(ri->regstclass); 13476 ri->regstclass = 0; 13477 } 13478 } 13479 } 13480 break; 13481 case 't': 13482 { 13483 /* trie structure. */ 13484 U32 refcount; 13485 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n]; 13486 OP_REFCNT_LOCK; 13487 refcount = --trie->refcount; 13488 OP_REFCNT_UNLOCK; 13489 if ( !refcount ) { 13490 PerlMemShared_free(trie->charmap); 13491 PerlMemShared_free(trie->states); 13492 PerlMemShared_free(trie->trans); 13493 if (trie->bitmap) 13494 PerlMemShared_free(trie->bitmap); 13495 if (trie->jump) 13496 PerlMemShared_free(trie->jump); 13497 if (trie->j_before_paren) 13498 PerlMemShared_free(trie->j_before_paren); 13499 if (trie->j_after_paren) 13500 PerlMemShared_free(trie->j_after_paren); 13501 PerlMemShared_free(trie->wordinfo); 13502 /* do this last!!!! */ 13503 PerlMemShared_free(ri->data->data[n]); 13504 } 13505 } 13506 break; 13507 case '%': 13508 /* NO-OP a '%' data contains a null pointer, so that reg_add_data 13509 * always returns non-zero, this should only ever happen in the 13510 * 0 index */ 13511 assert(n==0); 13512 break; 13513 default: 13514 Perl_croak(aTHX_ "panic: regfree data code '%c'", 13515 ri->data->what[n]); 13516 } 13517 } 13518 Safefree(ri->data->what); 13519 Safefree(ri->data); 13520 } 13521 13522 Safefree(ri); 13523 } 13524 13525 #define SAVEPVN(p, n) ((p) ? savepvn(p, n) : NULL) 13526 13527 /* 13528 =for apidoc re_dup_guts 13529 Duplicate a regexp. 13530 13531 This routine is expected to clone a given regexp structure. It is only 13532 compiled under USE_ITHREADS. 13533 13534 After all of the core data stored in struct regexp is duplicated 13535 the C<regexp_engine.dupe> method is used to copy any private data 13536 stored in the *pprivate pointer. This allows extensions to handle 13537 any duplication they need to do. 13538 13539 =cut 13540 13541 See pregfree() and regfree_internal() if you change anything here. 13542 */ 13543 #if defined(USE_ITHREADS) 13544 #ifndef PERL_IN_XSUB_RE 13545 void 13546 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) 13547 { 13548 I32 npar; 13549 const struct regexp *r = ReANY(sstr); 13550 struct regexp *ret = ReANY(dstr); 13551 13552 PERL_ARGS_ASSERT_RE_DUP_GUTS; 13553 13554 npar = r->nparens+1; 13555 NewCopy(RXp_OFFSp(r), RXp_OFFSp(ret), npar, regexp_paren_pair); 13556 13557 if (ret->substrs) { 13558 /* Do it this way to avoid reading from *r after the StructCopy(). 13559 That way, if any of the sv_dup_inc()s dislodge *r from the L1 13560 cache, it doesn't matter. */ 13561 int i; 13562 const bool anchored = r->check_substr 13563 ? r->check_substr == r->substrs->data[0].substr 13564 : r->check_utf8 == r->substrs->data[0].utf8_substr; 13565 Newx(ret->substrs, 1, struct reg_substr_data); 13566 StructCopy(r->substrs, ret->substrs, struct reg_substr_data); 13567 13568 for (i = 0; i < 2; i++) { 13569 ret->substrs->data[i].substr = 13570 sv_dup_inc(ret->substrs->data[i].substr, param); 13571 ret->substrs->data[i].utf8_substr = 13572 sv_dup_inc(ret->substrs->data[i].utf8_substr, param); 13573 } 13574 13575 /* check_substr and check_utf8, if non-NULL, point to either their 13576 anchored or float namesakes, and don't hold a second reference. */ 13577 13578 if (ret->check_substr) { 13579 if (anchored) { 13580 assert(r->check_utf8 == r->substrs->data[0].utf8_substr); 13581 13582 ret->check_substr = ret->substrs->data[0].substr; 13583 ret->check_utf8 = ret->substrs->data[0].utf8_substr; 13584 } else { 13585 assert(r->check_substr == r->substrs->data[1].substr); 13586 assert(r->check_utf8 == r->substrs->data[1].utf8_substr); 13587 13588 ret->check_substr = ret->substrs->data[1].substr; 13589 ret->check_utf8 = ret->substrs->data[1].utf8_substr; 13590 } 13591 } else if (ret->check_utf8) { 13592 if (anchored) { 13593 ret->check_utf8 = ret->substrs->data[0].utf8_substr; 13594 } else { 13595 ret->check_utf8 = ret->substrs->data[1].utf8_substr; 13596 } 13597 } 13598 } 13599 13600 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param); 13601 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param)); 13602 if (r->recurse_locinput) 13603 Newx(ret->recurse_locinput, r->nparens + 1, char *); 13604 13605 if (ret->pprivate) 13606 RXi_SET(ret, CALLREGDUPE_PVT(dstr, param)); 13607 13608 if (RX_MATCH_COPIED(dstr)) 13609 RXp_SUBBEG(ret) = SAVEPVN(RXp_SUBBEG(ret), RXp_SUBLEN(ret)); 13610 else 13611 RXp_SUBBEG(ret) = NULL; 13612 #ifdef PERL_ANY_COW 13613 RXp_SAVED_COPY(ret) = NULL; 13614 #endif 13615 13616 if (r->logical_to_parno) { 13617 /* we use total_parens for all three just for symmetry */ 13618 ret->logical_to_parno = (I32*)SAVEPVN((char*)(r->logical_to_parno), (1+r->nparens) * sizeof(I32)); 13619 ret->parno_to_logical = (I32*)SAVEPVN((char*)(r->parno_to_logical), (1+r->nparens) * sizeof(I32)); 13620 ret->parno_to_logical_next = (I32*)SAVEPVN((char*)(r->parno_to_logical_next), (1+r->nparens) * sizeof(I32)); 13621 } else { 13622 ret->logical_to_parno = NULL; 13623 ret->parno_to_logical = NULL; 13624 ret->parno_to_logical_next = NULL; 13625 } 13626 13627 ret->logical_nparens = r->logical_nparens; 13628 13629 /* Whether mother_re be set or no, we need to copy the string. We 13630 cannot refrain from copying it when the storage points directly to 13631 our mother regexp, because that's 13632 1: a buffer in a different thread 13633 2: something we no longer hold a reference on 13634 so we need to copy it locally. */ 13635 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1); 13636 /* set malloced length to a non-zero value so it will be freed 13637 * (otherwise in combination with SVf_FAKE it looks like an alien 13638 * buffer). It doesn't have to be the actual malloced size, since it 13639 * should never be grown */ 13640 SvLEN_set(dstr, SvCUR(sstr)+1); 13641 ret->mother_re = NULL; 13642 } 13643 #endif /* PERL_IN_XSUB_RE */ 13644 13645 /* 13646 regdupe_internal() 13647 13648 This is the internal complement to regdupe() which is used to copy 13649 the structure pointed to by the *pprivate pointer in the regexp. 13650 This is the core version of the extension overridable cloning hook. 13651 The regexp structure being duplicated will be copied by perl prior 13652 to this and will be provided as the regexp *r argument, however 13653 with the /old/ structures pprivate pointer value. Thus this routine 13654 may override any copying normally done by perl. 13655 13656 It returns a pointer to the new regexp_internal structure. 13657 */ 13658 13659 void * 13660 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) 13661 { 13662 struct regexp *const r = ReANY(rx); 13663 regexp_internal *reti; 13664 int len; 13665 RXi_GET_DECL(r, ri); 13666 13667 PERL_ARGS_ASSERT_REGDUPE_INTERNAL; 13668 13669 len = ProgLen(ri); 13670 13671 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), 13672 char, regexp_internal); 13673 Copy(ri->program, reti->program, len+1, regnode); 13674 13675 13676 if (ri->code_blocks) { 13677 int n; 13678 Newx(reti->code_blocks, 1, struct reg_code_blocks); 13679 Newx(reti->code_blocks->cb, ri->code_blocks->count, 13680 struct reg_code_block); 13681 Copy(ri->code_blocks->cb, reti->code_blocks->cb, 13682 ri->code_blocks->count, struct reg_code_block); 13683 for (n = 0; n < ri->code_blocks->count; n++) 13684 reti->code_blocks->cb[n].src_regex = (REGEXP*) 13685 sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param); 13686 reti->code_blocks->count = ri->code_blocks->count; 13687 reti->code_blocks->refcnt = 1; 13688 } 13689 else 13690 reti->code_blocks = NULL; 13691 13692 reti->regstclass = NULL; 13693 13694 if (ri->data) { 13695 struct reg_data *d; 13696 const int count = ri->data->count; 13697 int i; 13698 13699 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *), 13700 char, struct reg_data); 13701 Newx(d->what, count, U8); 13702 13703 d->count = count; 13704 for (i = 0; i < count; i++) { 13705 d->what[i] = ri->data->what[i]; 13706 switch (d->what[i]) { 13707 /* see also regcomp.h and regfree_internal() */ 13708 case 'a': /* actually an AV, but the dup function is identical. 13709 values seem to be "plain sv's" generally. */ 13710 case 'r': /* a compiled regex (but still just another SV) */ 13711 case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code) 13712 this use case should go away, the code could have used 13713 'a' instead - see S_set_ANYOF_arg() for array contents. */ 13714 case 'S': /* actually an SV, but the dup function is identical. */ 13715 case 'u': /* actually an HV, but the dup function is identical. 13716 values are "plain sv's" */ 13717 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param); 13718 break; 13719 case 'f': 13720 /* Synthetic Start Class - "Fake" charclass we generate to optimize 13721 * patterns which could start with several different things. Pre-TRIE 13722 * this was more important than it is now, however this still helps 13723 * in some places, for instance /x?a+/ might produce a SSC equivalent 13724 * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass() 13725 * in regexec.c 13726 */ 13727 /* This is cheating. */ 13728 Newx(d->data[i], 1, regnode_ssc); 13729 StructCopy(ri->data->data[i], d->data[i], regnode_ssc); 13730 reti->regstclass = (regnode*)d->data[i]; 13731 break; 13732 case 'T': 13733 /* AHO-CORASICK fail table */ 13734 /* Trie stclasses are readonly and can thus be shared 13735 * without duplication. We free the stclass in pregfree 13736 * when the corresponding reg_ac_data struct is freed. 13737 */ 13738 reti->regstclass= ri->regstclass; 13739 /* FALLTHROUGH */ 13740 case 't': 13741 /* TRIE transition table */ 13742 OP_REFCNT_LOCK; 13743 ((reg_trie_data*)ri->data->data[i])->refcount++; 13744 OP_REFCNT_UNLOCK; 13745 /* FALLTHROUGH */ 13746 case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */ 13747 case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code 13748 is not from another regexp */ 13749 d->data[i] = ri->data->data[i]; 13750 break; 13751 case '%': 13752 /* this is a placeholder type, it exists purely so that 13753 * reg_add_data always returns a non-zero value, this type of 13754 * entry should ONLY be present in the 0 slot of the array */ 13755 assert(i == 0); 13756 d->data[i]= ri->data->data[i]; 13757 break; 13758 default: 13759 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'", 13760 ri->data->what[i]); 13761 } 13762 } 13763 13764 reti->data = d; 13765 } 13766 else 13767 reti->data = NULL; 13768 13769 if (ri->regstclass && !reti->regstclass) { 13770 /* Assume that the regstclass is a regnode which is inside of the 13771 * program which we have to copy over */ 13772 regnode *node= ri->regstclass; 13773 assert(node >= ri->program && (node - ri->program) < len); 13774 reti->regstclass = reti->program + (node - ri->program); 13775 } 13776 13777 13778 reti->name_list_idx = ri->name_list_idx; 13779 13780 SetProgLen(reti, len); 13781 13782 return (void*)reti; 13783 } 13784 13785 #endif /* USE_ITHREADS */ 13786 13787 STATIC void 13788 S_re_croak(pTHX_ bool utf8, const char* pat,...) 13789 { 13790 va_list args; 13791 STRLEN len = strlen(pat); 13792 char buf[512]; 13793 SV *msv; 13794 const char *message; 13795 13796 PERL_ARGS_ASSERT_RE_CROAK; 13797 13798 if (len > 510) 13799 len = 510; 13800 Copy(pat, buf, len , char); 13801 buf[len] = '\n'; 13802 buf[len + 1] = '\0'; 13803 va_start(args, pat); 13804 msv = vmess(buf, &args); 13805 va_end(args); 13806 message = SvPV_const(msv, len); 13807 if (len > 512) 13808 len = 512; 13809 Copy(message, buf, len , char); 13810 /* len-1 to avoid \n */ 13811 Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, len-1, buf)); 13812 } 13813 13814 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */ 13815 13816 #ifndef PERL_IN_XSUB_RE 13817 void 13818 Perl_save_re_context(pTHX) 13819 { 13820 I32 nparens = -1; 13821 I32 i; 13822 13823 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */ 13824 13825 if (PL_curpm) { 13826 const REGEXP * const rx = PM_GETRE(PL_curpm); 13827 if (rx) 13828 nparens = RX_NPARENS(rx); 13829 } 13830 13831 /* RT #124109. This is a complete hack; in the SWASHNEW case we know 13832 * that PL_curpm will be null, but that utf8.pm and the modules it 13833 * loads will only use $1..$3. 13834 * The t/porting/re_context.t test file checks this assumption. 13835 */ 13836 if (nparens == -1) 13837 nparens = 3; 13838 13839 for (i = 1; i <= nparens; i++) { 13840 char digits[TYPE_CHARS(long)]; 13841 const STRLEN len = my_snprintf(digits, sizeof(digits), 13842 "%lu", (long)i); 13843 GV *const *const gvp 13844 = (GV**)hv_fetch(PL_defstash, digits, len, 0); 13845 13846 if (gvp) { 13847 GV * const gv = *gvp; 13848 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv)) 13849 save_scalar(gv); 13850 } 13851 } 13852 } 13853 #endif 13854 13855 #ifndef PERL_IN_XSUB_RE 13856 13857 # include "uni_keywords.h" 13858 13859 void 13860 Perl_init_uniprops(pTHX) 13861 { 13862 13863 # ifdef DEBUGGING 13864 char * dump_len_string; 13865 13866 dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN"); 13867 if ( ! dump_len_string 13868 || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL)) 13869 { 13870 PL_dump_re_max_len = 60; /* A reasonable default */ 13871 } 13872 # endif 13873 13874 PL_user_def_props = newHV(); 13875 13876 # ifdef USE_ITHREADS 13877 13878 HvSHAREKEYS_off(PL_user_def_props); 13879 PL_user_def_props_aTHX = aTHX; 13880 13881 # endif 13882 13883 /* Set up the inversion list interpreter-level variables */ 13884 13885 PL_XPosix_ptrs[CC_ASCII_] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]); 13886 PL_XPosix_ptrs[CC_ALPHANUMERIC_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]); 13887 PL_XPosix_ptrs[CC_ALPHA_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALPHA]); 13888 PL_XPosix_ptrs[CC_BLANK_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXBLANK]); 13889 PL_XPosix_ptrs[CC_CASED_] = _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]); 13890 PL_XPosix_ptrs[CC_CNTRL_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXCNTRL]); 13891 PL_XPosix_ptrs[CC_DIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXDIGIT]); 13892 PL_XPosix_ptrs[CC_GRAPH_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXGRAPH]); 13893 PL_XPosix_ptrs[CC_LOWER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXLOWER]); 13894 PL_XPosix_ptrs[CC_PRINT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPRINT]); 13895 PL_XPosix_ptrs[CC_PUNCT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPUNCT]); 13896 PL_XPosix_ptrs[CC_SPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXSPACE]); 13897 PL_XPosix_ptrs[CC_UPPER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXUPPER]); 13898 PL_XPosix_ptrs[CC_VERTSPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]); 13899 PL_XPosix_ptrs[CC_WORDCHAR_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXWORD]); 13900 PL_XPosix_ptrs[CC_XDIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXXDIGIT]); 13901 13902 PL_Posix_ptrs[CC_ASCII_] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]); 13903 PL_Posix_ptrs[CC_ALPHANUMERIC_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALNUM]); 13904 PL_Posix_ptrs[CC_ALPHA_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALPHA]); 13905 PL_Posix_ptrs[CC_BLANK_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXBLANK]); 13906 PL_Posix_ptrs[CC_CASED_] = PL_Posix_ptrs[CC_ALPHA_]; 13907 PL_Posix_ptrs[CC_CNTRL_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXCNTRL]); 13908 PL_Posix_ptrs[CC_DIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXDIGIT]); 13909 PL_Posix_ptrs[CC_GRAPH_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXGRAPH]); 13910 PL_Posix_ptrs[CC_LOWER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXLOWER]); 13911 PL_Posix_ptrs[CC_PRINT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPRINT]); 13912 PL_Posix_ptrs[CC_PUNCT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPUNCT]); 13913 PL_Posix_ptrs[CC_SPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXSPACE]); 13914 PL_Posix_ptrs[CC_UPPER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXUPPER]); 13915 PL_Posix_ptrs[CC_VERTSPACE_] = NULL; 13916 PL_Posix_ptrs[CC_WORDCHAR_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXWORD]); 13917 PL_Posix_ptrs[CC_XDIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXXDIGIT]); 13918 13919 PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist); 13920 PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist); 13921 PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist); 13922 PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist); 13923 PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist); 13924 13925 PL_InBitmap = _new_invlist_C_array(InBitmap_invlist); 13926 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist); 13927 PL_Latin1 = _new_invlist_C_array(Latin1_invlist); 13928 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist); 13929 13930 PL_Assigned_invlist = _new_invlist_C_array(uni_prop_ptrs[UNI_ASSIGNED]); 13931 13932 PL_utf8_perl_idstart = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDSTART]); 13933 PL_utf8_perl_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDCONT]); 13934 13935 PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]); 13936 PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]); 13937 13938 PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]); 13939 PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[ 13940 UNI__PERL_FOLDS_TO_MULTI_CHAR]); 13941 PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[ 13942 UNI__PERL_IS_IN_MULTI_CHAR_FOLD]); 13943 PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist); 13944 PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist); 13945 PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist); 13946 PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist); 13947 PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist); 13948 PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist); 13949 PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]); 13950 PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist); 13951 PL_Private_Use = _new_invlist_C_array(uni_prop_ptrs[UNI_CO]); 13952 13953 # ifdef UNI_XIDC 13954 /* The below are used only by deprecated functions. They could be removed */ 13955 PL_utf8_xidcont = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]); 13956 PL_utf8_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]); 13957 PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]); 13958 # endif 13959 } 13960 13961 /* These four functions are compiled only in regcomp.c, where they have access 13962 * to the data they return. They are a way for re_comp.c to get access to that 13963 * data without having to compile the whole data structures. */ 13964 13965 I16 13966 Perl_do_uniprop_match(const char * const key, const U16 key_len) 13967 { 13968 PERL_ARGS_ASSERT_DO_UNIPROP_MATCH; 13969 13970 return match_uniprop((U8 *) key, key_len); 13971 } 13972 13973 SV * 13974 Perl_get_prop_definition(pTHX_ const int table_index) 13975 { 13976 PERL_ARGS_ASSERT_GET_PROP_DEFINITION; 13977 13978 /* Create and return the inversion list */ 13979 return _new_invlist_C_array(uni_prop_ptrs[table_index]); 13980 } 13981 13982 const char * const * 13983 Perl_get_prop_values(const int table_index) 13984 { 13985 PERL_ARGS_ASSERT_GET_PROP_VALUES; 13986 13987 return UNI_prop_value_ptrs[table_index]; 13988 } 13989 13990 const char * 13991 Perl_get_deprecated_property_msg(const Size_t warning_offset) 13992 { 13993 PERL_ARGS_ASSERT_GET_DEPRECATED_PROPERTY_MSG; 13994 13995 return deprecated_property_msgs[warning_offset]; 13996 } 13997 13998 # if 0 13999 14000 This code was mainly added for backcompat to give a warning for non-portable 14001 code points in user-defined properties. But experiments showed that the 14002 warning in earlier perls were only omitted on overflow, which should be an 14003 error, so there really isnt a backcompat issue, and actually adding the 14004 warning when none was present before might cause breakage, for little gain. So 14005 khw left this code in, but not enabled. Tests were never added. 14006 14007 embed.fnc entry: 14008 Ei |const char *|get_extended_utf8_msg|const UV cp 14009 14010 PERL_STATIC_INLINE const char * 14011 S_get_extended_utf8_msg(pTHX_ const UV cp) 14012 { 14013 U8 dummy[UTF8_MAXBYTES + 1]; 14014 HV *msgs; 14015 SV **msg; 14016 14017 uvchr_to_utf8_flags_msgs(dummy, cp, UNICODE_WARN_PERL_EXTENDED, 14018 &msgs); 14019 14020 msg = hv_fetchs(msgs, "text", 0); 14021 assert(msg); 14022 14023 (void) sv_2mortal((SV *) msgs); 14024 14025 return SvPVX(*msg); 14026 } 14027 14028 # endif 14029 #endif /* end of ! PERL_IN_XSUB_RE */ 14030 14031 STATIC REGEXP * 14032 S_compile_wildcard(pTHX_ const char * subpattern, const STRLEN len, 14033 const bool ignore_case) 14034 { 14035 /* Pretends that the input subpattern is qr/subpattern/aam, compiling it 14036 * possibly with /i if the 'ignore_case' parameter is true. Use /aa 14037 * because nothing outside of ASCII will match. Use /m because the input 14038 * string may be a bunch of lines strung together. 14039 * 14040 * Also sets up the debugging info */ 14041 14042 U32 flags = PMf_MULTILINE|PMf_WILDCARD; 14043 U32 rx_flags; 14044 SV * subpattern_sv = newSVpvn_flags(subpattern, len, SVs_TEMP); 14045 REGEXP * subpattern_re; 14046 DECLARE_AND_GET_RE_DEBUG_FLAGS; 14047 14048 PERL_ARGS_ASSERT_COMPILE_WILDCARD; 14049 14050 if (ignore_case) { 14051 flags |= PMf_FOLD; 14052 } 14053 set_regex_charset(&flags, REGEX_ASCII_MORE_RESTRICTED_CHARSET); 14054 14055 /* Like in op.c, we copy the compile time pm flags to the rx ones */ 14056 rx_flags = flags & RXf_PMf_COMPILETIME; 14057 14058 #ifndef PERL_IN_XSUB_RE 14059 /* Use the core engine if this file is regcomp.c. That means no 14060 * 'use re "Debug ..." is in effect, so the core engine is sufficient */ 14061 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL, 14062 &PL_core_reg_engine, 14063 NULL, NULL, 14064 rx_flags, flags); 14065 #else 14066 if (isDEBUG_WILDCARD) { 14067 /* Use the special debugging engine if this file is re_comp.c and wants 14068 * to output the wildcard matching. This uses whatever 14069 * 'use re "Debug ..." is in effect */ 14070 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL, 14071 &my_reg_engine, 14072 NULL, NULL, 14073 rx_flags, flags); 14074 } 14075 else { 14076 /* Use the special wildcard engine if this file is re_comp.c and 14077 * doesn't want to output the wildcard matching. This uses whatever 14078 * 'use re "Debug ..." is in effect for compilation, but this engine 14079 * structure has been set up so that it uses the core engine for 14080 * execution, so no execution debugging as a result of re.pm will be 14081 * displayed. */ 14082 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL, 14083 &wild_reg_engine, 14084 NULL, NULL, 14085 rx_flags, flags); 14086 /* XXX The above has the effect that any user-supplied regex engine 14087 * won't be called for matching wildcards. That might be good, or bad. 14088 * It could be changed in several ways. The reason it is done the 14089 * current way is to avoid having to save and restore 14090 * ^{^RE_DEBUG_FLAGS} around the execution. save_scalar() perhaps 14091 * could be used. Another suggestion is to keep the authoritative 14092 * value of the debug flags in a thread-local variable and add set/get 14093 * magic to ${^RE_DEBUG_FLAGS} to keep the C level variable up to date. 14094 * Still another is to pass a flag, say in the engine's intflags that 14095 * would be checked each time before doing the debug output */ 14096 } 14097 #endif 14098 14099 assert(subpattern_re); /* Should have died if didn't compile successfully */ 14100 return subpattern_re; 14101 } 14102 14103 STATIC I32 14104 S_execute_wildcard(pTHX_ REGEXP * const prog, char* stringarg, char *strend, 14105 char *strbeg, SSize_t minend, SV *screamer, U32 nosave) 14106 { 14107 I32 result; 14108 DECLARE_AND_GET_RE_DEBUG_FLAGS; 14109 14110 PERL_ARGS_ASSERT_EXECUTE_WILDCARD; 14111 14112 ENTER; 14113 14114 /* The compilation has set things up so that if the program doesn't want to 14115 * see the wildcard matching procedure, it will get the core execution 14116 * engine, which is subject only to -Dr. So we have to turn that off 14117 * around this procedure */ 14118 if (! isDEBUG_WILDCARD) { 14119 /* Note! Casts away 'volatile' */ 14120 SAVEI32(PL_debug); 14121 PL_debug &= ~ DEBUG_r_FLAG; 14122 } 14123 14124 result = CALLREGEXEC(prog, stringarg, strend, strbeg, minend, screamer, 14125 NULL, nosave); 14126 LEAVE; 14127 14128 return result; 14129 } 14130 14131 SV * 14132 S_handle_user_defined_property(pTHX_ 14133 14134 /* Parses the contents of a user-defined property definition; returning the 14135 * expanded definition if possible. If so, the return is an inversion 14136 * list. 14137 * 14138 * If there are subroutines that are part of the expansion and which aren't 14139 * known at the time of the call to this function, this returns what 14140 * parse_uniprop_string() returned for the first one encountered. 14141 * 14142 * If an error was found, NULL is returned, and 'msg' gets a suitable 14143 * message appended to it. (Appending allows the back trace of how we got 14144 * to the faulty definition to be displayed through nested calls of 14145 * user-defined subs.) 14146 * 14147 * The caller IS responsible for freeing any returned SV. 14148 * 14149 * The syntax of the contents is pretty much described in perlunicode.pod, 14150 * but we also allow comments on each line */ 14151 14152 const char * name, /* Name of property */ 14153 const STRLEN name_len, /* The name's length in bytes */ 14154 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */ 14155 const bool to_fold, /* ? Is this under /i */ 14156 const bool runtime, /* ? Are we in compile- or run-time */ 14157 const bool deferrable, /* Is it ok for this property's full definition 14158 to be deferred until later? */ 14159 SV* contents, /* The property's definition */ 14160 bool *user_defined_ptr, /* This will be set TRUE as we wouldn't be 14161 getting called unless this is thought to be 14162 a user-defined property */ 14163 SV * msg, /* Any error or warning msg(s) are appended to 14164 this */ 14165 const STRLEN level) /* Recursion level of this call */ 14166 { 14167 STRLEN len; 14168 const char * string = SvPV_const(contents, len); 14169 const char * const e = string + len; 14170 const bool is_contents_utf8 = cBOOL(SvUTF8(contents)); 14171 const STRLEN msgs_length_on_entry = SvCUR(msg); 14172 14173 const char * s0 = string; /* Points to first byte in the current line 14174 being parsed in 'string' */ 14175 const char overflow_msg[] = "Code point too large in \""; 14176 SV* running_definition = NULL; 14177 14178 PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY; 14179 14180 *user_defined_ptr = TRUE; 14181 14182 /* Look at each line */ 14183 while (s0 < e) { 14184 const char * s; /* Current byte */ 14185 char op = '+'; /* Default operation is 'union' */ 14186 IV min = 0; /* range begin code point */ 14187 IV max = -1; /* and range end */ 14188 SV* this_definition; 14189 14190 /* Skip comment lines */ 14191 if (*s0 == '#') { 14192 s0 = strchr(s0, '\n'); 14193 if (s0 == NULL) { 14194 break; 14195 } 14196 s0++; 14197 continue; 14198 } 14199 14200 /* For backcompat, allow an empty first line */ 14201 if (*s0 == '\n') { 14202 s0++; 14203 continue; 14204 } 14205 14206 /* First character in the line may optionally be the operation */ 14207 if ( *s0 == '+' 14208 || *s0 == '!' 14209 || *s0 == '-' 14210 || *s0 == '&') 14211 { 14212 op = *s0++; 14213 } 14214 14215 /* If the line is one or two hex digits separated by blank space, its 14216 * a range; otherwise it is either another user-defined property or an 14217 * error */ 14218 14219 s = s0; 14220 14221 if (! isXDIGIT(*s)) { 14222 goto check_if_property; 14223 } 14224 14225 do { /* Each new hex digit will add 4 bits. */ 14226 if (min > ( (IV) MAX_LEGAL_CP >> 4)) { 14227 s = strchr(s, '\n'); 14228 if (s == NULL) { 14229 s = e; 14230 } 14231 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 14232 sv_catpv(msg, overflow_msg); 14233 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, 14234 UTF8fARG(is_contents_utf8, s - s0, s0)); 14235 sv_catpvs(msg, "\""); 14236 goto return_failure; 14237 } 14238 14239 /* Accumulate this digit into the value */ 14240 min = (min << 4) + READ_XDIGIT(s); 14241 } while (isXDIGIT(*s)); 14242 14243 while (isBLANK(*s)) { s++; } 14244 14245 /* We allow comments at the end of the line */ 14246 if (*s == '#') { 14247 s = strchr(s, '\n'); 14248 if (s == NULL) { 14249 s = e; 14250 } 14251 s++; 14252 } 14253 else if (s < e && *s != '\n') { 14254 if (! isXDIGIT(*s)) { 14255 goto check_if_property; 14256 } 14257 14258 /* Look for the high point of the range */ 14259 max = 0; 14260 do { 14261 if (max > ( (IV) MAX_LEGAL_CP >> 4)) { 14262 s = strchr(s, '\n'); 14263 if (s == NULL) { 14264 s = e; 14265 } 14266 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 14267 sv_catpv(msg, overflow_msg); 14268 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, 14269 UTF8fARG(is_contents_utf8, s - s0, s0)); 14270 sv_catpvs(msg, "\""); 14271 goto return_failure; 14272 } 14273 14274 max = (max << 4) + READ_XDIGIT(s); 14275 } while (isXDIGIT(*s)); 14276 14277 while (isBLANK(*s)) { s++; } 14278 14279 if (*s == '#') { 14280 s = strchr(s, '\n'); 14281 if (s == NULL) { 14282 s = e; 14283 } 14284 } 14285 else if (s < e && *s != '\n') { 14286 goto check_if_property; 14287 } 14288 } 14289 14290 if (max == -1) { /* The line only had one entry */ 14291 max = min; 14292 } 14293 else if (max < min) { 14294 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 14295 sv_catpvs(msg, "Illegal range in \""); 14296 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, 14297 UTF8fARG(is_contents_utf8, s - s0, s0)); 14298 sv_catpvs(msg, "\""); 14299 goto return_failure; 14300 } 14301 14302 # if 0 /* See explanation at definition above of get_extended_utf8_msg() */ 14303 14304 if ( UNICODE_IS_PERL_EXTENDED(min) 14305 || UNICODE_IS_PERL_EXTENDED(max)) 14306 { 14307 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 14308 14309 /* If both code points are non-portable, warn only on the lower 14310 * one. */ 14311 sv_catpv(msg, get_extended_utf8_msg( 14312 (UNICODE_IS_PERL_EXTENDED(min)) 14313 ? min : max)); 14314 sv_catpvs(msg, " in \""); 14315 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, 14316 UTF8fARG(is_contents_utf8, s - s0, s0)); 14317 sv_catpvs(msg, "\""); 14318 } 14319 14320 # endif 14321 14322 /* Here, this line contains a legal range */ 14323 this_definition = sv_2mortal(_new_invlist(2)); 14324 this_definition = _add_range_to_invlist(this_definition, min, max); 14325 goto calculate; 14326 14327 check_if_property: 14328 14329 /* Here it isn't a legal range line. See if it is a legal property 14330 * line. First find the end of the meat of the line */ 14331 s = strpbrk(s, "#\n"); 14332 if (s == NULL) { 14333 s = e; 14334 } 14335 14336 /* Ignore trailing blanks in keeping with the requirements of 14337 * parse_uniprop_string() */ 14338 s--; 14339 while (s > s0 && isBLANK_A(*s)) { 14340 s--; 14341 } 14342 s++; 14343 14344 this_definition = parse_uniprop_string(s0, s - s0, 14345 is_utf8, to_fold, runtime, 14346 deferrable, 14347 NULL, 14348 user_defined_ptr, msg, 14349 (name_len == 0) 14350 ? level /* Don't increase level 14351 if input is empty */ 14352 : level + 1 14353 ); 14354 if (this_definition == NULL) { 14355 goto return_failure; /* 'msg' should have had the reason 14356 appended to it by the above call */ 14357 } 14358 14359 if (! is_invlist(this_definition)) { /* Unknown at this time */ 14360 return newSVsv(this_definition); 14361 } 14362 14363 if (*s != '\n') { 14364 s = strchr(s, '\n'); 14365 if (s == NULL) { 14366 s = e; 14367 } 14368 } 14369 14370 calculate: 14371 14372 switch (op) { 14373 case '+': 14374 _invlist_union(running_definition, this_definition, 14375 &running_definition); 14376 break; 14377 case '-': 14378 _invlist_subtract(running_definition, this_definition, 14379 &running_definition); 14380 break; 14381 case '&': 14382 _invlist_intersection(running_definition, this_definition, 14383 &running_definition); 14384 break; 14385 case '!': 14386 _invlist_union_complement_2nd(running_definition, 14387 this_definition, &running_definition); 14388 break; 14389 default: 14390 Perl_croak(aTHX_ "panic: %s: %d: Unexpected operation %d", 14391 __FILE__, __LINE__, op); 14392 break; 14393 } 14394 14395 /* Position past the '\n' */ 14396 s0 = s + 1; 14397 } /* End of loop through the lines of 'contents' */ 14398 14399 /* Here, we processed all the lines in 'contents' without error. If we 14400 * didn't add any warnings, simply return success */ 14401 if (msgs_length_on_entry == SvCUR(msg)) { 14402 14403 /* If the expansion was empty, the answer isn't nothing: its an empty 14404 * inversion list */ 14405 if (running_definition == NULL) { 14406 running_definition = _new_invlist(1); 14407 } 14408 14409 return running_definition; 14410 } 14411 14412 /* Otherwise, add some explanatory text, but we will return success */ 14413 goto return_msg; 14414 14415 return_failure: 14416 running_definition = NULL; 14417 14418 return_msg: 14419 14420 if (name_len > 0) { 14421 sv_catpvs(msg, " in expansion of "); 14422 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name)); 14423 } 14424 14425 return running_definition; 14426 } 14427 14428 /* As explained below, certain operations need to take place in the first 14429 * thread created. These macros switch contexts */ 14430 # ifdef USE_ITHREADS 14431 # define DECLARATION_FOR_GLOBAL_CONTEXT \ 14432 PerlInterpreter * save_aTHX = aTHX; 14433 # define SWITCH_TO_GLOBAL_CONTEXT \ 14434 PERL_SET_CONTEXT((aTHX = PL_user_def_props_aTHX)) 14435 # define RESTORE_CONTEXT PERL_SET_CONTEXT((aTHX = save_aTHX)); 14436 # define CUR_CONTEXT aTHX 14437 # define ORIGINAL_CONTEXT save_aTHX 14438 # else 14439 # define DECLARATION_FOR_GLOBAL_CONTEXT dNOOP 14440 # define SWITCH_TO_GLOBAL_CONTEXT NOOP 14441 # define RESTORE_CONTEXT NOOP 14442 # define CUR_CONTEXT NULL 14443 # define ORIGINAL_CONTEXT NULL 14444 # endif 14445 14446 STATIC void 14447 S_delete_recursion_entry(pTHX_ void *key) 14448 { 14449 /* Deletes the entry used to detect recursion when expanding user-defined 14450 * properties. This is a function so it can be set up to be called even if 14451 * the program unexpectedly quits */ 14452 14453 SV ** current_entry; 14454 const STRLEN key_len = strlen((const char *) key); 14455 DECLARATION_FOR_GLOBAL_CONTEXT; 14456 14457 SWITCH_TO_GLOBAL_CONTEXT; 14458 14459 /* If the entry is one of these types, it is a permanent entry, and not the 14460 * one used to detect recursions. This function should delete only the 14461 * recursion entry */ 14462 current_entry = hv_fetch(PL_user_def_props, (const char *) key, key_len, 0); 14463 if ( current_entry 14464 && ! is_invlist(*current_entry) 14465 && ! SvPOK(*current_entry)) 14466 { 14467 (void) hv_delete(PL_user_def_props, (const char *) key, key_len, 14468 G_DISCARD); 14469 } 14470 14471 RESTORE_CONTEXT; 14472 } 14473 14474 STATIC SV * 14475 S_get_fq_name(pTHX_ 14476 const char * const name, /* The first non-blank in the \p{}, \P{} */ 14477 const Size_t name_len, /* Its length in bytes, not including any trailing space */ 14478 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */ 14479 const bool has_colon_colon 14480 ) 14481 { 14482 /* Returns a mortal SV containing the fully qualified version of the input 14483 * name */ 14484 14485 SV * fq_name; 14486 14487 fq_name = newSVpvs_flags("", SVs_TEMP); 14488 14489 /* Use the current package if it wasn't included in our input */ 14490 if (! has_colon_colon) { 14491 const HV * pkg = (IN_PERL_COMPILETIME) 14492 ? PL_curstash 14493 : CopSTASH(PL_curcop); 14494 const char* pkgname = HvNAME(pkg); 14495 14496 Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f, 14497 UTF8fARG(is_utf8, strlen(pkgname), pkgname)); 14498 sv_catpvs(fq_name, "::"); 14499 } 14500 14501 Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f, 14502 UTF8fARG(is_utf8, name_len, name)); 14503 return fq_name; 14504 } 14505 14506 STATIC SV * 14507 S_parse_uniprop_string(pTHX_ 14508 14509 /* Parse the interior of a \p{}, \P{}. Returns its definition if knowable 14510 * now. If so, the return is an inversion list. 14511 * 14512 * If the property is user-defined, it is a subroutine, which in turn 14513 * may call other subroutines. This function will call the whole nest of 14514 * them to get the definition they return; if some aren't known at the time 14515 * of the call to this function, the fully qualified name of the highest 14516 * level sub is returned. It is an error to call this function at runtime 14517 * without every sub defined. 14518 * 14519 * If an error was found, NULL is returned, and 'msg' gets a suitable 14520 * message appended to it. (Appending allows the back trace of how we got 14521 * to the faulty definition to be displayed through nested calls of 14522 * user-defined subs.) 14523 * 14524 * The caller should NOT try to free any returned inversion list. 14525 * 14526 * Other parameters will be set on return as described below */ 14527 14528 const char * const name, /* The first non-blank in the \p{}, \P{} */ 14529 Size_t name_len, /* Its length in bytes, not including any 14530 trailing space */ 14531 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */ 14532 const bool to_fold, /* ? Is this under /i */ 14533 const bool runtime, /* TRUE if this is being called at run time */ 14534 const bool deferrable, /* TRUE if it's ok for the definition to not be 14535 known at this call */ 14536 AV ** strings, /* To return string property values, like named 14537 sequences */ 14538 bool *user_defined_ptr, /* Upon return from this function it will be 14539 set to TRUE if any component is a 14540 user-defined property */ 14541 SV * msg, /* Any error or warning msg(s) are appended to 14542 this */ 14543 const STRLEN level) /* Recursion level of this call */ 14544 { 14545 char* lookup_name; /* normalized name for lookup in our tables */ 14546 unsigned lookup_len; /* Its length */ 14547 enum { Not_Strict = 0, /* Some properties have stricter name */ 14548 Strict, /* normalization rules, which we decide */ 14549 As_Is /* upon based on parsing */ 14550 } stricter = Not_Strict; 14551 14552 /* nv= or numeric_value=, or possibly one of the cjk numeric properties 14553 * (though it requires extra effort to download them from Unicode and 14554 * compile perl to know about them) */ 14555 bool is_nv_type = FALSE; 14556 14557 unsigned int i = 0, i_zero = 0, j = 0; 14558 int equals_pos = -1; /* Where the '=' is found, or negative if none */ 14559 int slash_pos = -1; /* Where the '/' is found, or negative if none */ 14560 int table_index = 0; /* The entry number for this property in the table 14561 of all Unicode property names */ 14562 bool starts_with_Is = FALSE; /* ? Does the name start with 'Is' */ 14563 Size_t lookup_offset = 0; /* Used to ignore the first few characters of 14564 the normalized name in certain situations */ 14565 Size_t non_pkg_begin = 0; /* Offset of first byte in 'name' that isn't 14566 part of a package name */ 14567 Size_t lun_non_pkg_begin = 0; /* Similarly for 'lookup_name' */ 14568 bool could_be_user_defined = TRUE; /* ? Could this be a user-defined 14569 property rather than a Unicode 14570 one. */ 14571 SV * prop_definition = NULL; /* The returned definition of 'name' or NULL 14572 if an error. If it is an inversion list, 14573 it is the definition. Otherwise it is a 14574 string containing the fully qualified sub 14575 name of 'name' */ 14576 SV * fq_name = NULL; /* For user-defined properties, the fully 14577 qualified name */ 14578 bool invert_return = FALSE; /* ? Do we need to complement the result before 14579 returning it */ 14580 bool stripped_utf8_pkg = FALSE; /* Set TRUE if the input includes an 14581 explicit utf8:: package that we strip 14582 off */ 14583 /* The expansion of properties that could be either user-defined or 14584 * official unicode ones is deferred until runtime, including a marker for 14585 * those that might be in the latter category. This boolean indicates if 14586 * we've seen that marker. If not, what we're parsing can't be such an 14587 * official Unicode property whose expansion was deferred */ 14588 bool could_be_deferred_official = FALSE; 14589 14590 PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING; 14591 14592 /* The input will be normalized into 'lookup_name' */ 14593 Newx(lookup_name, name_len, char); 14594 SAVEFREEPV(lookup_name); 14595 14596 /* Parse the input. */ 14597 for (i = 0; i < name_len; i++) { 14598 char cur = name[i]; 14599 14600 /* Most of the characters in the input will be of this ilk, being parts 14601 * of a name */ 14602 if (isIDCONT_A(cur)) { 14603 14604 /* Case differences are ignored. Our lookup routine assumes 14605 * everything is lowercase, so normalize to that */ 14606 if (isUPPER_A(cur)) { 14607 lookup_name[j++] = toLOWER_A(cur); 14608 continue; 14609 } 14610 14611 if (cur == '_') { /* Don't include these in the normalized name */ 14612 continue; 14613 } 14614 14615 lookup_name[j++] = cur; 14616 14617 /* The first character in a user-defined name must be of this type. 14618 * */ 14619 if (i - non_pkg_begin == 0 && ! isIDFIRST_A(cur)) { 14620 could_be_user_defined = FALSE; 14621 } 14622 14623 continue; 14624 } 14625 14626 /* Here, the character is not something typically in a name, But these 14627 * two types of characters (and the '_' above) can be freely ignored in 14628 * most situations. Later it may turn out we shouldn't have ignored 14629 * them, and we have to reparse, but we don't have enough information 14630 * yet to make that decision */ 14631 if (cur == '-' || isSPACE_A(cur)) { 14632 could_be_user_defined = FALSE; 14633 continue; 14634 } 14635 14636 /* An equals sign or single colon mark the end of the first part of 14637 * the property name */ 14638 if ( cur == '=' 14639 || (cur == ':' && (i >= name_len - 1 || name[i+1] != ':'))) 14640 { 14641 lookup_name[j++] = '='; /* Treat the colon as an '=' */ 14642 equals_pos = j; /* Note where it occurred in the input */ 14643 could_be_user_defined = FALSE; 14644 break; 14645 } 14646 14647 /* If this looks like it is a marker we inserted at compile time, 14648 * set a flag and otherwise ignore it. If it isn't in the final 14649 * position, keep it as it would have been user input. */ 14650 if ( UNLIKELY(cur == DEFERRED_COULD_BE_OFFICIAL_MARKERc) 14651 && ! deferrable 14652 && could_be_user_defined 14653 && i == name_len - 1) 14654 { 14655 name_len--; 14656 could_be_deferred_official = TRUE; 14657 continue; 14658 } 14659 14660 /* Otherwise, this character is part of the name. */ 14661 lookup_name[j++] = cur; 14662 14663 /* Here it isn't a single colon, so if it is a colon, it must be a 14664 * double colon */ 14665 if (cur == ':') { 14666 14667 /* A double colon should be a package qualifier. We note its 14668 * position and continue. Note that one could have 14669 * pkg1::pkg2::...::foo 14670 * so that the position at the end of the loop will be just after 14671 * the final qualifier */ 14672 14673 i++; 14674 non_pkg_begin = i + 1; 14675 lookup_name[j++] = ':'; 14676 lun_non_pkg_begin = j; 14677 } 14678 else { /* Only word chars (and '::') can be in a user-defined name */ 14679 could_be_user_defined = FALSE; 14680 } 14681 } /* End of parsing through the lhs of the property name (or all of it if 14682 no rhs) */ 14683 14684 /* If there is a single package name 'utf8::', it is ambiguous. It could 14685 * be for a user-defined property, or it could be a Unicode property, as 14686 * all of them are considered to be for that package. For the purposes of 14687 * parsing the rest of the property, strip it off */ 14688 if (non_pkg_begin == STRLENs("utf8::") && memBEGINPs(name, name_len, "utf8::")) { 14689 lookup_name += STRLENs("utf8::"); 14690 j -= STRLENs("utf8::"); 14691 equals_pos -= STRLENs("utf8::"); 14692 i_zero = STRLENs("utf8::"); /* When resetting 'i' to reparse 14693 from the beginning, it has to be 14694 set past what we're stripping 14695 off */ 14696 stripped_utf8_pkg = TRUE; 14697 } 14698 14699 /* Here, we are either done with the whole property name, if it was simple; 14700 * or are positioned just after the '=' if it is compound. */ 14701 14702 if (equals_pos >= 0) { 14703 assert(stricter == Not_Strict); /* We shouldn't have set this yet */ 14704 14705 /* Space immediately after the '=' is ignored */ 14706 i++; 14707 for (; i < name_len; i++) { 14708 if (! isSPACE_A(name[i])) { 14709 break; 14710 } 14711 } 14712 14713 /* Most punctuation after the equals indicates a subpattern, like 14714 * \p{foo=/bar/} */ 14715 if ( isPUNCT_A(name[i]) 14716 && name[i] != '-' 14717 && name[i] != '+' 14718 && name[i] != '_' 14719 && name[i] != '{' 14720 /* A backslash means the real delimiter is the next character, 14721 * but it must be punctuation */ 14722 && (name[i] != '\\' || (i < name_len && isPUNCT_A(name[i+1])))) 14723 { 14724 bool special_property = memEQs(lookup_name, j - 1, "name") 14725 || memEQs(lookup_name, j - 1, "na"); 14726 if (! special_property) { 14727 /* Find the property. The table includes the equals sign, so 14728 * we use 'j' as-is */ 14729 table_index = do_uniprop_match(lookup_name, j); 14730 } 14731 if (special_property || table_index) { 14732 REGEXP * subpattern_re; 14733 char open = name[i++]; 14734 char close; 14735 const char * pos_in_brackets; 14736 const char * const * prop_values; 14737 bool escaped = 0; 14738 14739 /* Backslash => delimiter is the character following. We 14740 * already checked that it is punctuation */ 14741 if (open == '\\') { 14742 open = name[i++]; 14743 escaped = 1; 14744 } 14745 14746 /* This data structure is constructed so that the matching 14747 * closing bracket is 3 past its matching opening. The second 14748 * set of closing is so that if the opening is something like 14749 * ']', the closing will be that as well. Something similar is 14750 * done in toke.c */ 14751 pos_in_brackets = memCHRs("([<)]>)]>", open); 14752 close = (pos_in_brackets) ? pos_in_brackets[3] : open; 14753 14754 if ( i >= name_len 14755 || name[name_len-1] != close 14756 || (escaped && name[name_len-2] != '\\') 14757 /* Also make sure that there are enough characters. 14758 * e.g., '\\\' would show up incorrectly as legal even 14759 * though it is too short */ 14760 || (SSize_t) (name_len - i - 1 - escaped) < 0) 14761 { 14762 sv_catpvs(msg, "Unicode property wildcard not terminated"); 14763 goto append_name_to_msg; 14764 } 14765 14766 Perl_ck_warner_d(aTHX_ 14767 packWARN(WARN_EXPERIMENTAL__UNIPROP_WILDCARDS), 14768 "The Unicode property wildcards feature is experimental"); 14769 14770 if (special_property) { 14771 const char * error_msg; 14772 const char * revised_name = name + i; 14773 Size_t revised_name_len = name_len - (i + 1 + escaped); 14774 14775 /* Currently, the only 'special_property' is name, which we 14776 * lookup in _charnames.pm */ 14777 14778 if (! load_charnames(newSVpvs("placeholder"), 14779 revised_name, revised_name_len, 14780 &error_msg)) 14781 { 14782 sv_catpv(msg, error_msg); 14783 goto append_name_to_msg; 14784 } 14785 14786 /* Farm this out to a function just to make the current 14787 * function less unwieldy */ 14788 if (handle_names_wildcard(revised_name, revised_name_len, 14789 &prop_definition, 14790 strings)) 14791 { 14792 return prop_definition; 14793 } 14794 14795 goto failed; 14796 } 14797 14798 prop_values = get_prop_values(table_index); 14799 14800 /* Now create and compile the wildcard subpattern. Use /i 14801 * because the property values are supposed to match with case 14802 * ignored. */ 14803 subpattern_re = compile_wildcard(name + i, 14804 name_len - i - 1 - escaped, 14805 TRUE /* /i */ 14806 ); 14807 14808 /* For each legal property value, see if the supplied pattern 14809 * matches it. */ 14810 while (*prop_values) { 14811 const char * const entry = *prop_values; 14812 const Size_t len = strlen(entry); 14813 SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP); 14814 14815 if (execute_wildcard(subpattern_re, 14816 (char *) entry, 14817 (char *) entry + len, 14818 (char *) entry, 0, 14819 entry_sv, 14820 0)) 14821 { /* Here, matched. Add to the returned list */ 14822 Size_t total_len = j + len; 14823 SV * sub_invlist = NULL; 14824 char * this_string; 14825 14826 /* We know this is a legal \p{property=value}. Call 14827 * the function to return the list of code points that 14828 * match it */ 14829 Newxz(this_string, total_len + 1, char); 14830 Copy(lookup_name, this_string, j, char); 14831 my_strlcat(this_string, entry, total_len + 1); 14832 SAVEFREEPV(this_string); 14833 sub_invlist = parse_uniprop_string(this_string, 14834 total_len, 14835 is_utf8, 14836 to_fold, 14837 runtime, 14838 deferrable, 14839 NULL, 14840 user_defined_ptr, 14841 msg, 14842 level + 1); 14843 _invlist_union(prop_definition, sub_invlist, 14844 &prop_definition); 14845 } 14846 14847 prop_values++; /* Next iteration, look at next propvalue */ 14848 } /* End of looking through property values; (the data 14849 structure is terminated by a NULL ptr) */ 14850 14851 SvREFCNT_dec_NN(subpattern_re); 14852 14853 if (prop_definition) { 14854 return prop_definition; 14855 } 14856 14857 sv_catpvs(msg, "No Unicode property value wildcard matches:"); 14858 goto append_name_to_msg; 14859 } 14860 14861 /* Here's how khw thinks we should proceed to handle the properties 14862 * not yet done: Bidi Mirroring Glyph can map to "" 14863 Bidi Paired Bracket can map to "" 14864 Case Folding (both full and simple) 14865 Shouldn't /i be good enough for Full 14866 Decomposition Mapping 14867 Equivalent Unified Ideograph can map to "" 14868 Lowercase Mapping (both full and simple) 14869 NFKC Case Fold can map to "" 14870 Titlecase Mapping (both full and simple) 14871 Uppercase Mapping (both full and simple) 14872 * Handle these the same way Name is done, using say, _wild.pm, but 14873 * having both loose and full, like in charclass_invlists.h. 14874 * Perhaps move block and script to that as they are somewhat large 14875 * in charclass_invlists.h. 14876 * For properties where the default is the code point itself, such 14877 * as any of the case changing mappings, the string would otherwise 14878 * consist of all Unicode code points in UTF-8 strung together. 14879 * This would be impractical. So instead, examine their compiled 14880 * pattern, looking at the ssc. If none, reject the pattern as an 14881 * error. Otherwise run the pattern against every code point in 14882 * the ssc. The ssc is kind of like tr18's 3.9 Possible Match Sets 14883 * And it might be good to create an API to return the ssc. 14884 * Or handle them like the algorithmic names are done 14885 */ 14886 } /* End of is a wildcard subppattern */ 14887 14888 /* \p{name=...} is handled specially. Instead of using the normal 14889 * mechanism involving charclass_invlists.h, it uses _charnames.pm 14890 * which has the necessary (huge) data accessible to it, and which 14891 * doesn't get loaded unless necessary. The legal syntax for names is 14892 * somewhat different than other properties due both to the vagaries of 14893 * a few outlier official names, and the fact that only a few ASCII 14894 * characters are permitted in them */ 14895 if ( memEQs(lookup_name, j - 1, "name") 14896 || memEQs(lookup_name, j - 1, "na")) 14897 { 14898 dSP; 14899 HV * table; 14900 SV * character; 14901 const char * error_msg; 14902 CV* lookup_loose; 14903 SV * character_name; 14904 STRLEN character_len; 14905 UV cp; 14906 14907 stricter = As_Is; 14908 14909 /* Since the RHS (after skipping initial space) is passed unchanged 14910 * to charnames, and there are different criteria for what are 14911 * legal characters in the name, just parse it here. A character 14912 * name must begin with an ASCII alphabetic */ 14913 if (! isALPHA(name[i])) { 14914 goto failed; 14915 } 14916 lookup_name[j++] = name[i]; 14917 14918 for (++i; i < name_len; i++) { 14919 /* Official names can only be in the ASCII range, and only 14920 * certain characters */ 14921 if (! isASCII(name[i]) || ! isCHARNAME_CONT(name[i])) { 14922 goto failed; 14923 } 14924 lookup_name[j++] = name[i]; 14925 } 14926 14927 /* Finished parsing, save the name into an SV */ 14928 character_name = newSVpvn(lookup_name + equals_pos, j - equals_pos); 14929 14930 /* Make sure _charnames is loaded. (The parameters give context 14931 * for any errors generated */ 14932 table = load_charnames(character_name, name, name_len, &error_msg); 14933 if (table == NULL) { 14934 sv_catpv(msg, error_msg); 14935 goto append_name_to_msg; 14936 } 14937 14938 lookup_loose = get_cvs("_charnames::_loose_regcomp_lookup", 0); 14939 if (! lookup_loose) { 14940 Perl_croak(aTHX_ 14941 "panic: Can't find '_charnames::_loose_regcomp_lookup"); 14942 } 14943 14944 PUSHSTACKi(PERLSI_REGCOMP); 14945 ENTER ; 14946 SAVETMPS; 14947 save_re_context(); 14948 14949 PUSHMARK(SP) ; 14950 XPUSHs(character_name); 14951 PUTBACK; 14952 call_sv(MUTABLE_SV(lookup_loose), G_SCALAR); 14953 14954 SPAGAIN ; 14955 14956 character = POPs; 14957 SvREFCNT_inc_simple_void_NN(character); 14958 14959 PUTBACK ; 14960 FREETMPS ; 14961 LEAVE ; 14962 POPSTACK; 14963 14964 if (! SvOK(character)) { 14965 goto failed; 14966 } 14967 14968 cp = valid_utf8_to_uvchr((U8 *) SvPVX(character), &character_len); 14969 if (character_len == SvCUR(character)) { 14970 prop_definition = add_cp_to_invlist(NULL, cp); 14971 } 14972 else { 14973 AV * this_string; 14974 14975 /* First of the remaining characters in the string. */ 14976 char * remaining = SvPVX(character) + character_len; 14977 14978 if (strings == NULL) { 14979 goto failed; /* XXX Perhaps a specific msg instead, like 14980 'not available here' */ 14981 } 14982 14983 if (*strings == NULL) { 14984 *strings = newAV(); 14985 } 14986 14987 this_string = newAV(); 14988 av_push_simple(this_string, newSVuv(cp)); 14989 14990 do { 14991 cp = valid_utf8_to_uvchr((U8 *) remaining, &character_len); 14992 av_push_simple(this_string, newSVuv(cp)); 14993 remaining += character_len; 14994 } while (remaining < SvEND(character)); 14995 14996 av_push_simple(*strings, (SV *) this_string); 14997 } 14998 14999 return prop_definition; 15000 } 15001 15002 /* Certain properties whose values are numeric need special handling. 15003 * They may optionally be prefixed by 'is'. Ignore that prefix for the 15004 * purposes of checking if this is one of those properties */ 15005 if (memBEGINPs(lookup_name, j, "is")) { 15006 lookup_offset = 2; 15007 } 15008 15009 /* Then check if it is one of these specially-handled properties. The 15010 * possibilities are hard-coded because easier this way, and the list 15011 * is unlikely to change. 15012 * 15013 * All numeric value type properties are of this ilk, and are also 15014 * special in a different way later on. So find those first. There 15015 * are several numeric value type properties in the Unihan DB (which is 15016 * unlikely to be compiled with perl, but we handle it here in case it 15017 * does get compiled). They all end with 'numeric'. The interiors 15018 * aren't checked for the precise property. This would stop working if 15019 * a cjk property were to be created that ended with 'numeric' and 15020 * wasn't a numeric type */ 15021 is_nv_type = memEQs(lookup_name + lookup_offset, 15022 j - 1 - lookup_offset, "numericvalue") 15023 || memEQs(lookup_name + lookup_offset, 15024 j - 1 - lookup_offset, "nv") 15025 || ( memENDPs(lookup_name + lookup_offset, 15026 j - 1 - lookup_offset, "numeric") 15027 && ( memBEGINPs(lookup_name + lookup_offset, 15028 j - 1 - lookup_offset, "cjk") 15029 || memBEGINPs(lookup_name + lookup_offset, 15030 j - 1 - lookup_offset, "k"))); 15031 if ( is_nv_type 15032 || memEQs(lookup_name + lookup_offset, 15033 j - 1 - lookup_offset, "canonicalcombiningclass") 15034 || memEQs(lookup_name + lookup_offset, 15035 j - 1 - lookup_offset, "ccc") 15036 || memEQs(lookup_name + lookup_offset, 15037 j - 1 - lookup_offset, "age") 15038 || memEQs(lookup_name + lookup_offset, 15039 j - 1 - lookup_offset, "in") 15040 || memEQs(lookup_name + lookup_offset, 15041 j - 1 - lookup_offset, "presentin")) 15042 { 15043 unsigned int k; 15044 15045 /* Since the stuff after the '=' is a number, we can't throw away 15046 * '-' willy-nilly, as those could be a minus sign. Other stricter 15047 * rules also apply. However, these properties all can have the 15048 * rhs not be a number, in which case they contain at least one 15049 * alphabetic. In those cases, the stricter rules don't apply. 15050 * But the numeric type properties can have the alphas [Ee] to 15051 * signify an exponent, and it is still a number with stricter 15052 * rules. So look for an alpha that signifies not-strict */ 15053 stricter = Strict; 15054 for (k = i; k < name_len; k++) { 15055 if ( isALPHA_A(name[k]) 15056 && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E'))) 15057 { 15058 stricter = Not_Strict; 15059 break; 15060 } 15061 } 15062 } 15063 15064 if (stricter) { 15065 15066 /* A number may have a leading '+' or '-'. The latter is retained 15067 * */ 15068 if (name[i] == '+') { 15069 i++; 15070 } 15071 else if (name[i] == '-') { 15072 lookup_name[j++] = '-'; 15073 i++; 15074 } 15075 15076 /* Skip leading zeros including single underscores separating the 15077 * zeros, or between the final leading zero and the first other 15078 * digit */ 15079 for (; i < name_len - 1; i++) { 15080 if ( name[i] != '0' 15081 && (name[i] != '_' || ! isDIGIT_A(name[i+1]))) 15082 { 15083 break; 15084 } 15085 } 15086 15087 /* Turn nv=-0 into nv=0. These should be equivalent, but vary by 15088 * underling libc implementation. */ 15089 if ( i == name_len - 1 15090 && name[name_len-1] == '0' 15091 && lookup_name[j-1] == '-') 15092 { 15093 j--; 15094 } 15095 } 15096 } 15097 else { /* No '=' */ 15098 15099 /* Only a few properties without an '=' should be parsed with stricter 15100 * rules. The list is unlikely to change. */ 15101 if ( memBEGINPs(lookup_name, j, "perl") 15102 && memNEs(lookup_name + 4, j - 4, "space") 15103 && memNEs(lookup_name + 4, j - 4, "word")) 15104 { 15105 stricter = Strict; 15106 15107 /* We set the inputs back to 0 and the code below will reparse, 15108 * using strict */ 15109 i = i_zero; 15110 j = 0; 15111 } 15112 } 15113 15114 /* Here, we have either finished the property, or are positioned to parse 15115 * the remainder, and we know if stricter rules apply. Finish out, if not 15116 * already done */ 15117 for (; i < name_len; i++) { 15118 char cur = name[i]; 15119 15120 /* In all instances, case differences are ignored, and we normalize to 15121 * lowercase */ 15122 if (isUPPER_A(cur)) { 15123 lookup_name[j++] = toLOWER(cur); 15124 continue; 15125 } 15126 15127 /* An underscore is skipped, but not under strict rules unless it 15128 * separates two digits */ 15129 if (cur == '_') { 15130 if ( stricter 15131 && ( i == i_zero || (int) i == equals_pos || i == name_len- 1 15132 || ! isDIGIT_A(name[i-1]) || ! isDIGIT_A(name[i+1]))) 15133 { 15134 lookup_name[j++] = '_'; 15135 } 15136 continue; 15137 } 15138 15139 /* Hyphens are skipped except under strict */ 15140 if (cur == '-' && ! stricter) { 15141 continue; 15142 } 15143 15144 /* XXX Bug in documentation. It says white space skipped adjacent to 15145 * non-word char. Maybe we should, but shouldn't skip it next to a dot 15146 * in a number */ 15147 if (isSPACE_A(cur) && ! stricter) { 15148 continue; 15149 } 15150 15151 lookup_name[j++] = cur; 15152 15153 /* Unless this is a non-trailing slash, we are done with it */ 15154 if (i >= name_len - 1 || cur != '/') { 15155 continue; 15156 } 15157 15158 slash_pos = j; 15159 15160 /* A slash in the 'numeric value' property indicates that what follows 15161 * is a denominator. It can have a leading '+' and '0's that should be 15162 * skipped. But we have never allowed a negative denominator, so treat 15163 * a minus like every other character. (No need to rule out a second 15164 * '/', as that won't match anything anyway */ 15165 if (is_nv_type) { 15166 i++; 15167 if (i < name_len && name[i] == '+') { 15168 i++; 15169 } 15170 15171 /* Skip leading zeros including underscores separating digits */ 15172 for (; i < name_len - 1; i++) { 15173 if ( name[i] != '0' 15174 && (name[i] != '_' || ! isDIGIT_A(name[i+1]))) 15175 { 15176 break; 15177 } 15178 } 15179 15180 /* Store the first real character in the denominator */ 15181 if (i < name_len) { 15182 lookup_name[j++] = name[i]; 15183 } 15184 } 15185 } 15186 15187 /* Here are completely done parsing the input 'name', and 'lookup_name' 15188 * contains a copy, normalized. 15189 * 15190 * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and 15191 * different from without the underscores. */ 15192 if ( ( UNLIKELY(memEQs(lookup_name, j, "l")) 15193 || UNLIKELY(memEQs(lookup_name, j, "gc=l"))) 15194 && UNLIKELY(name[name_len-1] == '_')) 15195 { 15196 lookup_name[j++] = '&'; 15197 } 15198 15199 /* If the original input began with 'In' or 'Is', it could be a subroutine 15200 * call to a user-defined property instead of a Unicode property name. */ 15201 if ( name_len - non_pkg_begin > 2 15202 && name[non_pkg_begin+0] == 'I' 15203 && (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's')) 15204 { 15205 /* Names that start with In have different characteristics than those 15206 * that start with Is */ 15207 if (name[non_pkg_begin+1] == 's') { 15208 starts_with_Is = TRUE; 15209 } 15210 } 15211 else { 15212 could_be_user_defined = FALSE; 15213 } 15214 15215 if (could_be_user_defined) { 15216 CV* user_sub; 15217 15218 /* If the user defined property returns the empty string, it could 15219 * easily be because the pattern is being compiled before the data it 15220 * actually needs to compile is available. This could be argued to be 15221 * a bug in the perl code, but this is a change of behavior for Perl, 15222 * so we handle it. This means that intentionally returning nothing 15223 * will not be resolved until runtime */ 15224 bool empty_return = FALSE; 15225 15226 /* Here, the name could be for a user defined property, which are 15227 * implemented as subs. */ 15228 user_sub = get_cvn_flags(name, name_len, 0); 15229 if (! user_sub) { 15230 15231 /* Here, the property name could be a user-defined one, but there 15232 * is no subroutine to handle it (as of now). Defer handling it 15233 * until runtime. Otherwise, a block defined by Unicode in a later 15234 * release would get the synonym InFoo added for it, and existing 15235 * code that used that name would suddenly break if it referred to 15236 * the property before the sub was declared. See [perl #134146] */ 15237 if (deferrable) { 15238 goto definition_deferred; 15239 } 15240 15241 /* Here, we are at runtime, and didn't find the user property. It 15242 * could be an official property, but only if no package was 15243 * specified, or just the utf8:: package. */ 15244 if (could_be_deferred_official) { 15245 lookup_name += lun_non_pkg_begin; 15246 j -= lun_non_pkg_begin; 15247 } 15248 else if (! stripped_utf8_pkg) { 15249 goto unknown_user_defined; 15250 } 15251 15252 /* Drop down to look up in the official properties */ 15253 } 15254 else { 15255 const char insecure[] = "Insecure user-defined property"; 15256 15257 /* Here, there is a sub by the correct name. Normally we call it 15258 * to get the property definition */ 15259 dSP; 15260 SV * user_sub_sv = MUTABLE_SV(user_sub); 15261 SV * error; /* Any error returned by calling 'user_sub' */ 15262 SV * key; /* The key into the hash of user defined sub names 15263 */ 15264 SV * placeholder; 15265 SV ** saved_user_prop_ptr; /* Hash entry for this property */ 15266 15267 /* How many times to retry when another thread is in the middle of 15268 * expanding the same definition we want */ 15269 PERL_INT_FAST8_T retry_countdown = 10; 15270 15271 DECLARATION_FOR_GLOBAL_CONTEXT; 15272 15273 /* If we get here, we know this property is user-defined */ 15274 *user_defined_ptr = TRUE; 15275 15276 /* We refuse to call a potentially tainted subroutine; returning an 15277 * error instead */ 15278 if (TAINT_get) { 15279 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 15280 sv_catpvn(msg, insecure, sizeof(insecure) - 1); 15281 goto append_name_to_msg; 15282 } 15283 15284 /* In principal, we only call each subroutine property definition 15285 * once during the life of the program. This guarantees that the 15286 * property definition never changes. The results of the single 15287 * sub call are stored in a hash, which is used instead for future 15288 * references to this property. The property definition is thus 15289 * immutable. But, to allow the user to have a /i-dependent 15290 * definition, we call the sub once for non-/i, and once for /i, 15291 * should the need arise, passing the /i status as a parameter. 15292 * 15293 * We start by constructing the hash key name, consisting of the 15294 * fully qualified subroutine name, preceded by the /i status, so 15295 * that there is a key for /i and a different key for non-/i */ 15296 key = newSVpvn_flags(((to_fold) ? "1" : "0"), 1, SVs_TEMP); 15297 fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8, 15298 non_pkg_begin != 0); 15299 sv_catsv(key, fq_name); 15300 15301 /* We only call the sub once throughout the life of the program 15302 * (with the /i, non-/i exception noted above). That means the 15303 * hash must be global and accessible to all threads. It is 15304 * created at program start-up, before any threads are created, so 15305 * is accessible to all children. But this creates some 15306 * complications. 15307 * 15308 * 1) The keys can't be shared, or else problems arise; sharing is 15309 * turned off at hash creation time 15310 * 2) All SVs in it are there for the remainder of the life of the 15311 * program, and must be created in the same interpreter context 15312 * as the hash, or else they will be freed from the wrong pool 15313 * at global destruction time. This is handled by switching to 15314 * the hash's context to create each SV going into it, and then 15315 * immediately switching back 15316 * 3) All accesses to the hash must be controlled by a mutex, to 15317 * prevent two threads from getting an unstable state should 15318 * they simultaneously be accessing it. The code below is 15319 * crafted so that the mutex is locked whenever there is an 15320 * access and unlocked only when the next stable state is 15321 * achieved. 15322 * 15323 * The hash stores either the definition of the property if it was 15324 * valid, or, if invalid, the error message that was raised. We 15325 * use the type of SV to distinguish. 15326 * 15327 * There's also the need to guard against the definition expansion 15328 * from infinitely recursing. This is handled by storing the aTHX 15329 * of the expanding thread during the expansion. Again the SV type 15330 * is used to distinguish this from the other two cases. If we 15331 * come to here and the hash entry for this property is our aTHX, 15332 * it means we have recursed, and the code assumes that we would 15333 * infinitely recurse, so instead stops and raises an error. 15334 * (Any recursion has always been treated as infinite recursion in 15335 * this feature.) 15336 * 15337 * If instead, the entry is for a different aTHX, it means that 15338 * that thread has gotten here first, and hasn't finished expanding 15339 * the definition yet. We just have to wait until it is done. We 15340 * sleep and retry a few times, returning an error if the other 15341 * thread doesn't complete. */ 15342 15343 re_fetch: 15344 USER_PROP_MUTEX_LOCK; 15345 15346 /* If we have an entry for this key, the subroutine has already 15347 * been called once with this /i status. */ 15348 saved_user_prop_ptr = hv_fetch(PL_user_def_props, 15349 SvPVX(key), SvCUR(key), 0); 15350 if (saved_user_prop_ptr) { 15351 15352 /* If the saved result is an inversion list, it is the valid 15353 * definition of this property */ 15354 if (is_invlist(*saved_user_prop_ptr)) { 15355 prop_definition = *saved_user_prop_ptr; 15356 15357 /* The SV in the hash won't be removed until global 15358 * destruction, so it is stable and we can unlock */ 15359 USER_PROP_MUTEX_UNLOCK; 15360 15361 /* The caller shouldn't try to free this SV */ 15362 return prop_definition; 15363 } 15364 15365 /* Otherwise, if it is a string, it is the error message 15366 * that was returned when we first tried to evaluate this 15367 * property. Fail, and append the message */ 15368 if (SvPOK(*saved_user_prop_ptr)) { 15369 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 15370 sv_catsv(msg, *saved_user_prop_ptr); 15371 15372 /* The SV in the hash won't be removed until global 15373 * destruction, so it is stable and we can unlock */ 15374 USER_PROP_MUTEX_UNLOCK; 15375 15376 return NULL; 15377 } 15378 15379 assert(SvIOK(*saved_user_prop_ptr)); 15380 15381 /* Here, we have an unstable entry in the hash. Either another 15382 * thread is in the middle of expanding the property's 15383 * definition, or we are ourselves recursing. We use the aTHX 15384 * in it to distinguish */ 15385 if (SvIV(*saved_user_prop_ptr) != PTR2IV(CUR_CONTEXT)) { 15386 15387 /* Here, it's another thread doing the expanding. We've 15388 * looked as much as we are going to at the contents of the 15389 * hash entry. It's safe to unlock. */ 15390 USER_PROP_MUTEX_UNLOCK; 15391 15392 /* Retry a few times */ 15393 if (retry_countdown-- > 0) { 15394 PerlProc_sleep(1); 15395 goto re_fetch; 15396 } 15397 15398 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 15399 sv_catpvs(msg, "Timeout waiting for another thread to " 15400 "define"); 15401 goto append_name_to_msg; 15402 } 15403 15404 /* Here, we are recursing; don't dig any deeper */ 15405 USER_PROP_MUTEX_UNLOCK; 15406 15407 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 15408 sv_catpvs(msg, 15409 "Infinite recursion in user-defined property"); 15410 goto append_name_to_msg; 15411 } 15412 15413 /* Here, this thread has exclusive control, and there is no entry 15414 * for this property in the hash. So we have the go ahead to 15415 * expand the definition ourselves. */ 15416 15417 PUSHSTACKi(PERLSI_REGCOMP); 15418 ENTER; 15419 15420 /* Create a temporary placeholder in the hash to detect recursion 15421 * */ 15422 SWITCH_TO_GLOBAL_CONTEXT; 15423 placeholder= newSVuv(PTR2IV(ORIGINAL_CONTEXT)); 15424 (void) hv_store_ent(PL_user_def_props, key, placeholder, 0); 15425 RESTORE_CONTEXT; 15426 15427 /* Now that we have a placeholder, we can let other threads 15428 * continue */ 15429 USER_PROP_MUTEX_UNLOCK; 15430 15431 /* Make sure the placeholder always gets destroyed */ 15432 SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(key)); 15433 15434 PUSHMARK(SP); 15435 SAVETMPS; 15436 15437 /* Call the user's function, with the /i status as a parameter. 15438 * Note that we have gone to a lot of trouble to keep this call 15439 * from being within the locked mutex region. */ 15440 XPUSHs(boolSV(to_fold)); 15441 PUTBACK; 15442 15443 /* The following block was taken from swash_init(). Presumably 15444 * they apply to here as well, though we no longer use a swash -- 15445 * khw */ 15446 SAVEHINTS(); 15447 save_re_context(); 15448 /* We might get here via a subroutine signature which uses a utf8 15449 * parameter name, at which point PL_subname will have been set 15450 * but not yet used. */ 15451 save_item(PL_subname); 15452 15453 /* G_SCALAR guarantees a single return value */ 15454 (void) call_sv(user_sub_sv, G_EVAL|G_SCALAR); 15455 15456 SPAGAIN; 15457 15458 error = ERRSV; 15459 if (TAINT_get || SvTRUE(error)) { 15460 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 15461 if (SvTRUE(error)) { 15462 sv_catpvs(msg, "Error \""); 15463 sv_catsv(msg, error); 15464 sv_catpvs(msg, "\""); 15465 } 15466 if (TAINT_get) { 15467 if (SvTRUE(error)) sv_catpvs(msg, "; "); 15468 sv_catpvn(msg, insecure, sizeof(insecure) - 1); 15469 } 15470 15471 if (name_len > 0) { 15472 sv_catpvs(msg, " in expansion of "); 15473 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, 15474 name_len, 15475 name)); 15476 } 15477 15478 (void) POPs; 15479 prop_definition = NULL; 15480 } 15481 else { 15482 SV * contents = POPs; 15483 15484 /* The contents is supposed to be the expansion of the property 15485 * definition. If the definition is deferrable, and we got an 15486 * empty string back, set a flag to later defer it (after clean 15487 * up below). */ 15488 if ( deferrable 15489 && (! SvPOK(contents) || SvCUR(contents) == 0)) 15490 { 15491 empty_return = TRUE; 15492 } 15493 else { /* Otherwise, call a function to check for valid syntax, 15494 and handle it */ 15495 15496 prop_definition = handle_user_defined_property( 15497 name, name_len, 15498 is_utf8, to_fold, runtime, 15499 deferrable, 15500 contents, user_defined_ptr, 15501 msg, 15502 level); 15503 } 15504 } 15505 15506 /* Here, we have the results of the expansion. Delete the 15507 * placeholder, and if the definition is now known, replace it with 15508 * that definition. We need exclusive access to the hash, and we 15509 * can't let anyone else in, between when we delete the placeholder 15510 * and add the permanent entry */ 15511 USER_PROP_MUTEX_LOCK; 15512 15513 S_delete_recursion_entry(aTHX_ SvPVX(key)); 15514 15515 if ( ! empty_return 15516 && (! prop_definition || is_invlist(prop_definition))) 15517 { 15518 /* If we got success we use the inversion list defining the 15519 * property; otherwise use the error message */ 15520 SWITCH_TO_GLOBAL_CONTEXT; 15521 (void) hv_store_ent(PL_user_def_props, 15522 key, 15523 ((prop_definition) 15524 ? newSVsv(prop_definition) 15525 : newSVsv(msg)), 15526 0); 15527 RESTORE_CONTEXT; 15528 } 15529 15530 /* All done, and the hash now has a permanent entry for this 15531 * property. Give up exclusive control */ 15532 USER_PROP_MUTEX_UNLOCK; 15533 15534 FREETMPS; 15535 LEAVE; 15536 POPSTACK; 15537 15538 if (empty_return) { 15539 goto definition_deferred; 15540 } 15541 15542 if (prop_definition) { 15543 15544 /* If the definition is for something not known at this time, 15545 * we toss it, and go return the main property name, as that's 15546 * the one the user will be aware of */ 15547 if (! is_invlist(prop_definition)) { 15548 SvREFCNT_dec_NN(prop_definition); 15549 goto definition_deferred; 15550 } 15551 15552 sv_2mortal(prop_definition); 15553 } 15554 15555 /* And return */ 15556 return prop_definition; 15557 15558 } /* End of calling the subroutine for the user-defined property */ 15559 } /* End of it could be a user-defined property */ 15560 15561 /* Here it wasn't a user-defined property that is known at this time. See 15562 * if it is a Unicode property */ 15563 15564 lookup_len = j; /* This is a more mnemonic name than 'j' */ 15565 15566 /* Get the index into our pointer table of the inversion list corresponding 15567 * to the property */ 15568 table_index = do_uniprop_match(lookup_name, lookup_len); 15569 15570 /* If it didn't find the property ... */ 15571 if (table_index == 0) { 15572 15573 /* Try again stripping off any initial 'Is'. This is because we 15574 * promise that an initial Is is optional. The same isn't true of 15575 * names that start with 'In'. Those can match only blocks, and the 15576 * lookup table already has those accounted for. The lookup table also 15577 * has already accounted for Perl extensions (without and = sign) 15578 * starting with 'i's'. */ 15579 if (starts_with_Is && equals_pos >= 0) { 15580 lookup_name += 2; 15581 lookup_len -= 2; 15582 equals_pos -= 2; 15583 slash_pos -= 2; 15584 15585 table_index = do_uniprop_match(lookup_name, lookup_len); 15586 } 15587 15588 if (table_index == 0) { 15589 char * canonical; 15590 15591 /* Here, we didn't find it. If not a numeric type property, and 15592 * can't be a user-defined one, it isn't a legal property */ 15593 if (! is_nv_type) { 15594 if (! could_be_user_defined) { 15595 goto failed; 15596 } 15597 15598 /* Here, the property name is legal as a user-defined one. At 15599 * compile time, it might just be that the subroutine for that 15600 * property hasn't been encountered yet, but at runtime, it's 15601 * an error to try to use an undefined one */ 15602 if (! deferrable) { 15603 goto unknown_user_defined; 15604 } 15605 15606 goto definition_deferred; 15607 } /* End of isn't a numeric type property */ 15608 15609 /* The numeric type properties need more work to decide. What we 15610 * do is make sure we have the number in canonical form and look 15611 * that up. */ 15612 15613 if (slash_pos < 0) { /* No slash */ 15614 15615 /* When it isn't a rational, take the input, convert it to a 15616 * NV, then create a canonical string representation of that 15617 * NV. */ 15618 15619 NV value; 15620 SSize_t value_len = lookup_len - equals_pos; 15621 15622 /* Get the value */ 15623 if ( value_len <= 0 15624 || my_atof3(lookup_name + equals_pos, &value, 15625 value_len) 15626 != lookup_name + lookup_len) 15627 { 15628 goto failed; 15629 } 15630 15631 /* If the value is an integer, the canonical value is integral 15632 * */ 15633 if (Perl_ceil(value) == value) { 15634 canonical = Perl_form(aTHX_ "%.*s%.0" NVff, 15635 equals_pos, lookup_name, value); 15636 } 15637 else { /* Otherwise, it is %e with a known precision */ 15638 char * exp_ptr; 15639 15640 canonical = Perl_form(aTHX_ "%.*s%.*" NVef, 15641 equals_pos, lookup_name, 15642 PL_E_FORMAT_PRECISION, value); 15643 15644 /* The exponent generated is expecting two digits, whereas 15645 * %e on some systems will generate three. Remove leading 15646 * zeros in excess of 2 from the exponent. We start 15647 * looking for them after the '=' */ 15648 exp_ptr = strchr(canonical + equals_pos, 'e'); 15649 if (exp_ptr) { 15650 char * cur_ptr = exp_ptr + 2; /* past the 'e[+-]' */ 15651 SSize_t excess_exponent_len = strlen(cur_ptr) - 2; 15652 15653 assert(*(cur_ptr - 1) == '-' || *(cur_ptr - 1) == '+'); 15654 15655 if (excess_exponent_len > 0) { 15656 SSize_t leading_zeros = strspn(cur_ptr, "0"); 15657 SSize_t excess_leading_zeros 15658 = MIN(leading_zeros, excess_exponent_len); 15659 if (excess_leading_zeros > 0) { 15660 Move(cur_ptr + excess_leading_zeros, 15661 cur_ptr, 15662 strlen(cur_ptr) - excess_leading_zeros 15663 + 1, /* Copy the NUL as well */ 15664 char); 15665 } 15666 } 15667 } 15668 } 15669 } 15670 else { /* Has a slash. Create a rational in canonical form */ 15671 UV numerator, denominator, gcd, trial; 15672 const char * end_ptr; 15673 const char * sign = ""; 15674 15675 /* We can't just find the numerator, denominator, and do the 15676 * division, then use the method above, because that is 15677 * inexact. And the input could be a rational that is within 15678 * epsilon (given our precision) of a valid rational, and would 15679 * then incorrectly compare valid. 15680 * 15681 * We're only interested in the part after the '=' */ 15682 const char * this_lookup_name = lookup_name + equals_pos; 15683 lookup_len -= equals_pos; 15684 slash_pos -= equals_pos; 15685 15686 /* Handle any leading minus */ 15687 if (this_lookup_name[0] == '-') { 15688 sign = "-"; 15689 this_lookup_name++; 15690 lookup_len--; 15691 slash_pos--; 15692 } 15693 15694 /* Convert the numerator to numeric */ 15695 end_ptr = this_lookup_name + slash_pos; 15696 if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) { 15697 goto failed; 15698 } 15699 15700 /* It better have included all characters before the slash */ 15701 if (*end_ptr != '/') { 15702 goto failed; 15703 } 15704 15705 /* Set to look at just the denominator */ 15706 this_lookup_name += slash_pos; 15707 lookup_len -= slash_pos; 15708 end_ptr = this_lookup_name + lookup_len; 15709 15710 /* Convert the denominator to numeric */ 15711 if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) { 15712 goto failed; 15713 } 15714 15715 /* It better be the rest of the characters, and don't divide by 15716 * 0 */ 15717 if ( end_ptr != this_lookup_name + lookup_len 15718 || denominator == 0) 15719 { 15720 goto failed; 15721 } 15722 15723 /* Get the greatest common denominator using 15724 https://en.wikipedia.org/wiki/Euclidean_algorithm */ 15725 gcd = numerator; 15726 trial = denominator; 15727 while (trial != 0) { 15728 UV temp = trial; 15729 trial = gcd % trial; 15730 gcd = temp; 15731 } 15732 15733 /* If already in lowest possible terms, we have already tried 15734 * looking this up */ 15735 if (gcd == 1) { 15736 goto failed; 15737 } 15738 15739 /* Reduce the rational, which should put it in canonical form 15740 * */ 15741 numerator /= gcd; 15742 denominator /= gcd; 15743 15744 canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf, 15745 equals_pos, lookup_name, sign, numerator, denominator); 15746 } 15747 15748 /* Here, we have the number in canonical form. Try that */ 15749 table_index = do_uniprop_match(canonical, strlen(canonical)); 15750 if (table_index == 0) { 15751 goto failed; 15752 } 15753 } /* End of still didn't find the property in our table */ 15754 } /* End of didn't find the property in our table */ 15755 15756 /* Here, we have a non-zero return, which is an index into a table of ptrs. 15757 * A negative return signifies that the real index is the absolute value, 15758 * but the result needs to be inverted */ 15759 if (table_index < 0) { 15760 invert_return = TRUE; 15761 table_index = -table_index; 15762 } 15763 15764 /* Out-of band indices indicate a deprecated property. The proper index is 15765 * modulo it with the table size. And dividing by the table size yields 15766 * an offset into a table constructed by regen/mk_invlists.pl to contain 15767 * the corresponding warning message */ 15768 if (table_index > MAX_UNI_KEYWORD_INDEX) { 15769 Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX; 15770 table_index %= MAX_UNI_KEYWORD_INDEX; 15771 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED__UNICODE_PROPERTY_NAME), 15772 "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s", 15773 (int) name_len, name, 15774 get_deprecated_property_msg(warning_offset)); 15775 } 15776 15777 /* In a few properties, a different property is used under /i. These are 15778 * unlikely to change, so are hard-coded here. */ 15779 if (to_fold) { 15780 if ( table_index == UNI_XPOSIXUPPER 15781 || table_index == UNI_XPOSIXLOWER 15782 || table_index == UNI_TITLE) 15783 { 15784 table_index = UNI_CASED; 15785 } 15786 else if ( table_index == UNI_UPPERCASELETTER 15787 || table_index == UNI_LOWERCASELETTER 15788 # ifdef UNI_TITLECASELETTER /* Missing from early Unicodes */ 15789 || table_index == UNI_TITLECASELETTER 15790 # endif 15791 ) { 15792 table_index = UNI_CASEDLETTER; 15793 } 15794 else if ( table_index == UNI_POSIXUPPER 15795 || table_index == UNI_POSIXLOWER) 15796 { 15797 table_index = UNI_POSIXALPHA; 15798 } 15799 } 15800 15801 /* Create and return the inversion list */ 15802 prop_definition = get_prop_definition(table_index); 15803 sv_2mortal(prop_definition); 15804 15805 /* See if there is a private use override to add to this definition */ 15806 { 15807 COPHH * hinthash = (IN_PERL_COMPILETIME) 15808 ? CopHINTHASH_get(&PL_compiling) 15809 : CopHINTHASH_get(PL_curcop); 15810 SV * pu_overrides = cophh_fetch_pv(hinthash, "private_use", 0, 0); 15811 15812 if (UNLIKELY(pu_overrides && SvPOK(pu_overrides))) { 15813 15814 /* See if there is an element in the hints hash for this table */ 15815 SV * pu_lookup = Perl_newSVpvf(aTHX_ "%d=", table_index); 15816 const char * pos = strstr(SvPVX(pu_overrides), SvPVX(pu_lookup)); 15817 15818 if (pos) { 15819 bool dummy; 15820 SV * pu_definition; 15821 SV * pu_invlist; 15822 SV * expanded_prop_definition = 15823 sv_2mortal(invlist_clone(prop_definition, NULL)); 15824 15825 /* If so, it's definition is the string from here to the next 15826 * \a character. And its format is the same as a user-defined 15827 * property */ 15828 pos += SvCUR(pu_lookup); 15829 pu_definition = newSVpvn(pos, strchr(pos, '\a') - pos); 15830 pu_invlist = handle_user_defined_property(lookup_name, 15831 lookup_len, 15832 0, /* Not UTF-8 */ 15833 0, /* Not folded */ 15834 runtime, 15835 deferrable, 15836 pu_definition, 15837 &dummy, 15838 msg, 15839 level); 15840 if (TAINT_get) { 15841 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 15842 sv_catpvs(msg, "Insecure private-use override"); 15843 goto append_name_to_msg; 15844 } 15845 15846 /* For now, as a safety measure, make sure that it doesn't 15847 * override non-private use code points */ 15848 _invlist_intersection(pu_invlist, PL_Private_Use, &pu_invlist); 15849 15850 /* Add it to the list to be returned */ 15851 _invlist_union(prop_definition, pu_invlist, 15852 &expanded_prop_definition); 15853 prop_definition = expanded_prop_definition; 15854 Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__PRIVATE_USE), "The private_use feature is experimental"); 15855 } 15856 } 15857 } 15858 15859 if (invert_return) { 15860 _invlist_invert(prop_definition); 15861 } 15862 return prop_definition; 15863 15864 unknown_user_defined: 15865 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 15866 sv_catpvs(msg, "Unknown user-defined property name"); 15867 goto append_name_to_msg; 15868 15869 failed: 15870 if (non_pkg_begin != 0) { 15871 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 15872 sv_catpvs(msg, "Illegal user-defined property name"); 15873 } 15874 else { 15875 if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); 15876 sv_catpvs(msg, "Can't find Unicode property definition"); 15877 } 15878 /* FALLTHROUGH */ 15879 15880 append_name_to_msg: 15881 { 15882 const char * prefix = (runtime && level == 0) ? " \\p{" : " \""; 15883 const char * suffix = (runtime && level == 0) ? "}" : "\""; 15884 15885 sv_catpv(msg, prefix); 15886 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name)); 15887 sv_catpv(msg, suffix); 15888 } 15889 15890 return NULL; 15891 15892 definition_deferred: 15893 15894 { 15895 bool is_qualified = non_pkg_begin != 0; /* If has "::" */ 15896 15897 /* Here it could yet to be defined, so defer evaluation of this until 15898 * its needed at runtime. We need the fully qualified property name to 15899 * avoid ambiguity */ 15900 if (! fq_name) { 15901 fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8, 15902 is_qualified); 15903 } 15904 15905 /* If it didn't come with a package, or the package is utf8::, this 15906 * actually could be an official Unicode property whose inclusion we 15907 * are deferring until runtime to make sure that it isn't overridden by 15908 * a user-defined property of the same name (which we haven't 15909 * encountered yet). Add a marker to indicate this possibility, for 15910 * use at such time when we first need the definition during pattern 15911 * matching execution */ 15912 if (! is_qualified || memBEGINPs(name, non_pkg_begin, "utf8::")) { 15913 sv_catpvs(fq_name, DEFERRED_COULD_BE_OFFICIAL_MARKERs); 15914 } 15915 15916 /* We also need a trailing newline */ 15917 sv_catpvs(fq_name, "\n"); 15918 15919 *user_defined_ptr = TRUE; 15920 return fq_name; 15921 } 15922 } 15923 15924 STATIC bool 15925 S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */ 15926 const STRLEN wname_len, /* Its length */ 15927 SV ** prop_definition, 15928 AV ** strings) 15929 { 15930 /* Deal with Name property wildcard subpatterns; returns TRUE if there were 15931 * any matches, adding them to prop_definition */ 15932 15933 dSP; 15934 15935 CV * get_names_info; /* entry to charnames.pm to get info we need */ 15936 SV * names_string; /* Contains all character names, except algo */ 15937 SV * algorithmic_names; /* Contains info about algorithmically 15938 generated character names */ 15939 REGEXP * subpattern_re; /* The user's pattern to match with */ 15940 struct regexp * prog; /* The compiled pattern */ 15941 char * all_names_start; /* lib/unicore/Name.pl string of every 15942 (non-algorithmic) character name */ 15943 char * cur_pos; /* We match, effectively using /gc; this is 15944 where we are now */ 15945 bool found_matches = FALSE; /* Did any name match so far? */ 15946 SV * empty; /* For matching zero length names */ 15947 SV * must_sv; /* Contains the substring, if any, that must be 15948 in a name for the subpattern to match */ 15949 const char * must; /* The PV of 'must' */ 15950 STRLEN must_len; /* And its length */ 15951 SV * syllable_name = NULL; /* For Hangul syllables */ 15952 const char hangul_prefix[] = "HANGUL SYLLABLE "; 15953 const STRLEN hangul_prefix_len = sizeof(hangul_prefix) - 1; 15954 15955 /* By inspection, there are a maximum of 7 bytes in the suffix of a hangul 15956 * syllable name, and these are immutable and guaranteed by the Unicode 15957 * standard to never be extended */ 15958 const STRLEN syl_max_len = hangul_prefix_len + 7; 15959 15960 IV i; 15961 15962 PERL_ARGS_ASSERT_HANDLE_NAMES_WILDCARD; 15963 15964 /* Make sure _charnames is loaded. (The parameters give context 15965 * for any errors generated */ 15966 get_names_info = get_cv("_charnames::_get_names_info", 0); 15967 if (! get_names_info) { 15968 Perl_croak(aTHX_ "panic: Can't find '_charnames::_get_names_info"); 15969 } 15970 15971 /* Get the charnames data */ 15972 PUSHSTACKi(PERLSI_REGCOMP); 15973 ENTER ; 15974 SAVETMPS; 15975 save_re_context(); 15976 15977 PUSHMARK(SP) ; 15978 PUTBACK; 15979 15980 /* Special _charnames entry point that returns the info this routine 15981 * requires */ 15982 call_sv(MUTABLE_SV(get_names_info), G_LIST); 15983 15984 SPAGAIN ; 15985 15986 /* Data structure for names which end in their very own code points */ 15987 algorithmic_names = POPs; 15988 SvREFCNT_inc_simple_void_NN(algorithmic_names); 15989 15990 /* The lib/unicore/Name.pl string */ 15991 names_string = POPs; 15992 SvREFCNT_inc_simple_void_NN(names_string); 15993 15994 PUTBACK ; 15995 FREETMPS ; 15996 LEAVE ; 15997 POPSTACK; 15998 15999 if ( ! SvROK(names_string) 16000 || ! SvROK(algorithmic_names)) 16001 { /* Perhaps should panic instead XXX */ 16002 SvREFCNT_dec(names_string); 16003 SvREFCNT_dec(algorithmic_names); 16004 return FALSE; 16005 } 16006 16007 names_string = sv_2mortal(SvRV(names_string)); 16008 all_names_start = SvPVX(names_string); 16009 cur_pos = all_names_start; 16010 16011 algorithmic_names= sv_2mortal(SvRV(algorithmic_names)); 16012 16013 /* Compile the subpattern consisting of the name being looked for */ 16014 subpattern_re = compile_wildcard(wname, wname_len, FALSE /* /-i */ ); 16015 16016 must_sv = re_intuit_string(subpattern_re); 16017 if (must_sv) { 16018 /* regexec.c can free the re_intuit_string() return. GH #17734 */ 16019 must_sv = sv_2mortal(newSVsv(must_sv)); 16020 must = SvPV(must_sv, must_len); 16021 } 16022 else { 16023 must = ""; 16024 must_len = 0; 16025 } 16026 16027 /* (Note: 'must' could contain a NUL. And yet we use strspn() below on it. 16028 * This works because the NUL causes the function to return early, thus 16029 * showing that there are characters in it other than the acceptable ones, 16030 * which is our desired result.) */ 16031 16032 prog = ReANY(subpattern_re); 16033 16034 /* If only nothing is matched, skip to where empty names are looked for */ 16035 if (prog->maxlen == 0) { 16036 goto check_empty; 16037 } 16038 16039 /* And match against the string of all names /gc. Don't even try if it 16040 * must match a character not found in any name. */ 16041 if (strspn(must, "\n -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ()") == must_len) 16042 { 16043 while (execute_wildcard(subpattern_re, 16044 cur_pos, 16045 SvEND(names_string), 16046 all_names_start, 0, 16047 names_string, 16048 0)) 16049 { /* Here, matched. */ 16050 16051 /* Note the string entries look like 16052 * 00001\nSTART OF HEADING\n\n 16053 * so we could match anywhere in that string. We have to rule out 16054 * matching a code point line */ 16055 char * this_name_start = all_names_start 16056 + RX_OFFS_START(subpattern_re,0); 16057 char * this_name_end = all_names_start 16058 + RX_OFFS_END(subpattern_re,0); 16059 char * cp_start; 16060 char * cp_end; 16061 UV cp = 0; /* Silences some compilers */ 16062 AV * this_string = NULL; 16063 bool is_multi = FALSE; 16064 16065 /* If matched nothing, advance to next possible match */ 16066 if (this_name_start == this_name_end) { 16067 cur_pos = (char *) memchr(this_name_end + 1, '\n', 16068 SvEND(names_string) - this_name_end); 16069 if (cur_pos == NULL) { 16070 break; 16071 } 16072 } 16073 else { 16074 /* Position the next match to start beyond the current returned 16075 * entry */ 16076 cur_pos = (char *) memchr(this_name_end, '\n', 16077 SvEND(names_string) - this_name_end); 16078 } 16079 16080 /* Back up to the \n just before the beginning of the character. */ 16081 cp_end = (char *) my_memrchr(all_names_start, 16082 '\n', 16083 this_name_start - all_names_start); 16084 16085 /* If we didn't find a \n, it means it matched somewhere in the 16086 * initial '00000' in the string, so isn't a real match */ 16087 if (cp_end == NULL) { 16088 continue; 16089 } 16090 16091 this_name_start = cp_end + 1; /* The name starts just after */ 16092 cp_end--; /* the \n, and the code point */ 16093 /* ends just before it */ 16094 16095 /* All code points are 5 digits long */ 16096 cp_start = cp_end - 4; 16097 16098 /* This shouldn't happen, as we found a \n, and the first \n is 16099 * further along than what we subtracted */ 16100 assert(cp_start >= all_names_start); 16101 16102 if (cp_start == all_names_start) { 16103 *prop_definition = add_cp_to_invlist(*prop_definition, 0); 16104 continue; 16105 } 16106 16107 /* If the character is a blank, we either have a named sequence, or 16108 * something is wrong */ 16109 if (*(cp_start - 1) == ' ') { 16110 cp_start = (char *) my_memrchr(all_names_start, 16111 '\n', 16112 cp_start - all_names_start); 16113 cp_start++; 16114 } 16115 16116 assert(cp_start != NULL && cp_start >= all_names_start + 2); 16117 16118 /* Except for the first line in the string, the sequence before the 16119 * code point is \n\n. If that isn't the case here, we didn't 16120 * match the name of a character. (We could have matched a named 16121 * sequence, not currently handled */ 16122 if (*(cp_start - 1) != '\n' || *(cp_start - 2) != '\n') { 16123 continue; 16124 } 16125 16126 /* We matched! Add this to the list */ 16127 found_matches = TRUE; 16128 16129 /* Loop through all the code points in the sequence */ 16130 while (cp_start < cp_end) { 16131 16132 /* Calculate this code point from its 5 digits */ 16133 cp = (XDIGIT_VALUE(cp_start[0]) << 16) 16134 + (XDIGIT_VALUE(cp_start[1]) << 12) 16135 + (XDIGIT_VALUE(cp_start[2]) << 8) 16136 + (XDIGIT_VALUE(cp_start[3]) << 4) 16137 + XDIGIT_VALUE(cp_start[4]); 16138 16139 cp_start += 6; /* Go past any blank */ 16140 16141 if (cp_start < cp_end || is_multi) { 16142 if (this_string == NULL) { 16143 this_string = newAV(); 16144 } 16145 16146 is_multi = TRUE; 16147 av_push_simple(this_string, newSVuv(cp)); 16148 } 16149 } 16150 16151 if (is_multi) { /* Was more than one code point */ 16152 if (*strings == NULL) { 16153 *strings = newAV(); 16154 } 16155 16156 av_push_simple(*strings, (SV *) this_string); 16157 } 16158 else { /* Only a single code point */ 16159 *prop_definition = add_cp_to_invlist(*prop_definition, cp); 16160 } 16161 } /* End of loop through the non-algorithmic names string */ 16162 } 16163 16164 /* There are also character names not in 'names_string'. These are 16165 * algorithmically generatable. Try this pattern on each possible one. 16166 * (khw originally planned to leave this out given the large number of 16167 * matches attempted; but the speed turned out to be quite acceptable 16168 * 16169 * There are plenty of opportunities to optimize to skip many of the tests. 16170 * beyond the rudimentary ones already here */ 16171 16172 /* First see if the subpattern matches any of the algorithmic generatable 16173 * Hangul syllable names. 16174 * 16175 * We know none of these syllable names will match if the input pattern 16176 * requires more bytes than any syllable has, or if the input pattern only 16177 * matches an empty name, or if the pattern has something it must match and 16178 * one of the characters in that isn't in any Hangul syllable. */ 16179 if ( prog->minlen <= (SSize_t) syl_max_len 16180 && prog->maxlen > 0 16181 && (strspn(must, "\n ABCDEGHIJKLMNOPRSTUWY") == must_len)) 16182 { 16183 /* These constants, names, values, and algorithm are adapted from the 16184 * Unicode standard, version 5.1, section 3.12, and should never 16185 * change. */ 16186 const char * JamoL[] = { 16187 "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", 16188 "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H" 16189 }; 16190 const int LCount = C_ARRAY_LENGTH(JamoL); 16191 16192 const char * JamoV[] = { 16193 "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", 16194 "WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI", 16195 "I" 16196 }; 16197 const int VCount = C_ARRAY_LENGTH(JamoV); 16198 16199 const char * JamoT[] = { 16200 "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", 16201 "LG", "LM", "LB", "LS", "LT", "LP", "LH", "M", "B", 16202 "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H" 16203 }; 16204 const int TCount = C_ARRAY_LENGTH(JamoT); 16205 16206 int L, V, T; 16207 16208 /* This is the initial Hangul syllable code point; each time through the 16209 * inner loop, it maps to the next higher code point. For more info, 16210 * see the Hangul syllable section of the Unicode standard. */ 16211 int cp = 0xAC00; 16212 16213 syllable_name = sv_2mortal(newSV(syl_max_len)); 16214 sv_setpvn(syllable_name, hangul_prefix, hangul_prefix_len); 16215 16216 for (L = 0; L < LCount; L++) { 16217 for (V = 0; V < VCount; V++) { 16218 for (T = 0; T < TCount; T++) { 16219 16220 /* Truncate back to the prefix, which is unvarying */ 16221 SvCUR_set(syllable_name, hangul_prefix_len); 16222 16223 sv_catpv(syllable_name, JamoL[L]); 16224 sv_catpv(syllable_name, JamoV[V]); 16225 sv_catpv(syllable_name, JamoT[T]); 16226 16227 if (execute_wildcard(subpattern_re, 16228 SvPVX(syllable_name), 16229 SvEND(syllable_name), 16230 SvPVX(syllable_name), 0, 16231 syllable_name, 16232 0)) 16233 { 16234 *prop_definition = add_cp_to_invlist(*prop_definition, 16235 cp); 16236 found_matches = TRUE; 16237 } 16238 16239 cp++; 16240 } 16241 } 16242 } 16243 } 16244 16245 /* The rest of the algorithmically generatable names are of the form 16246 * "PREFIX-code_point". The prefixes and the code point limits of each 16247 * were returned to us in the array 'algorithmic_names' from data in 16248 * lib/unicore/Name.pm. 'code_point' in the name is expressed in hex. */ 16249 for (i = 0; i <= av_top_index((AV *) algorithmic_names); i++) { 16250 IV j; 16251 16252 /* Each element of the array is a hash, giving the details for the 16253 * series of names it covers. There is the base name of the characters 16254 * in the series, and the low and high code points in the series. And, 16255 * for optimization purposes a string containing all the legal 16256 * characters that could possibly be in a name in this series. */ 16257 HV * this_series = (HV *) SvRV(* av_fetch((AV *) algorithmic_names, i, 0)); 16258 SV * prefix = * hv_fetchs(this_series, "name", 0); 16259 IV low = SvIV(* hv_fetchs(this_series, "low", 0)); 16260 IV high = SvIV(* hv_fetchs(this_series, "high", 0)); 16261 char * legal = SvPVX(* hv_fetchs(this_series, "legal", 0)); 16262 16263 /* Pre-allocate an SV with enough space */ 16264 SV * algo_name = sv_2mortal(Perl_newSVpvf(aTHX_ "%s-0000", 16265 SvPVX(prefix))); 16266 if (high >= 0x10000) { 16267 sv_catpvs(algo_name, "0"); 16268 } 16269 16270 /* This series can be skipped entirely if the pattern requires 16271 * something longer than any name in the series, or can only match an 16272 * empty name, or contains a character not found in any name in the 16273 * series */ 16274 if ( prog->minlen <= (SSize_t) SvCUR(algo_name) 16275 && prog->maxlen > 0 16276 && (strspn(must, legal) == must_len)) 16277 { 16278 for (j = low; j <= high; j++) { /* For each code point in the series */ 16279 16280 /* Get its name, and see if it matches the subpattern */ 16281 Perl_sv_setpvf(aTHX_ algo_name, "%s-%X", SvPVX(prefix), 16282 (unsigned) j); 16283 16284 if (execute_wildcard(subpattern_re, 16285 SvPVX(algo_name), 16286 SvEND(algo_name), 16287 SvPVX(algo_name), 0, 16288 algo_name, 16289 0)) 16290 { 16291 *prop_definition = add_cp_to_invlist(*prop_definition, j); 16292 found_matches = TRUE; 16293 } 16294 } 16295 } 16296 } 16297 16298 check_empty: 16299 /* Finally, see if the subpattern matches an empty string */ 16300 empty = newSVpvs(""); 16301 if (execute_wildcard(subpattern_re, 16302 SvPVX(empty), 16303 SvEND(empty), 16304 SvPVX(empty), 0, 16305 empty, 16306 0)) 16307 { 16308 /* Many code points have empty names. Currently these are the \p{GC=C} 16309 * ones, minus CC and CF */ 16310 16311 SV * empty_names_ref = get_prop_definition(UNI_C); 16312 SV * empty_names = invlist_clone(empty_names_ref, NULL); 16313 16314 SV * subtract = get_prop_definition(UNI_CC); 16315 16316 _invlist_subtract(empty_names, subtract, &empty_names); 16317 SvREFCNT_dec_NN(empty_names_ref); 16318 SvREFCNT_dec_NN(subtract); 16319 16320 subtract = get_prop_definition(UNI_CF); 16321 _invlist_subtract(empty_names, subtract, &empty_names); 16322 SvREFCNT_dec_NN(subtract); 16323 16324 _invlist_union(*prop_definition, empty_names, prop_definition); 16325 found_matches = TRUE; 16326 SvREFCNT_dec_NN(empty_names); 16327 } 16328 SvREFCNT_dec_NN(empty); 16329 16330 #if 0 16331 /* If we ever were to accept aliases for, say private use names, we would 16332 * need to do something fancier to find empty names. The code below works 16333 * (at the time it was written), and is slower than the above */ 16334 const char empties_pat[] = "^."; 16335 if (strNE(name, empties_pat)) { 16336 SV * empty = newSVpvs(""); 16337 if (execute_wildcard(subpattern_re, 16338 SvPVX(empty), 16339 SvEND(empty), 16340 SvPVX(empty), 0, 16341 empty, 16342 0)) 16343 { 16344 SV * empties = NULL; 16345 16346 (void) handle_names_wildcard(empties_pat, strlen(empties_pat), &empties); 16347 16348 _invlist_union_complement_2nd(*prop_definition, empties, prop_definition); 16349 SvREFCNT_dec_NN(empties); 16350 16351 found_matches = TRUE; 16352 } 16353 SvREFCNT_dec_NN(empty); 16354 } 16355 #endif 16356 16357 SvREFCNT_dec_NN(subpattern_re); 16358 return found_matches; 16359 } 16360 16361 /* 16362 * ex: set ts=8 sts=4 sw=4 et: 16363 */ 16364