1 #ifdef PERL_EXT_RE_BUILD 2 #include "re_top.h" 3 #endif 4 5 #include "EXTERN.h" 6 #define PERL_IN_REGEX_ENGINE 7 #define PERL_IN_REGCOMP_ANY 8 #define PERL_IN_REGCOMP_DEBUG_C 9 #include "perl.h" 10 11 #ifdef PERL_IN_XSUB_RE 12 # include "re_comp.h" 13 #else 14 # include "regcomp.h" 15 #endif 16 17 #include "invlist_inline.h" 18 #include "unicode_constants.h" 19 #include "regcomp_internal.h" 20 21 #ifdef PERL_RE_BUILD_DEBUG 22 int 23 Perl_re_printf(pTHX_ const char *fmt, ...) 24 { 25 va_list ap; 26 int result; 27 PerlIO *f= Perl_debug_log; 28 PERL_ARGS_ASSERT_RE_PRINTF; 29 va_start(ap, fmt); 30 result = PerlIO_vprintf(f, fmt, ap); 31 va_end(ap); 32 return result; 33 } 34 35 int 36 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...) 37 { 38 va_list ap; 39 int result; 40 PerlIO *f= Perl_debug_log; 41 PERL_ARGS_ASSERT_RE_INDENTF; 42 va_start(ap, depth); 43 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, ""); 44 result = PerlIO_vprintf(f, fmt, ap); 45 va_end(ap); 46 return result; 47 } 48 49 void 50 Perl_debug_show_study_flags(pTHX_ U32 flags, const char *open_str, 51 const char *close_str) 52 { 53 PERL_ARGS_ASSERT_DEBUG_SHOW_STUDY_FLAGS; 54 if (!flags) 55 return; 56 57 Perl_re_printf( aTHX_ "%s", open_str); 58 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL); 59 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL); 60 DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF); 61 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR); 62 DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR); 63 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL); 64 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR); 65 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND); 66 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR); 67 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS); 68 DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS); 69 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY); 70 DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT); 71 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY); 72 DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE); 73 Perl_re_printf( aTHX_ "%s", close_str); 74 } 75 76 void 77 Perl_debug_studydata(pTHX_ const char *where, scan_data_t *data, 78 U32 depth, int is_inf, 79 SSize_t min, SSize_t stopmin, SSize_t delta) 80 { 81 PERL_ARGS_ASSERT_DEBUG_STUDYDATA; 82 DECLARE_AND_GET_RE_DEBUG_FLAGS; 83 84 DEBUG_OPTIMISE_MORE_r({ 85 if (!data) { 86 Perl_re_indentf(aTHX_ "%s: NO DATA", 87 depth, 88 where); 89 return; 90 } 91 Perl_re_indentf(aTHX_ "%s: M/S/D: %" IVdf "/%" IVdf "/%" IVdf " Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf, 92 depth, 93 where, 94 min, stopmin, delta, 95 (IV)data->pos_min, 96 (IV)data->pos_delta, 97 (UV)data->flags 98 ); 99 100 Perl_debug_show_study_flags(aTHX_ data->flags," [","]"); 101 102 Perl_re_printf( aTHX_ 103 " Whilem_c: %" IVdf " Lcp: %" IVdf " %s", 104 (IV)data->whilem_c, 105 (IV)(data->last_closep ? *((data)->last_closep) : -1), 106 is_inf ? "INF " : "" 107 ); 108 109 if (data->last_found) { 110 int i; 111 Perl_re_printf(aTHX_ 112 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf, 113 SvPVX_const(data->last_found), 114 (IV)data->last_end, 115 (IV)data->last_start_min, 116 (IV)data->last_start_max 117 ); 118 119 for (i = 0; i < 2; i++) { 120 Perl_re_printf(aTHX_ 121 " %s%s: '%s' @ %" IVdf "/%" IVdf, 122 data->cur_is_floating == i ? "*" : "", 123 i ? "Float" : "Fixed", 124 SvPVX_const(data->substrs[i].str), 125 (IV)data->substrs[i].min_offset, 126 (IV)data->substrs[i].max_offset 127 ); 128 Perl_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]"); 129 } 130 } 131 132 Perl_re_printf( aTHX_ "\n"); 133 }); 134 } 135 136 137 void 138 Perl_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state, 139 regnode *scan, U32 depth, U32 flags) 140 { 141 PERL_ARGS_ASSERT_DEBUG_PEEP; 142 DECLARE_AND_GET_RE_DEBUG_FLAGS; 143 144 DEBUG_OPTIMISE_r({ 145 regnode *Next; 146 147 if (!scan) 148 return; 149 Next = regnext(scan); 150 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state); 151 Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)", 152 depth, 153 str, 154 REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv), 155 Next ? (REG_NODE_NUM(Next)) : 0 ); 156 Perl_debug_show_study_flags(aTHX_ flags," [ ","]"); 157 Perl_re_printf( aTHX_ "\n"); 158 }); 159 } 160 161 const regnode * 162 Perl_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, 163 const regnode *last, const regnode *plast, 164 SV* sv, I32 indent, U32 depth) 165 { 166 const regnode *next; 167 const regnode *optstart= NULL; 168 169 RXi_GET_DECL(r, ri); 170 DECLARE_AND_GET_RE_DEBUG_FLAGS; 171 172 PERL_ARGS_ASSERT_DUMPUNTIL; 173 174 #ifdef DEBUG_DUMPUNTIL 175 Perl_re_printf( aTHX_ "--- %d : %d - %d - %d\n", indent, node-start, 176 last ? last-start : 0, plast ? plast-start : 0); 177 #endif 178 179 if (plast && plast < last) 180 last= plast; 181 182 while (node && (!last || node < last)) { 183 const U8 op = OP(node); 184 185 if (op == CLOSE || op == SRCLOSE || op == WHILEM) 186 indent--; 187 next = regnext((regnode *)node); 188 const regnode *after = regnode_after((regnode *)node,0); 189 190 /* Where, what. */ 191 if (op == OPTIMIZED) { 192 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE)) 193 optstart = node; 194 else 195 goto after_print; 196 } else 197 CLEAR_OPTSTART; 198 199 regprop(r, sv, node, NULL, NULL); 200 Perl_re_printf( aTHX_ "%4" IVdf ":%*s%s", (IV)(node - start), 201 (int)(2*indent + 1), "", SvPVX_const(sv)); 202 203 if (op != OPTIMIZED) { 204 if (next == NULL) /* Next ptr. */ 205 Perl_re_printf( aTHX_ " (0)"); 206 else if (REGNODE_TYPE(op) == BRANCH 207 && REGNODE_TYPE(OP(next)) != BRANCH ) 208 Perl_re_printf( aTHX_ " (FAIL)"); 209 else 210 Perl_re_printf( aTHX_ " (%" IVdf ")", (IV)(next - start)); 211 Perl_re_printf( aTHX_ "\n"); 212 } 213 214 after_print: 215 if (REGNODE_TYPE(op) == BRANCHJ) { 216 assert(next); 217 const regnode *nnode = (OP(next) == LONGJMP 218 ? regnext((regnode *)next) 219 : next); 220 if (last && nnode > last) 221 nnode = last; 222 DUMPUNTIL(after, nnode); 223 } 224 else if (REGNODE_TYPE(op) == BRANCH) { 225 assert(next); 226 DUMPUNTIL(after, next); 227 } 228 else if ( REGNODE_TYPE(op) == TRIE ) { 229 const regnode *this_trie = node; 230 const U32 n = ARG1u(node); 231 const reg_ac_data * const ac = op>=AHOCORASICK ? 232 (reg_ac_data *)ri->data->data[n] : 233 NULL; 234 const reg_trie_data * const trie = 235 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie]; 236 #ifdef DEBUGGING 237 AV *const trie_words 238 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]); 239 #endif 240 const regnode *nextbranch= NULL; 241 I32 word_idx; 242 SvPVCLEAR(sv); 243 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) { 244 SV ** const elem_ptr = av_fetch_simple(trie_words, word_idx, 0); 245 246 Perl_re_indentf( aTHX_ "%s ", 247 indent+3, 248 elem_ptr 249 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), 250 SvCUR(*elem_ptr), PL_dump_re_max_len, 251 PL_colors[0], PL_colors[1], 252 (SvUTF8(*elem_ptr) 253 ? PERL_PV_ESCAPE_UNI 254 : 0) 255 | PERL_PV_PRETTY_ELLIPSES 256 | PERL_PV_PRETTY_LTGT 257 ) 258 : "???" 259 ); 260 if (trie->jump) { 261 U16 dist= trie->jump[word_idx+1]; 262 Perl_re_printf( aTHX_ "(%" UVuf ")\n", 263 (UV)((dist ? this_trie + dist : next) - start)); 264 if (dist) { 265 if (!nextbranch) 266 nextbranch= this_trie + trie->jump[0]; 267 DUMPUNTIL(this_trie + dist, nextbranch); 268 } 269 if (nextbranch && REGNODE_TYPE(OP(nextbranch))==BRANCH) 270 nextbranch= regnext((regnode *)nextbranch); 271 } else { 272 Perl_re_printf( aTHX_ "\n"); 273 } 274 } 275 if (last && next > last) 276 node= last; 277 else 278 node= next; 279 } 280 else if ( op == CURLY ) { /* "next" might be very big: optimizer */ 281 DUMPUNTIL(after, after + 1); /* +1 is NOT a REGNODE_AFTER */ 282 } 283 else if (REGNODE_TYPE(op) == CURLY && op != CURLYX) { 284 assert(next); 285 DUMPUNTIL(after, next); 286 } 287 else if ( op == PLUS || op == STAR) { 288 DUMPUNTIL(after, after + 1); /* +1 NOT a REGNODE_AFTER */ 289 } 290 else if (REGNODE_TYPE(op) == EXACT || op == ANYOFHs) { 291 /* Literal string, where present. */ 292 node = (const regnode *)REGNODE_AFTER_varies(node); 293 } 294 else { 295 node = REGNODE_AFTER_opcode(node,op); 296 } 297 if (op == CURLYX || op == OPEN || op == SROPEN) 298 indent++; 299 if (REGNODE_TYPE(op) == END) 300 break; 301 } 302 CLEAR_OPTSTART; 303 #ifdef DEBUG_DUMPUNTIL 304 Perl_re_printf( aTHX_ "--- %d\n", (int)indent); 305 #endif 306 return node; 307 } 308 309 #endif /* PERL_RE_BUILD_DEBUG */ 310 311 /* 312 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form 313 */ 314 #ifdef DEBUGGING 315 static void 316 S_regdump_intflags(pTHX_ const char *lead, const U32 flags) 317 { 318 int bit; 319 int set=0; 320 321 STATIC_ASSERT_STMT(REG_INTFLAGS_NAME_SIZE <= sizeof(flags) * CHARBITS); 322 323 for (bit=0; bit < REG_INTFLAGS_NAME_SIZE; bit++) { 324 if (flags & (1<<bit)) { 325 if (!set++ && lead) 326 Perl_re_printf( aTHX_ "%s", lead); 327 Perl_re_printf( aTHX_ "%s ", PL_reg_intflags_name[bit]); 328 } 329 } 330 if (lead) { 331 if (set) 332 Perl_re_printf( aTHX_ "\n"); 333 else 334 Perl_re_printf( aTHX_ "%s[none-set]\n", lead); 335 } 336 } 337 338 static void 339 S_regdump_extflags(pTHX_ const char *lead, const U32 flags) 340 { 341 int bit; 342 int set=0; 343 regex_charset cs; 344 345 STATIC_ASSERT_STMT(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags) * CHARBITS); 346 347 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) { 348 if (flags & (1U<<bit)) { 349 if ((1U<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */ 350 continue; 351 } 352 if (!set++ && lead) 353 Perl_re_printf( aTHX_ "%s", lead); 354 Perl_re_printf( aTHX_ "%s ", PL_reg_extflags_name[bit]); 355 } 356 } 357 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) { 358 if (!set++ && lead) { 359 Perl_re_printf( aTHX_ "%s", lead); 360 } 361 switch (cs) { 362 case REGEX_UNICODE_CHARSET: 363 Perl_re_printf( aTHX_ "UNICODE"); 364 break; 365 case REGEX_LOCALE_CHARSET: 366 Perl_re_printf( aTHX_ "LOCALE"); 367 break; 368 case REGEX_ASCII_RESTRICTED_CHARSET: 369 Perl_re_printf( aTHX_ "ASCII-RESTRICTED"); 370 break; 371 case REGEX_ASCII_MORE_RESTRICTED_CHARSET: 372 Perl_re_printf( aTHX_ "ASCII-MORE_RESTRICTED"); 373 break; 374 default: 375 Perl_re_printf( aTHX_ "UNKNOWN CHARACTER SET"); 376 break; 377 } 378 } 379 if (lead) { 380 if (set) 381 Perl_re_printf( aTHX_ "\n"); 382 else 383 Perl_re_printf( aTHX_ "%s[none-set]\n", lead); 384 } 385 } 386 #endif 387 388 void 389 Perl_regdump(pTHX_ const regexp *r) 390 { 391 #ifdef DEBUGGING 392 int i; 393 SV * const sv = sv_newmortal(); 394 SV *dsv= sv_newmortal(); 395 RXi_GET_DECL(r, ri); 396 DECLARE_AND_GET_RE_DEBUG_FLAGS; 397 398 PERL_ARGS_ASSERT_REGDUMP; 399 400 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0); 401 402 /* Header fields of interest. */ 403 for (i = 0; i < 2; i++) { 404 if (r->substrs->data[i].substr) { 405 RE_PV_QUOTED_DECL(s, 0, dsv, 406 SvPVX_const(r->substrs->data[i].substr), 407 RE_SV_DUMPLEN(r->substrs->data[i].substr), 408 PL_dump_re_max_len); 409 Perl_re_printf( aTHX_ 410 "%s %s%s at %" IVdf "..%" UVuf " ", 411 i ? "floating" : "anchored", 412 s, 413 RE_SV_TAIL(r->substrs->data[i].substr), 414 (IV)r->substrs->data[i].min_offset, 415 (UV)r->substrs->data[i].max_offset); 416 } 417 else if (r->substrs->data[i].utf8_substr) { 418 RE_PV_QUOTED_DECL(s, 1, dsv, 419 SvPVX_const(r->substrs->data[i].utf8_substr), 420 RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr), 421 30); 422 Perl_re_printf( aTHX_ 423 "%s utf8 %s%s at %" IVdf "..%" UVuf " ", 424 i ? "floating" : "anchored", 425 s, 426 RE_SV_TAIL(r->substrs->data[i].utf8_substr), 427 (IV)r->substrs->data[i].min_offset, 428 (UV)r->substrs->data[i].max_offset); 429 } 430 } 431 432 if (r->check_substr || r->check_utf8) 433 Perl_re_printf( aTHX_ 434 (const char *) 435 ( r->check_substr == r->substrs->data[1].substr 436 && r->check_utf8 == r->substrs->data[1].utf8_substr 437 ? "(checking floating" : "(checking anchored")); 438 if (r->intflags & PREGf_NOSCAN) 439 Perl_re_printf( aTHX_ " noscan"); 440 if (r->extflags & RXf_CHECK_ALL) 441 Perl_re_printf( aTHX_ " isall"); 442 if (r->check_substr || r->check_utf8) 443 Perl_re_printf( aTHX_ ") "); 444 445 if (ri->regstclass) { 446 regprop(r, sv, ri->regstclass, NULL, NULL); 447 Perl_re_printf( aTHX_ "stclass %s ", SvPVX_const(sv)); 448 } 449 if (r->intflags & PREGf_ANCH) { 450 Perl_re_printf( aTHX_ "anchored"); 451 if (r->intflags & PREGf_ANCH_MBOL) 452 Perl_re_printf( aTHX_ "(MBOL)"); 453 if (r->intflags & PREGf_ANCH_SBOL) 454 Perl_re_printf( aTHX_ "(SBOL)"); 455 if (r->intflags & PREGf_ANCH_GPOS) 456 Perl_re_printf( aTHX_ "(GPOS)"); 457 Perl_re_printf( aTHX_ " "); 458 } 459 if (r->intflags & PREGf_GPOS_SEEN) 460 Perl_re_printf( aTHX_ "GPOS:%" UVuf " ", (UV)r->gofs); 461 if (r->intflags & PREGf_SKIP) 462 Perl_re_printf( aTHX_ "plus "); 463 if (r->intflags & PREGf_IMPLICIT) 464 Perl_re_printf( aTHX_ "implicit "); 465 Perl_re_printf( aTHX_ "minlen %" IVdf " ", (IV)r->minlen); 466 if (r->extflags & RXf_EVAL_SEEN) 467 Perl_re_printf( aTHX_ "with eval "); 468 Perl_re_printf( aTHX_ "\n"); 469 DEBUG_FLAGS_r({ 470 regdump_extflags("r->extflags: ", r->extflags); 471 regdump_intflags("r->intflags: ", r->intflags); 472 }); 473 #else 474 PERL_ARGS_ASSERT_REGDUMP; 475 PERL_UNUSED_CONTEXT; 476 PERL_UNUSED_ARG(r); 477 #endif /* DEBUGGING */ 478 } 479 480 /* Should be synchronized with ANYOF_ #defines in regcomp.h */ 481 #ifdef DEBUGGING 482 483 # if CC_WORDCHAR_ != 0 || CC_DIGIT_ != 1 || CC_ALPHA_ != 2 \ 484 || CC_LOWER_ != 3 || CC_UPPER_ != 4 || CC_PUNCT_ != 5 \ 485 || CC_PRINT_ != 6 || CC_ALPHANUMERIC_ != 7 || CC_GRAPH_ != 8 \ 486 || CC_CASED_ != 9 || CC_SPACE_ != 10 || CC_BLANK_ != 11 \ 487 || CC_XDIGIT_ != 12 || CC_CNTRL_ != 13 || CC_ASCII_ != 14 \ 488 || CC_VERTSPACE_ != 15 489 # error Need to adjust order of anyofs[] 490 # endif 491 static const char * const anyofs[] = { 492 "\\w", 493 "\\W", 494 "\\d", 495 "\\D", 496 "[:alpha:]", 497 "[:^alpha:]", 498 "[:lower:]", 499 "[:^lower:]", 500 "[:upper:]", 501 "[:^upper:]", 502 "[:punct:]", 503 "[:^punct:]", 504 "[:print:]", 505 "[:^print:]", 506 "[:alnum:]", 507 "[:^alnum:]", 508 "[:graph:]", 509 "[:^graph:]", 510 "[:cased:]", 511 "[:^cased:]", 512 "\\s", 513 "\\S", 514 "[:blank:]", 515 "[:^blank:]", 516 "[:xdigit:]", 517 "[:^xdigit:]", 518 "[:cntrl:]", 519 "[:^cntrl:]", 520 "[:ascii:]", 521 "[:^ascii:]", 522 "\\v", 523 "\\V" 524 }; 525 #endif 526 527 /* 528 - regprop - printable representation of opcode, with run time support 529 */ 530 531 void 532 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state) 533 { 534 #ifdef DEBUGGING 535 U8 k; 536 const U8 op = OP(o); 537 RXi_GET_DECL(prog, progi); 538 DECLARE_AND_GET_RE_DEBUG_FLAGS; 539 540 PERL_ARGS_ASSERT_REGPROP; 541 542 SvPVCLEAR(sv); 543 544 if (op > REGNODE_MAX) { /* regnode.type is unsigned */ 545 if (pRExC_state) { /* This gives more info, if we have it */ 546 FAIL3("panic: corrupted regexp opcode %d > %d", 547 (int)op, (int)REGNODE_MAX); 548 } 549 else { 550 Perl_croak(aTHX_ "panic: corrupted regexp opcode %d > %d", 551 (int)op, (int)REGNODE_MAX); 552 } 553 } 554 sv_catpv(sv, REGNODE_NAME(op)); /* Take off const! */ 555 556 k = REGNODE_TYPE(op); 557 if (op == BRANCH) { 558 Perl_sv_catpvf(aTHX_ sv, " (buf:%" IVdf "/%" IVdf ")", (IV)ARG1a(o),(IV)ARG1b(o)); 559 } 560 else if (op == BRANCHJ) { 561 Perl_sv_catpvf(aTHX_ sv, " (buf:%" IVdf "/%" IVdf ")", (IV)ARG2a(o),(IV)ARG2b(o)); 562 } 563 else if (k == EXACT) { 564 sv_catpvs(sv, " "); 565 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT) 566 * is a crude hack but it may be the best for now since 567 * we have no flag "this EXACTish node was UTF-8" 568 * --jhi */ 569 pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len, 570 PL_colors[0], PL_colors[1], 571 PERL_PV_ESCAPE_UNI_DETECT | 572 PERL_PV_ESCAPE_NONASCII | 573 PERL_PV_PRETTY_ELLIPSES | 574 PERL_PV_PRETTY_LTGT | 575 PERL_PV_PRETTY_NOCLEAR 576 ); 577 } else if (k == TRIE) { 578 /* print the details of the trie in dumpuntil instead, as 579 * progi->data isn't available here */ 580 const U32 n = ARG1u(o); 581 const reg_ac_data * const ac = IS_TRIE_AC(op) ? 582 (reg_ac_data *)progi->data->data[n] : 583 NULL; 584 const reg_trie_data * const trie 585 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie]; 586 587 Perl_sv_catpvf(aTHX_ sv, "-%s", REGNODE_NAME(FLAGS(o))); 588 DEBUG_TRIE_COMPILE_r({ 589 if (trie->jump) 590 sv_catpvs(sv, "(JUMP)"); 591 Perl_sv_catpvf(aTHX_ sv, 592 "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">", 593 (UV)trie->startstate, 594 (IV)trie->statecount-1, /* -1 because of the unused 0 element */ 595 (UV)trie->wordcount, 596 (UV)trie->minlen, 597 (UV)trie->maxlen, 598 (UV)TRIE_CHARCOUNT(trie), 599 (UV)trie->uniquecharcount 600 ); 601 }); 602 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) { 603 sv_catpvs(sv, "["); 604 (void) put_charclass_bitmap_innards(sv, 605 ((IS_ANYOF_TRIE(op)) 606 ? ANYOF_BITMAP(o) 607 : TRIE_BITMAP(trie)), 608 NULL, 609 NULL, 610 NULL, 611 0, 612 FALSE 613 ); 614 sv_catpvs(sv, "]"); 615 } 616 if (trie->before_paren || trie->after_paren) 617 Perl_sv_catpvf(aTHX_ sv, " (buf:%" IVdf "/%" IVdf ")", 618 (IV)trie->before_paren,(IV)trie->after_paren); 619 } else if (k == CURLY) { 620 U32 lo = ARG1i(o), hi = ARG2i(o); 621 if (ARG3u(o)) /* check both ARG3a and ARG3b at the same time */ 622 Perl_sv_catpvf(aTHX_ sv, "<%d:%d>", ARG3a(o),ARG3b(o)); /* paren before, paren after */ 623 if (op == CURLYM || op == CURLYN || op == CURLYX) 624 Perl_sv_catpvf(aTHX_ sv, "[%d]", FLAGS(o)); /* Parenth number */ 625 Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo); 626 if (hi == REG_INFTY) 627 sv_catpvs(sv, "INFTY"); 628 else 629 Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi); 630 sv_catpvs(sv, "}"); 631 } 632 else if (k == WHILEM && FLAGS(o)) /* Ordinal/of */ 633 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", FLAGS(o) & 0xf, FLAGS(o)>>4); 634 else if (k == REF || k == OPEN || k == CLOSE 635 || k == GROUPP || op == ACCEPT) 636 { 637 AV *name_list= NULL; 638 U32 parno= (op == ACCEPT) ? ARG2u(o) : 639 (op == OPEN || op == CLOSE) ? PARNO(o) : 640 ARG1u(o); 641 if ( RXp_PAREN_NAMES(prog) ) { 642 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); 643 } else if ( pRExC_state ) { 644 name_list= RExC_paren_name_list; 645 } 646 if ( name_list ) { 647 if ( k != REF || (op < REFN)) { 648 UV logical_parno = parno; 649 if (prog->parno_to_logical) 650 logical_parno = prog->parno_to_logical[parno]; 651 652 Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)logical_parno); /* Parenth number */ 653 if (parno != logical_parno) 654 Perl_sv_catpvf(aTHX_ sv, "/%" UVuf, (UV)parno); /* Parenth number */ 655 656 SV **name= av_fetch_simple(name_list, parno, 0 ); 657 if (name) 658 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name)); 659 } 660 else 661 if (parno > 0) { 662 /* parno must always be larger than 0 for this block 663 * as it represents a slot into the data array, which 664 * has the 0 slot reserved for a placeholder so any valid 665 * index into it is always true, eg non-zero 666 * see the '%' "what" type and the implementation of 667 * S_reg_add_data() 668 */ 669 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]); 670 I32 *nums=(I32*)SvPVX(sv_dat); 671 SV **name= av_fetch_simple(name_list, nums[0], 0 ); 672 I32 n; 673 if (name) { 674 for ( n=0; n<SvIVX(sv_dat); n++ ) { 675 Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf, 676 (n ? "," : ""), (IV)nums[n]); 677 } 678 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name)); 679 } 680 } 681 } else if (parno>0) { 682 UV logical_parno = parno; 683 if (prog->parno_to_logical) 684 logical_parno = prog->parno_to_logical[parno]; 685 686 Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)logical_parno); /* Parenth number */ 687 if (logical_parno != parno) 688 Perl_sv_catpvf(aTHX_ sv, "/%" UVuf, (UV)parno); /* Parenth number */ 689 690 } 691 if ( k == REF ) { 692 Perl_sv_catpvf(aTHX_ sv, " <%" IVdf ">", (IV)ARG2i(o)); 693 } 694 if ( k == REF && reginfo) { 695 U32 n = ARG1u(o); /* which paren pair */ 696 I32 ln = RXp_OFFS_START(prog,n); 697 if (RXp_LASTPAREN(prog) < n || ln == -1 || RXp_OFFS_END(prog,n) == -1) 698 Perl_sv_catpvf(aTHX_ sv, ": FAIL"); 699 else if (ln == RXp_OFFS_END(prog,n)) 700 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING"); 701 else { 702 const char *s = reginfo->strbeg + ln; 703 Perl_sv_catpvf(aTHX_ sv, ": "); 704 Perl_pv_pretty( aTHX_ sv, s, RXp_OFFS_END(prog,n) - RXp_OFFS_START(prog,n), 32, 0, 0, 705 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE ); 706 } 707 } 708 } else if (k == GOSUB) { 709 AV *name_list= NULL; 710 IV parno = ARG1u(o); 711 IV logical_parno = (parno && prog->parno_to_logical) 712 ? prog->parno_to_logical[parno] 713 : parno; 714 if ( RXp_PAREN_NAMES(prog) ) { 715 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); 716 } else if ( pRExC_state ) { 717 name_list= RExC_paren_name_list; 718 } 719 720 /* Paren and offset */ 721 Perl_sv_catpvf(aTHX_ sv, "%" IVdf, logical_parno); 722 if (logical_parno != parno) 723 Perl_sv_catpvf(aTHX_ sv, "/%" IVdf, parno); 724 725 Perl_sv_catpvf(aTHX_ sv, "[%+d:%d]", (int)ARG2i(o), 726 (int)((o + (int)ARG2i(o)) - progi->program) ); 727 if (name_list) { 728 SV **name= av_fetch_simple(name_list, ARG1u(o), 0 ); 729 if (name) 730 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name)); 731 } 732 } 733 else if (k == LOGICAL) 734 /* 2: embedded, otherwise 1 */ 735 Perl_sv_catpvf(aTHX_ sv, "[%d]", FLAGS(o)); 736 else if (k == ANYOF || k == ANYOFH || k == ANYOFR) { 737 U8 flags; 738 char * bitmap; 739 U8 do_sep = 0; /* Do we need to separate various components of the 740 output? */ 741 /* Set if there is still an unresolved user-defined property */ 742 SV *unresolved = NULL; 743 744 /* Things that are ignored except when the runtime locale is UTF-8 */ 745 SV *only_utf8_locale_invlist = NULL; 746 747 /* Code points that don't fit in the bitmap */ 748 SV *nonbitmap_invlist = NULL; 749 750 /* And things that aren't in the bitmap, but are small enough to be */ 751 SV* bitmap_range_not_in_bitmap = NULL; 752 753 bool inverted; 754 755 if (k != ANYOF) { 756 flags = 0; 757 bitmap = NULL; 758 } 759 else { 760 flags = ANYOF_FLAGS(o); 761 bitmap = ANYOF_BITMAP(o); 762 } 763 764 if (op == ANYOFL || op == ANYOFPOSIXL) { 765 if ((flags & ANYOFL_UTF8_LOCALE_REQD)) { 766 sv_catpvs(sv, "{utf8-locale-reqd}"); 767 } 768 if (flags & ANYOFL_FOLD) { 769 sv_catpvs(sv, "{i}"); 770 } 771 } 772 773 inverted = flags & ANYOF_INVERT; 774 775 /* If there is stuff outside the bitmap, get it */ 776 if (k == ANYOFR) { 777 778 /* For a single range, split into the parts inside vs outside the 779 * bitmap. */ 780 UV start = ANYOFRbase(o); 781 UV end = ANYOFRbase(o) + ANYOFRdelta(o); 782 783 if (start < NUM_ANYOF_CODE_POINTS) { 784 if (end < NUM_ANYOF_CODE_POINTS) { 785 bitmap_range_not_in_bitmap 786 = _add_range_to_invlist(bitmap_range_not_in_bitmap, 787 start, end); 788 } 789 else { 790 bitmap_range_not_in_bitmap 791 = _add_range_to_invlist(bitmap_range_not_in_bitmap, 792 start, NUM_ANYOF_CODE_POINTS); 793 start = NUM_ANYOF_CODE_POINTS; 794 } 795 } 796 797 if (start >= NUM_ANYOF_CODE_POINTS) { 798 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist, 799 ANYOFRbase(o), 800 ANYOFRbase(o) + ANYOFRdelta(o)); 801 } 802 } 803 else if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(o)) { 804 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist, 805 NUM_ANYOF_CODE_POINTS, 806 UV_MAX); 807 } 808 else if (ANYOF_HAS_AUX(o)) { 809 (void) GET_REGCLASS_AUX_DATA(prog, o, FALSE, 810 &unresolved, 811 &only_utf8_locale_invlist, 812 &nonbitmap_invlist); 813 814 /* The aux data may contain stuff that could fit in the bitmap. 815 * This could come from a user-defined property being finally 816 * resolved when this call was done; or much more likely because 817 * there are matches that require UTF-8 to be valid, and so aren't 818 * in the bitmap (or ANYOFR). This is teased apart later */ 819 _invlist_intersection(nonbitmap_invlist, 820 PL_InBitmap, 821 &bitmap_range_not_in_bitmap); 822 /* Leave just the things that don't fit into the bitmap */ 823 _invlist_subtract(nonbitmap_invlist, 824 PL_InBitmap, 825 &nonbitmap_invlist); 826 } 827 828 /* Ready to start outputting. First, the initial left bracket */ 829 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); 830 831 if ( bitmap 832 || bitmap_range_not_in_bitmap 833 || only_utf8_locale_invlist 834 || unresolved) 835 { 836 /* Then all the things that could fit in the bitmap */ 837 do_sep = put_charclass_bitmap_innards( 838 sv, 839 bitmap, 840 bitmap_range_not_in_bitmap, 841 only_utf8_locale_invlist, 842 o, 843 flags, 844 845 /* Can't try inverting for a 846 * better display if there 847 * are things that haven't 848 * been resolved */ 849 (unresolved != NULL || k == ANYOFR)); 850 SvREFCNT_dec(bitmap_range_not_in_bitmap); 851 852 /* If there are user-defined properties which haven't been defined 853 * yet, output them. If the result is not to be inverted, it is 854 * clearest to output them in a separate [] from the bitmap range 855 * stuff. If the result is to be complemented, we have to show 856 * everything in one [], as the inversion applies to the whole 857 * thing. Use {braces} to separate them from anything in the 858 * bitmap and anything above the bitmap. */ 859 if (unresolved) { 860 if (inverted) { 861 if (! do_sep) { /* If didn't output anything in the bitmap 862 */ 863 sv_catpvs(sv, "^"); 864 } 865 sv_catpvs(sv, "{"); 866 } 867 else if (do_sep) { 868 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], 869 PL_colors[0]); 870 } 871 sv_catsv(sv, unresolved); 872 if (inverted) { 873 sv_catpvs(sv, "}"); 874 } 875 do_sep = ! inverted; 876 } 877 else if ( do_sep == 2 878 && ! nonbitmap_invlist 879 && ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(o)) 880 { 881 /* Here, the display shows the class as inverted, and 882 * everything above the lower display should also match, but 883 * there is no indication of that. Add this range so the code 884 * below will add it to the display */ 885 _invlist_union_complement_2nd(nonbitmap_invlist, 886 PL_InBitmap, 887 &nonbitmap_invlist); 888 } 889 } 890 891 /* And, finally, add the above-the-bitmap stuff */ 892 if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) { 893 SV* contents; 894 895 /* See if truncation size is overridden */ 896 const STRLEN dump_len = (PL_dump_re_max_len > 256) 897 ? PL_dump_re_max_len 898 : 256; 899 900 /* This is output in a separate [] */ 901 if (do_sep) { 902 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]); 903 } 904 905 /* And, for easy of understanding, it is shown in the 906 * uncomplemented form if possible. The one exception being if 907 * there are unresolved items, where the inversion has to be 908 * delayed until runtime */ 909 if (inverted && ! unresolved) { 910 _invlist_invert(nonbitmap_invlist); 911 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist); 912 } 913 914 contents = invlist_contents(nonbitmap_invlist, 915 FALSE /* output suitable for catsv */ 916 ); 917 918 /* If the output is shorter than the permissible maximum, just do it. */ 919 if (SvCUR(contents) <= dump_len) { 920 sv_catsv(sv, contents); 921 } 922 else { 923 const char * contents_string = SvPVX(contents); 924 STRLEN i = dump_len; 925 926 /* Otherwise, start at the permissible max and work back to the 927 * first break possibility */ 928 while (i > 0 && contents_string[i] != ' ') { 929 i--; 930 } 931 if (i == 0) { /* Fail-safe. Use the max if we couldn't 932 find a legal break */ 933 i = dump_len; 934 } 935 936 sv_catpvn(sv, contents_string, i); 937 sv_catpvs(sv, "..."); 938 } 939 940 SvREFCNT_dec_NN(contents); 941 SvREFCNT_dec_NN(nonbitmap_invlist); 942 } 943 944 /* And finally the matching, closing ']' */ 945 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); 946 947 if (op == ANYOFHs) { 948 Perl_sv_catpvf(aTHX_ sv, " (Leading UTF-8 bytes=%s", _byte_dump_string((U8 *) ((struct regnode_anyofhs *) o)->string, FLAGS(o), 1)); 949 } 950 else if (REGNODE_TYPE(op) != ANYOF) { 951 U8 lowest = (op != ANYOFHr) 952 ? FLAGS(o) 953 : LOWEST_ANYOF_HRx_BYTE(FLAGS(o)); 954 U8 highest = (op == ANYOFHr) 955 ? HIGHEST_ANYOF_HRx_BYTE(FLAGS(o)) 956 : (op == ANYOFH || op == ANYOFR) 957 ? 0xFF 958 : lowest; 959 #ifndef EBCDIC 960 if (op != ANYOFR || ! isASCII(ANYOFRbase(o) + ANYOFRdelta(o))) 961 #endif 962 { 963 Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest); 964 if (lowest != highest) { 965 Perl_sv_catpvf(aTHX_ sv, "-%02X", highest); 966 } 967 Perl_sv_catpvf(aTHX_ sv, ")"); 968 } 969 } 970 971 SvREFCNT_dec(unresolved); 972 } 973 else if (k == ANYOFM) { 974 SV * cp_list = get_ANYOFM_contents(o); 975 976 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); 977 if (op == NANYOFM) { 978 _invlist_invert(cp_list); 979 } 980 981 put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, 0, TRUE); 982 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); 983 984 SvREFCNT_dec(cp_list); 985 } 986 else if (k == ANYOFHbbm) { 987 SV * cp_list = get_ANYOFHbbm_contents(o); 988 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); 989 990 sv_catsv(sv, invlist_contents(cp_list, 991 FALSE /* output suitable for catsv */ 992 )); 993 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); 994 995 SvREFCNT_dec(cp_list); 996 } 997 else if (k == POSIXD || k == NPOSIXD) { 998 U8 index = FLAGS(o) * 2; 999 if (index < C_ARRAY_LENGTH(anyofs)) { 1000 if (*anyofs[index] != '[') { 1001 sv_catpvs(sv, "["); 1002 } 1003 sv_catpv(sv, anyofs[index]); 1004 if (*anyofs[index] != '[') { 1005 sv_catpvs(sv, "]"); 1006 } 1007 } 1008 else { 1009 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); 1010 } 1011 } 1012 else if (k == BOUND || k == NBOUND) { 1013 /* Must be synced with order of 'bound_type' in regcomp.h */ 1014 const char * const bounds[] = { 1015 "", /* Traditional */ 1016 "{gcb}", 1017 "{lb}", 1018 "{sb}", 1019 "{wb}" 1020 }; 1021 assert(FLAGS(o) < C_ARRAY_LENGTH(bounds)); 1022 sv_catpv(sv, bounds[FLAGS(o)]); 1023 } 1024 else if (k == BRANCHJ && (op == UNLESSM || op == IFMATCH)) { 1025 Perl_sv_catpvf(aTHX_ sv, "[%d", -(FLAGS(o))); 1026 if (NEXT_OFF(o)) { 1027 Perl_sv_catpvf(aTHX_ sv, "..-%d", FLAGS(o) - NEXT_OFF(o)); 1028 } 1029 Perl_sv_catpvf(aTHX_ sv, "]"); 1030 } 1031 else if (op == SBOL) 1032 Perl_sv_catpvf(aTHX_ sv, " /%s/", FLAGS(o) ? "\\A" : "^"); 1033 else if (op == EVAL) { 1034 if (FLAGS(o) & EVAL_OPTIMISTIC_FLAG) 1035 Perl_sv_catpvf(aTHX_ sv, " optimistic"); 1036 } 1037 1038 /* add on the verb argument if there is one */ 1039 if ( ( k == VERB || op == ACCEPT || op == OPFAIL ) && FLAGS(o)) { 1040 if ( ARG1u(o) ) 1041 Perl_sv_catpvf(aTHX_ sv, ":%" SVf, 1042 SVfARG((MUTABLE_SV(progi->data->data[ ARG1u( o ) ])))); 1043 else 1044 sv_catpvs(sv, ":NULL"); 1045 } 1046 #else 1047 PERL_UNUSED_CONTEXT; 1048 PERL_UNUSED_ARG(sv); 1049 PERL_UNUSED_ARG(o); 1050 PERL_UNUSED_ARG(prog); 1051 PERL_UNUSED_ARG(reginfo); 1052 PERL_UNUSED_ARG(pRExC_state); 1053 #endif /* DEBUGGING */ 1054 } 1055 1056 1057 #ifdef DEBUGGING 1058 STATIC void 1059 S_put_code_point(pTHX_ SV *sv, UV c) 1060 { 1061 PERL_ARGS_ASSERT_PUT_CODE_POINT; 1062 1063 if (c > 255) { 1064 Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c); 1065 } 1066 else if (isPRINT(c)) { 1067 const char string = (char) c; 1068 1069 /* We use {phrase} as metanotation in the class, so also escape literal 1070 * braces */ 1071 if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}') 1072 sv_catpvs(sv, "\\"); 1073 sv_catpvn(sv, &string, 1); 1074 } 1075 else if (isMNEMONIC_CNTRL(c)) { 1076 Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c)); 1077 } 1078 else { 1079 Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c); 1080 } 1081 } 1082 1083 STATIC void 1084 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) 1085 { 1086 /* Appends to 'sv' a displayable version of the range of code points from 1087 * 'start' to 'end'. Mnemonics (like '\r') are used for the few controls 1088 * that have them, when they occur at the beginning or end of the range. 1089 * It uses hex to output the remaining code points, unless 'allow_literals' 1090 * is true, in which case the printable ASCII ones are output as-is (though 1091 * some of these will be escaped by put_code_point()). 1092 * 1093 * NOTE: This is designed only for printing ranges of code points that fit 1094 * inside an ANYOF bitmap. Higher code points are simply suppressed 1095 */ 1096 1097 const unsigned int min_range_count = 3; 1098 1099 assert(start <= end); 1100 1101 PERL_ARGS_ASSERT_PUT_RANGE; 1102 1103 while (start <= end) { 1104 UV this_end; 1105 const char * format; 1106 1107 if ( end - start < min_range_count 1108 && (end - start <= 2 || (isPRINT_A(start) && isPRINT_A(end)))) 1109 { 1110 /* Output a range of 1 or 2 chars individually, or longer ranges 1111 * when printable */ 1112 for (; start <= end; start++) { 1113 put_code_point(sv, start); 1114 } 1115 break; 1116 } 1117 1118 /* If permitted by the input options, and there is a possibility that 1119 * this range contains a printable literal, look to see if there is 1120 * one. */ 1121 if (allow_literals && start <= MAX_PRINT_A) { 1122 1123 /* If the character at the beginning of the range isn't an ASCII 1124 * printable, effectively split the range into two parts: 1125 * 1) the portion before the first such printable, 1126 * 2) the rest 1127 * and output them separately. */ 1128 if (! isPRINT_A(start)) { 1129 UV temp_end = start + 1; 1130 1131 /* There is no point looking beyond the final possible 1132 * printable, in MAX_PRINT_A */ 1133 UV max = MIN(end, MAX_PRINT_A); 1134 1135 while (temp_end <= max && ! isPRINT_A(temp_end)) { 1136 temp_end++; 1137 } 1138 1139 /* Here, temp_end points to one beyond the first printable if 1140 * found, or to one beyond 'max' if not. If none found, make 1141 * sure that we use the entire range */ 1142 if (temp_end > MAX_PRINT_A) { 1143 temp_end = end + 1; 1144 } 1145 1146 /* Output the first part of the split range: the part that 1147 * doesn't have printables, with the parameter set to not look 1148 * for literals (otherwise we would infinitely recurse) */ 1149 put_range(sv, start, temp_end - 1, FALSE); 1150 1151 /* The 2nd part of the range (if any) starts here. */ 1152 start = temp_end; 1153 1154 /* We do a continue, instead of dropping down, because even if 1155 * the 2nd part is non-empty, it could be so short that we want 1156 * to output it as individual characters, as tested for at the 1157 * top of this loop. */ 1158 continue; 1159 } 1160 1161 /* Here, 'start' is a printable ASCII. If it is an alphanumeric, 1162 * output a sub-range of just the digits or letters, then process 1163 * the remaining portion as usual. */ 1164 if (isALPHANUMERIC_A(start)) { 1165 UV mask = (isDIGIT_A(start)) 1166 ? CC_DIGIT_ 1167 : isUPPER_A(start) 1168 ? CC_UPPER_ 1169 : CC_LOWER_; 1170 UV temp_end = start + 1; 1171 1172 /* Find the end of the sub-range that includes just the 1173 * characters in the same class as the first character in it */ 1174 while (temp_end <= end && generic_isCC_A_(temp_end, mask)) { 1175 temp_end++; 1176 } 1177 temp_end--; 1178 1179 /* For short ranges, don't duplicate the code above to output 1180 * them; just call recursively */ 1181 if (temp_end - start < min_range_count) { 1182 put_range(sv, start, temp_end, FALSE); 1183 } 1184 else { /* Output as a range */ 1185 put_code_point(sv, start); 1186 sv_catpvs(sv, "-"); 1187 put_code_point(sv, temp_end); 1188 } 1189 start = temp_end + 1; 1190 continue; 1191 } 1192 1193 /* We output any other printables as individual characters */ 1194 if (isPUNCT_A(start) || isSPACE_A(start)) { 1195 while (start <= end && (isPUNCT_A(start) 1196 || isSPACE_A(start))) 1197 { 1198 put_code_point(sv, start); 1199 start++; 1200 } 1201 continue; 1202 } 1203 } /* End of looking for literals */ 1204 1205 /* Here is not to output as a literal. Some control characters have 1206 * mnemonic names. Split off any of those at the beginning and end of 1207 * the range to print mnemonically. It isn't possible for many of 1208 * these to be in a row, so this won't overwhelm with output */ 1209 if ( start <= end 1210 && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end))) 1211 { 1212 while (isMNEMONIC_CNTRL(start) && start <= end) { 1213 put_code_point(sv, start); 1214 start++; 1215 } 1216 1217 /* If this didn't take care of the whole range ... */ 1218 if (start <= end) { 1219 1220 /* Look backwards from the end to find the final non-mnemonic 1221 * */ 1222 UV temp_end = end; 1223 while (isMNEMONIC_CNTRL(temp_end)) { 1224 temp_end--; 1225 } 1226 1227 /* And separately output the interior range that doesn't start 1228 * or end with mnemonics */ 1229 put_range(sv, start, temp_end, FALSE); 1230 1231 /* Then output the mnemonic trailing controls */ 1232 start = temp_end + 1; 1233 while (start <= end) { 1234 put_code_point(sv, start); 1235 start++; 1236 } 1237 break; 1238 } 1239 } 1240 1241 /* As a final resort, output the range or subrange as hex. */ 1242 1243 if (start >= NUM_ANYOF_CODE_POINTS) { 1244 this_end = end; 1245 } 1246 else { /* Have to split range at the bitmap boundary */ 1247 this_end = (end < NUM_ANYOF_CODE_POINTS) 1248 ? end 1249 : NUM_ANYOF_CODE_POINTS - 1; 1250 } 1251 #if NUM_ANYOF_CODE_POINTS > 256 1252 format = (this_end < 256) 1253 ? "\\x%02" UVXf "-\\x%02" UVXf 1254 : "\\x{%04" UVXf "}-\\x{%04" UVXf "}"; 1255 #else 1256 format = "\\x%02" UVXf "-\\x%02" UVXf; 1257 #endif 1258 GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral); 1259 Perl_sv_catpvf(aTHX_ sv, format, start, this_end); 1260 GCC_DIAG_RESTORE_STMT; 1261 break; 1262 } 1263 } 1264 1265 STATIC void 1266 S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist) 1267 { 1268 /* Concatenate onto the PV in 'sv' a displayable form of the inversion list 1269 * 'invlist' */ 1270 1271 UV start, end; 1272 bool allow_literals = TRUE; 1273 1274 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST; 1275 1276 /* Generally, it is more readable if printable characters are output as 1277 * literals, but if a range (nearly) spans all of them, it's best to output 1278 * it as a single range. This code will use a single range if all but 2 1279 * ASCII printables are in it */ 1280 invlist_iterinit(invlist); 1281 while (invlist_iternext(invlist, &start, &end)) { 1282 1283 /* If the range starts beyond the final printable, it doesn't have any 1284 * in it */ 1285 if (start > MAX_PRINT_A) { 1286 break; 1287 } 1288 1289 /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span 1290 * all but two, the range must start and end no later than 2 from 1291 * either end */ 1292 if (start < ' ' + 2 && end > MAX_PRINT_A - 2) { 1293 if (end > MAX_PRINT_A) { 1294 end = MAX_PRINT_A; 1295 } 1296 if (start < ' ') { 1297 start = ' '; 1298 } 1299 if (end - start >= MAX_PRINT_A - ' ' - 2) { 1300 allow_literals = FALSE; 1301 } 1302 break; 1303 } 1304 } 1305 invlist_iterfinish(invlist); 1306 1307 /* Here we have figured things out. Output each range */ 1308 invlist_iterinit(invlist); 1309 while (invlist_iternext(invlist, &start, &end)) { 1310 if (start >= NUM_ANYOF_CODE_POINTS) { 1311 break; 1312 } 1313 put_range(sv, start, end, allow_literals); 1314 } 1315 invlist_iterfinish(invlist); 1316 1317 return; 1318 } 1319 1320 STATIC SV* 1321 S_put_charclass_bitmap_innards_common(pTHX_ 1322 SV* invlist, /* The bitmap */ 1323 SV* posixes, /* Under /l, things like [:word:], \S */ 1324 SV* only_utf8, /* Under /d, matches iff the target is UTF-8 */ 1325 SV* not_utf8, /* /d, matches iff the target isn't UTF-8 */ 1326 SV* only_utf8_locale, /* Under /l, matches if the locale is UTF-8 */ 1327 const bool invert /* Is the result to be inverted? */ 1328 ) 1329 { 1330 /* Create and return an SV containing a displayable version of the bitmap 1331 * and associated information determined by the input parameters. If the 1332 * output would have been only the inversion indicator '^', NULL is instead 1333 * returned. */ 1334 1335 SV * output; 1336 1337 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON; 1338 1339 if (invert) { 1340 output = newSVpvs("^"); 1341 } 1342 else { 1343 output = newSVpvs(""); 1344 } 1345 1346 /* First, the code points in the bitmap that are unconditionally there */ 1347 put_charclass_bitmap_innards_invlist(output, invlist); 1348 1349 /* Traditionally, these have been placed after the main code points */ 1350 if (posixes) { 1351 sv_catsv(output, posixes); 1352 } 1353 1354 if (only_utf8 && _invlist_len(only_utf8)) { 1355 Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]); 1356 put_charclass_bitmap_innards_invlist(output, only_utf8); 1357 } 1358 1359 if (not_utf8 && _invlist_len(not_utf8)) { 1360 Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]); 1361 put_charclass_bitmap_innards_invlist(output, not_utf8); 1362 } 1363 1364 if (only_utf8_locale && _invlist_len(only_utf8_locale)) { 1365 Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]); 1366 put_charclass_bitmap_innards_invlist(output, only_utf8_locale); 1367 1368 /* This is the only list in this routine that can legally contain code 1369 * points outside the bitmap range. The call just above to 1370 * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so 1371 * output them here. There's about a half-dozen possible, and none in 1372 * contiguous ranges longer than 2 */ 1373 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) { 1374 UV start, end; 1375 SV* above_bitmap = NULL; 1376 1377 _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap); 1378 1379 invlist_iterinit(above_bitmap); 1380 while (invlist_iternext(above_bitmap, &start, &end)) { 1381 UV i; 1382 1383 for (i = start; i <= end; i++) { 1384 put_code_point(output, i); 1385 } 1386 } 1387 invlist_iterfinish(above_bitmap); 1388 SvREFCNT_dec_NN(above_bitmap); 1389 } 1390 } 1391 1392 if (invert && SvCUR(output) == 1) { 1393 return NULL; 1394 } 1395 1396 return output; 1397 } 1398 1399 STATIC U8 1400 S_put_charclass_bitmap_innards(pTHX_ SV *sv, 1401 char *bitmap, 1402 SV *nonbitmap_invlist, 1403 SV *only_utf8_locale_invlist, 1404 const regnode * const node, 1405 const U8 flags, 1406 const bool force_as_is_display) 1407 { 1408 /* Appends to 'sv' a displayable version of the innards of the bracketed 1409 * character class defined by the other arguments: 1410 * 'bitmap' points to the bitmap, or NULL if to ignore that. 1411 * 'nonbitmap_invlist' is an inversion list of the code points that are in 1412 * the bitmap range, but for some reason aren't in the bitmap; NULL if 1413 * none. The reasons for this could be that they require some 1414 * condition such as the target string being or not being in UTF-8 1415 * (under /d), or because they came from a user-defined property that 1416 * was not resolved at the time of the regex compilation (under /u) 1417 * 'only_utf8_locale_invlist' is an inversion list of the code points that 1418 * are valid only if the runtime locale is a UTF-8 one; NULL if none 1419 * 'node' is the regex pattern ANYOF node. It is needed only when the 1420 * above two parameters are not null, and is passed so that this 1421 * routine can tease apart the various reasons for them. 1422 * 'flags' is the flags field of 'node' 1423 * 'force_as_is_display' is TRUE if this routine should definitely NOT try 1424 * to invert things to see if that leads to a cleaner display. If 1425 * FALSE, this routine is free to use its judgment about doing this. 1426 * 1427 * It returns 0 if nothing was actually output. (It may be that 1428 * the bitmap, etc is empty.) 1429 * 1 if the output wasn't inverted (didn't begin with a '^') 1430 * 2 if the output was inverted (did begin with a '^') 1431 * 1432 * When called for outputting the bitmap of a non-ANYOF node, just pass the 1433 * bitmap, with the succeeding parameters set to NULL, and the final one to 1434 * FALSE. 1435 */ 1436 1437 /* In general, it tries to display the 'cleanest' representation of the 1438 * innards, choosing whether to display them inverted or not, regardless of 1439 * whether the class itself is to be inverted. However, there are some 1440 * cases where it can't try inverting, as what actually matches isn't known 1441 * until runtime, and hence the inversion isn't either. */ 1442 1443 bool inverting_allowed = ! force_as_is_display; 1444 1445 int i; 1446 STRLEN orig_sv_cur = SvCUR(sv); 1447 1448 SV* invlist; /* Inversion list we accumulate of code points that 1449 are unconditionally matched */ 1450 SV* only_utf8 = NULL; /* Under /d, list of matches iff the target is 1451 UTF-8 */ 1452 SV* not_utf8 = NULL; /* /d, list of matches iff the target isn't UTF-8 1453 */ 1454 SV* posixes = NULL; /* Under /l, string of things like [:word:], \D */ 1455 SV* only_utf8_locale = NULL; /* Under /l, list of matches if the locale 1456 is UTF-8 */ 1457 1458 SV* as_is_display; /* The output string when we take the inputs 1459 literally */ 1460 SV* inverted_display; /* The output string when we invert the inputs */ 1461 1462 bool invert = cBOOL(flags & ANYOF_INVERT); /* Is the input to be inverted 1463 to match? */ 1464 /* We are biased in favor of displaying things without them being inverted, 1465 * as that is generally easier to understand */ 1466 const int bias = 5; 1467 1468 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS; 1469 1470 /* Start off with whatever code points are passed in. (We clone, so we 1471 * don't change the caller's list) */ 1472 if (nonbitmap_invlist) { 1473 assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS); 1474 invlist = invlist_clone(nonbitmap_invlist, NULL); 1475 } 1476 else { /* Worst case size is every other code point is matched */ 1477 invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2); 1478 } 1479 1480 if (flags) { 1481 if (OP(node) == ANYOFD) { 1482 1483 /* This flag indicates that the code points below 0x100 in the 1484 * nonbitmap list are precisely the ones that match only when the 1485 * target is UTF-8 (they should all be non-ASCII). */ 1486 if (flags & ANYOF_HAS_EXTRA_RUNTIME_MATCHES) { 1487 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8); 1488 _invlist_subtract(invlist, only_utf8, &invlist); 1489 } 1490 1491 /* And this flag for matching all non-ASCII 0xFF and below */ 1492 if (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared) { 1493 not_utf8 = invlist_clone(PL_UpperLatin1, NULL); 1494 } 1495 } 1496 else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) { 1497 1498 /* If either of these flags are set, what matches isn't 1499 * determinable except during execution, so don't know enough here 1500 * to invert */ 1501 if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) { 1502 inverting_allowed = FALSE; 1503 } 1504 1505 /* What the posix classes match also varies at runtime, so these 1506 * will be output symbolically. */ 1507 if (ANYOF_POSIXL_TEST_ANY_SET(node)) { 1508 int i; 1509 1510 posixes = newSVpvs(""); 1511 for (i = 0; i < ANYOF_POSIXL_MAX; i++) { 1512 if (ANYOF_POSIXL_TEST(node, i)) { 1513 sv_catpv(posixes, anyofs[i]); 1514 } 1515 } 1516 } 1517 } 1518 } 1519 1520 /* Accumulate the bit map into the unconditional match list */ 1521 if (bitmap) { 1522 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) { 1523 if (BITMAP_TEST(bitmap, i)) { 1524 int start = i++; 1525 for (; 1526 i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i); 1527 i++) 1528 { /* empty */ } 1529 invlist = _add_range_to_invlist(invlist, start, i-1); 1530 } 1531 } 1532 } 1533 1534 /* Make sure that the conditional match lists don't have anything in them 1535 * that match unconditionally; otherwise the output is quite confusing. 1536 * This could happen if the code that populates these misses some 1537 * duplication. */ 1538 if (only_utf8) { 1539 _invlist_subtract(only_utf8, invlist, &only_utf8); 1540 } 1541 if (not_utf8) { 1542 _invlist_subtract(not_utf8, invlist, ¬_utf8); 1543 } 1544 1545 if (only_utf8_locale_invlist) { 1546 1547 /* Since this list is passed in, we have to make a copy before 1548 * modifying it */ 1549 only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL); 1550 1551 _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale); 1552 1553 /* And, it can get really weird for us to try outputting an inverted 1554 * form of this list when it has things above the bitmap, so don't even 1555 * try */ 1556 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) { 1557 inverting_allowed = FALSE; 1558 } 1559 } 1560 1561 /* Calculate what the output would be if we take the input as-is */ 1562 as_is_display = put_charclass_bitmap_innards_common(invlist, 1563 posixes, 1564 only_utf8, 1565 not_utf8, 1566 only_utf8_locale, 1567 invert); 1568 1569 /* If have to take the output as-is, just do that */ 1570 if (! inverting_allowed) { 1571 if (as_is_display) { 1572 sv_catsv(sv, as_is_display); 1573 SvREFCNT_dec_NN(as_is_display); 1574 } 1575 } 1576 else { /* But otherwise, create the output again on the inverted input, and 1577 use whichever version is shorter */ 1578 1579 int inverted_bias, as_is_bias; 1580 1581 /* We will apply our bias to whichever of the results doesn't have 1582 * the '^' */ 1583 bool trial_invert; 1584 if (invert) { 1585 trial_invert = FALSE; 1586 as_is_bias = bias; 1587 inverted_bias = 0; 1588 } 1589 else { 1590 trial_invert = TRUE; 1591 as_is_bias = 0; 1592 inverted_bias = bias; 1593 } 1594 1595 /* Now invert each of the lists that contribute to the output, 1596 * excluding from the result things outside the possible range */ 1597 1598 /* For the unconditional inversion list, we have to add in all the 1599 * conditional code points, so that when inverted, they will be gone 1600 * from it */ 1601 _invlist_union(only_utf8, invlist, &invlist); 1602 _invlist_union(not_utf8, invlist, &invlist); 1603 _invlist_union(only_utf8_locale, invlist, &invlist); 1604 _invlist_invert(invlist); 1605 _invlist_intersection(invlist, PL_InBitmap, &invlist); 1606 1607 if (only_utf8) { 1608 _invlist_invert(only_utf8); 1609 _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8); 1610 } 1611 else if (not_utf8) { 1612 1613 /* If a code point matches iff the target string is not in UTF-8, 1614 * then complementing the result has it not match iff not in UTF-8, 1615 * which is the same thing as matching iff it is UTF-8. */ 1616 only_utf8 = not_utf8; 1617 not_utf8 = NULL; 1618 } 1619 1620 if (only_utf8_locale) { 1621 _invlist_invert(only_utf8_locale); 1622 _invlist_intersection(only_utf8_locale, 1623 PL_InBitmap, 1624 &only_utf8_locale); 1625 } 1626 1627 inverted_display = put_charclass_bitmap_innards_common( 1628 invlist, 1629 posixes, 1630 only_utf8, 1631 not_utf8, 1632 only_utf8_locale, trial_invert); 1633 1634 /* Use the shortest representation, taking into account our bias 1635 * against showing it inverted */ 1636 if ( inverted_display 1637 && ( ! as_is_display 1638 || ( SvCUR(inverted_display) + inverted_bias 1639 < SvCUR(as_is_display) + as_is_bias))) 1640 { 1641 sv_catsv(sv, inverted_display); 1642 invert = ! invert; 1643 } 1644 else if (as_is_display) { 1645 sv_catsv(sv, as_is_display); 1646 } 1647 1648 SvREFCNT_dec(as_is_display); 1649 SvREFCNT_dec(inverted_display); 1650 } 1651 1652 SvREFCNT_dec_NN(invlist); 1653 SvREFCNT_dec(only_utf8); 1654 SvREFCNT_dec(not_utf8); 1655 SvREFCNT_dec(posixes); 1656 SvREFCNT_dec(only_utf8_locale); 1657 1658 U8 did_output_something = (bool) (SvCUR(sv) > orig_sv_cur); 1659 if (did_output_something) { 1660 /* Distinguish between non and inverted cases */ 1661 did_output_something += invert; 1662 } 1663 1664 return did_output_something; 1665 } 1666 #endif /* DEBUGGING */ 1667