1f2a19305Safresh1 #ifdef PERL_EXT_RE_BUILD 2f2a19305Safresh1 #include "re_top.h" 3f2a19305Safresh1 #endif 4f2a19305Safresh1 5f2a19305Safresh1 #include "EXTERN.h" 6f2a19305Safresh1 #define PERL_IN_REGEX_ENGINE 7f2a19305Safresh1 #define PERL_IN_REGCOMP_ANY 8f2a19305Safresh1 #define PERL_IN_REGCOMP_DEBUG_C 9f2a19305Safresh1 #include "perl.h" 10f2a19305Safresh1 11f2a19305Safresh1 #ifdef PERL_IN_XSUB_RE 12f2a19305Safresh1 # include "re_comp.h" 13f2a19305Safresh1 #else 14f2a19305Safresh1 # include "regcomp.h" 15f2a19305Safresh1 #endif 16f2a19305Safresh1 17f2a19305Safresh1 #include "invlist_inline.h" 18f2a19305Safresh1 #include "unicode_constants.h" 19f2a19305Safresh1 #include "regcomp_internal.h" 20f2a19305Safresh1 21*5486feefSafresh1 #ifdef PERL_RE_BUILD_DEBUG 22f2a19305Safresh1 int 23f2a19305Safresh1 Perl_re_printf(pTHX_ const char *fmt, ...) 24f2a19305Safresh1 { 25f2a19305Safresh1 va_list ap; 26f2a19305Safresh1 int result; 27f2a19305Safresh1 PerlIO *f= Perl_debug_log; 28f2a19305Safresh1 PERL_ARGS_ASSERT_RE_PRINTF; 29f2a19305Safresh1 va_start(ap, fmt); 30f2a19305Safresh1 result = PerlIO_vprintf(f, fmt, ap); 31f2a19305Safresh1 va_end(ap); 32f2a19305Safresh1 return result; 33f2a19305Safresh1 } 34f2a19305Safresh1 35f2a19305Safresh1 int 36f2a19305Safresh1 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...) 37f2a19305Safresh1 { 38f2a19305Safresh1 va_list ap; 39f2a19305Safresh1 int result; 40f2a19305Safresh1 PerlIO *f= Perl_debug_log; 41f2a19305Safresh1 PERL_ARGS_ASSERT_RE_INDENTF; 42f2a19305Safresh1 va_start(ap, depth); 43f2a19305Safresh1 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, ""); 44f2a19305Safresh1 result = PerlIO_vprintf(f, fmt, ap); 45f2a19305Safresh1 va_end(ap); 46f2a19305Safresh1 return result; 47f2a19305Safresh1 } 48f2a19305Safresh1 49f2a19305Safresh1 void 50f2a19305Safresh1 Perl_debug_show_study_flags(pTHX_ U32 flags, const char *open_str, 51f2a19305Safresh1 const char *close_str) 52f2a19305Safresh1 { 53f2a19305Safresh1 PERL_ARGS_ASSERT_DEBUG_SHOW_STUDY_FLAGS; 54f2a19305Safresh1 if (!flags) 55f2a19305Safresh1 return; 56f2a19305Safresh1 57f2a19305Safresh1 Perl_re_printf( aTHX_ "%s", open_str); 58f2a19305Safresh1 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL); 59f2a19305Safresh1 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL); 60f2a19305Safresh1 DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF); 61f2a19305Safresh1 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR); 62f2a19305Safresh1 DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR); 63f2a19305Safresh1 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL); 64f2a19305Safresh1 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR); 65f2a19305Safresh1 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND); 66f2a19305Safresh1 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR); 67f2a19305Safresh1 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS); 68f2a19305Safresh1 DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS); 69f2a19305Safresh1 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY); 70f2a19305Safresh1 DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT); 71f2a19305Safresh1 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY); 72f2a19305Safresh1 DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE); 73f2a19305Safresh1 Perl_re_printf( aTHX_ "%s", close_str); 74f2a19305Safresh1 } 75f2a19305Safresh1 76f2a19305Safresh1 void 77f2a19305Safresh1 Perl_debug_studydata(pTHX_ const char *where, scan_data_t *data, 78f2a19305Safresh1 U32 depth, int is_inf, 79f2a19305Safresh1 SSize_t min, SSize_t stopmin, SSize_t delta) 80f2a19305Safresh1 { 81f2a19305Safresh1 PERL_ARGS_ASSERT_DEBUG_STUDYDATA; 82f2a19305Safresh1 DECLARE_AND_GET_RE_DEBUG_FLAGS; 83f2a19305Safresh1 84f2a19305Safresh1 DEBUG_OPTIMISE_MORE_r({ 85f2a19305Safresh1 if (!data) { 86f2a19305Safresh1 Perl_re_indentf(aTHX_ "%s: NO DATA", 87f2a19305Safresh1 depth, 88f2a19305Safresh1 where); 89f2a19305Safresh1 return; 90f2a19305Safresh1 } 91f2a19305Safresh1 Perl_re_indentf(aTHX_ "%s: M/S/D: %" IVdf "/%" IVdf "/%" IVdf " Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf, 92f2a19305Safresh1 depth, 93f2a19305Safresh1 where, 94f2a19305Safresh1 min, stopmin, delta, 95f2a19305Safresh1 (IV)data->pos_min, 96f2a19305Safresh1 (IV)data->pos_delta, 97f2a19305Safresh1 (UV)data->flags 98f2a19305Safresh1 ); 99f2a19305Safresh1 100f2a19305Safresh1 Perl_debug_show_study_flags(aTHX_ data->flags," [","]"); 101f2a19305Safresh1 102f2a19305Safresh1 Perl_re_printf( aTHX_ 103f2a19305Safresh1 " Whilem_c: %" IVdf " Lcp: %" IVdf " %s", 104f2a19305Safresh1 (IV)data->whilem_c, 105f2a19305Safresh1 (IV)(data->last_closep ? *((data)->last_closep) : -1), 106f2a19305Safresh1 is_inf ? "INF " : "" 107f2a19305Safresh1 ); 108f2a19305Safresh1 109f2a19305Safresh1 if (data->last_found) { 110f2a19305Safresh1 int i; 111f2a19305Safresh1 Perl_re_printf(aTHX_ 112f2a19305Safresh1 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf, 113f2a19305Safresh1 SvPVX_const(data->last_found), 114f2a19305Safresh1 (IV)data->last_end, 115f2a19305Safresh1 (IV)data->last_start_min, 116f2a19305Safresh1 (IV)data->last_start_max 117f2a19305Safresh1 ); 118f2a19305Safresh1 119f2a19305Safresh1 for (i = 0; i < 2; i++) { 120f2a19305Safresh1 Perl_re_printf(aTHX_ 121f2a19305Safresh1 " %s%s: '%s' @ %" IVdf "/%" IVdf, 122f2a19305Safresh1 data->cur_is_floating == i ? "*" : "", 123f2a19305Safresh1 i ? "Float" : "Fixed", 124f2a19305Safresh1 SvPVX_const(data->substrs[i].str), 125f2a19305Safresh1 (IV)data->substrs[i].min_offset, 126f2a19305Safresh1 (IV)data->substrs[i].max_offset 127f2a19305Safresh1 ); 128f2a19305Safresh1 Perl_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]"); 129f2a19305Safresh1 } 130f2a19305Safresh1 } 131f2a19305Safresh1 132f2a19305Safresh1 Perl_re_printf( aTHX_ "\n"); 133f2a19305Safresh1 }); 134f2a19305Safresh1 } 135f2a19305Safresh1 136f2a19305Safresh1 137f2a19305Safresh1 void 138f2a19305Safresh1 Perl_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state, 139f2a19305Safresh1 regnode *scan, U32 depth, U32 flags) 140f2a19305Safresh1 { 141f2a19305Safresh1 PERL_ARGS_ASSERT_DEBUG_PEEP; 142f2a19305Safresh1 DECLARE_AND_GET_RE_DEBUG_FLAGS; 143f2a19305Safresh1 144f2a19305Safresh1 DEBUG_OPTIMISE_r({ 145f2a19305Safresh1 regnode *Next; 146f2a19305Safresh1 147f2a19305Safresh1 if (!scan) 148f2a19305Safresh1 return; 149f2a19305Safresh1 Next = regnext(scan); 150f2a19305Safresh1 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state); 151f2a19305Safresh1 Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)", 152f2a19305Safresh1 depth, 153f2a19305Safresh1 str, 154f2a19305Safresh1 REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv), 155f2a19305Safresh1 Next ? (REG_NODE_NUM(Next)) : 0 ); 156f2a19305Safresh1 Perl_debug_show_study_flags(aTHX_ flags," [ ","]"); 157f2a19305Safresh1 Perl_re_printf( aTHX_ "\n"); 158f2a19305Safresh1 }); 159f2a19305Safresh1 } 160f2a19305Safresh1 161*5486feefSafresh1 const regnode * 162*5486feefSafresh1 Perl_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, 163*5486feefSafresh1 const regnode *last, const regnode *plast, 164*5486feefSafresh1 SV* sv, I32 indent, U32 depth) 165*5486feefSafresh1 { 166*5486feefSafresh1 const regnode *next; 167*5486feefSafresh1 const regnode *optstart= NULL; 168*5486feefSafresh1 169*5486feefSafresh1 RXi_GET_DECL(r, ri); 170*5486feefSafresh1 DECLARE_AND_GET_RE_DEBUG_FLAGS; 171*5486feefSafresh1 172*5486feefSafresh1 PERL_ARGS_ASSERT_DUMPUNTIL; 173*5486feefSafresh1 174*5486feefSafresh1 #ifdef DEBUG_DUMPUNTIL 175*5486feefSafresh1 Perl_re_printf( aTHX_ "--- %d : %d - %d - %d\n", indent, node-start, 176*5486feefSafresh1 last ? last-start : 0, plast ? plast-start : 0); 177*5486feefSafresh1 #endif 178*5486feefSafresh1 179*5486feefSafresh1 if (plast && plast < last) 180*5486feefSafresh1 last= plast; 181*5486feefSafresh1 182*5486feefSafresh1 while (node && (!last || node < last)) { 183*5486feefSafresh1 const U8 op = OP(node); 184*5486feefSafresh1 185*5486feefSafresh1 if (op == CLOSE || op == SRCLOSE || op == WHILEM) 186*5486feefSafresh1 indent--; 187*5486feefSafresh1 next = regnext((regnode *)node); 188*5486feefSafresh1 const regnode *after = regnode_after((regnode *)node,0); 189*5486feefSafresh1 190*5486feefSafresh1 /* Where, what. */ 191*5486feefSafresh1 if (op == OPTIMIZED) { 192*5486feefSafresh1 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE)) 193*5486feefSafresh1 optstart = node; 194*5486feefSafresh1 else 195*5486feefSafresh1 goto after_print; 196*5486feefSafresh1 } else 197*5486feefSafresh1 CLEAR_OPTSTART; 198*5486feefSafresh1 199*5486feefSafresh1 regprop(r, sv, node, NULL, NULL); 200*5486feefSafresh1 Perl_re_printf( aTHX_ "%4" IVdf ":%*s%s", (IV)(node - start), 201*5486feefSafresh1 (int)(2*indent + 1), "", SvPVX_const(sv)); 202*5486feefSafresh1 203*5486feefSafresh1 if (op != OPTIMIZED) { 204*5486feefSafresh1 if (next == NULL) /* Next ptr. */ 205*5486feefSafresh1 Perl_re_printf( aTHX_ " (0)"); 206*5486feefSafresh1 else if (REGNODE_TYPE(op) == BRANCH 207*5486feefSafresh1 && REGNODE_TYPE(OP(next)) != BRANCH ) 208*5486feefSafresh1 Perl_re_printf( aTHX_ " (FAIL)"); 209*5486feefSafresh1 else 210*5486feefSafresh1 Perl_re_printf( aTHX_ " (%" IVdf ")", (IV)(next - start)); 211*5486feefSafresh1 Perl_re_printf( aTHX_ "\n"); 212*5486feefSafresh1 } 213*5486feefSafresh1 214*5486feefSafresh1 after_print: 215*5486feefSafresh1 if (REGNODE_TYPE(op) == BRANCHJ) { 216*5486feefSafresh1 assert(next); 217*5486feefSafresh1 const regnode *nnode = (OP(next) == LONGJMP 218*5486feefSafresh1 ? regnext((regnode *)next) 219*5486feefSafresh1 : next); 220*5486feefSafresh1 if (last && nnode > last) 221*5486feefSafresh1 nnode = last; 222*5486feefSafresh1 DUMPUNTIL(after, nnode); 223*5486feefSafresh1 } 224*5486feefSafresh1 else if (REGNODE_TYPE(op) == BRANCH) { 225*5486feefSafresh1 assert(next); 226*5486feefSafresh1 DUMPUNTIL(after, next); 227*5486feefSafresh1 } 228*5486feefSafresh1 else if ( REGNODE_TYPE(op) == TRIE ) { 229*5486feefSafresh1 const regnode *this_trie = node; 230*5486feefSafresh1 const U32 n = ARG1u(node); 231*5486feefSafresh1 const reg_ac_data * const ac = op>=AHOCORASICK ? 232*5486feefSafresh1 (reg_ac_data *)ri->data->data[n] : 233*5486feefSafresh1 NULL; 234*5486feefSafresh1 const reg_trie_data * const trie = 235*5486feefSafresh1 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie]; 236*5486feefSafresh1 #ifdef DEBUGGING 237*5486feefSafresh1 AV *const trie_words 238*5486feefSafresh1 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]); 239*5486feefSafresh1 #endif 240*5486feefSafresh1 const regnode *nextbranch= NULL; 241*5486feefSafresh1 I32 word_idx; 242*5486feefSafresh1 SvPVCLEAR(sv); 243*5486feefSafresh1 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) { 244*5486feefSafresh1 SV ** const elem_ptr = av_fetch_simple(trie_words, word_idx, 0); 245*5486feefSafresh1 246*5486feefSafresh1 Perl_re_indentf( aTHX_ "%s ", 247*5486feefSafresh1 indent+3, 248*5486feefSafresh1 elem_ptr 249*5486feefSafresh1 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), 250*5486feefSafresh1 SvCUR(*elem_ptr), PL_dump_re_max_len, 251*5486feefSafresh1 PL_colors[0], PL_colors[1], 252*5486feefSafresh1 (SvUTF8(*elem_ptr) 253*5486feefSafresh1 ? PERL_PV_ESCAPE_UNI 254*5486feefSafresh1 : 0) 255*5486feefSafresh1 | PERL_PV_PRETTY_ELLIPSES 256*5486feefSafresh1 | PERL_PV_PRETTY_LTGT 257*5486feefSafresh1 ) 258*5486feefSafresh1 : "???" 259*5486feefSafresh1 ); 260*5486feefSafresh1 if (trie->jump) { 261*5486feefSafresh1 U16 dist= trie->jump[word_idx+1]; 262*5486feefSafresh1 Perl_re_printf( aTHX_ "(%" UVuf ")\n", 263*5486feefSafresh1 (UV)((dist ? this_trie + dist : next) - start)); 264*5486feefSafresh1 if (dist) { 265*5486feefSafresh1 if (!nextbranch) 266*5486feefSafresh1 nextbranch= this_trie + trie->jump[0]; 267*5486feefSafresh1 DUMPUNTIL(this_trie + dist, nextbranch); 268*5486feefSafresh1 } 269*5486feefSafresh1 if (nextbranch && REGNODE_TYPE(OP(nextbranch))==BRANCH) 270*5486feefSafresh1 nextbranch= regnext((regnode *)nextbranch); 271*5486feefSafresh1 } else { 272*5486feefSafresh1 Perl_re_printf( aTHX_ "\n"); 273*5486feefSafresh1 } 274*5486feefSafresh1 } 275*5486feefSafresh1 if (last && next > last) 276*5486feefSafresh1 node= last; 277*5486feefSafresh1 else 278*5486feefSafresh1 node= next; 279*5486feefSafresh1 } 280*5486feefSafresh1 else if ( op == CURLY ) { /* "next" might be very big: optimizer */ 281*5486feefSafresh1 DUMPUNTIL(after, after + 1); /* +1 is NOT a REGNODE_AFTER */ 282*5486feefSafresh1 } 283*5486feefSafresh1 else if (REGNODE_TYPE(op) == CURLY && op != CURLYX) { 284*5486feefSafresh1 assert(next); 285*5486feefSafresh1 DUMPUNTIL(after, next); 286*5486feefSafresh1 } 287*5486feefSafresh1 else if ( op == PLUS || op == STAR) { 288*5486feefSafresh1 DUMPUNTIL(after, after + 1); /* +1 NOT a REGNODE_AFTER */ 289*5486feefSafresh1 } 290*5486feefSafresh1 else if (REGNODE_TYPE(op) == EXACT || op == ANYOFHs) { 291*5486feefSafresh1 /* Literal string, where present. */ 292*5486feefSafresh1 node = (const regnode *)REGNODE_AFTER_varies(node); 293*5486feefSafresh1 } 294*5486feefSafresh1 else { 295*5486feefSafresh1 node = REGNODE_AFTER_opcode(node,op); 296*5486feefSafresh1 } 297*5486feefSafresh1 if (op == CURLYX || op == OPEN || op == SROPEN) 298*5486feefSafresh1 indent++; 299*5486feefSafresh1 if (REGNODE_TYPE(op) == END) 300*5486feefSafresh1 break; 301*5486feefSafresh1 } 302*5486feefSafresh1 CLEAR_OPTSTART; 303*5486feefSafresh1 #ifdef DEBUG_DUMPUNTIL 304*5486feefSafresh1 Perl_re_printf( aTHX_ "--- %d\n", (int)indent); 305*5486feefSafresh1 #endif 306*5486feefSafresh1 return node; 307*5486feefSafresh1 } 308*5486feefSafresh1 309*5486feefSafresh1 #endif /* PERL_RE_BUILD_DEBUG */ 310f2a19305Safresh1 311f2a19305Safresh1 /* 312f2a19305Safresh1 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form 313f2a19305Safresh1 */ 314f2a19305Safresh1 #ifdef DEBUGGING 315f2a19305Safresh1 static void 316f2a19305Safresh1 S_regdump_intflags(pTHX_ const char *lead, const U32 flags) 317f2a19305Safresh1 { 318f2a19305Safresh1 int bit; 319f2a19305Safresh1 int set=0; 320f2a19305Safresh1 321*5486feefSafresh1 STATIC_ASSERT_STMT(REG_INTFLAGS_NAME_SIZE <= sizeof(flags) * CHARBITS); 322f2a19305Safresh1 323*5486feefSafresh1 for (bit=0; bit < REG_INTFLAGS_NAME_SIZE; bit++) { 324f2a19305Safresh1 if (flags & (1<<bit)) { 325f2a19305Safresh1 if (!set++ && lead) 326f2a19305Safresh1 Perl_re_printf( aTHX_ "%s", lead); 327f2a19305Safresh1 Perl_re_printf( aTHX_ "%s ", PL_reg_intflags_name[bit]); 328f2a19305Safresh1 } 329f2a19305Safresh1 } 330f2a19305Safresh1 if (lead) { 331f2a19305Safresh1 if (set) 332f2a19305Safresh1 Perl_re_printf( aTHX_ "\n"); 333f2a19305Safresh1 else 334f2a19305Safresh1 Perl_re_printf( aTHX_ "%s[none-set]\n", lead); 335f2a19305Safresh1 } 336f2a19305Safresh1 } 337f2a19305Safresh1 338f2a19305Safresh1 static void 339f2a19305Safresh1 S_regdump_extflags(pTHX_ const char *lead, const U32 flags) 340f2a19305Safresh1 { 341f2a19305Safresh1 int bit; 342f2a19305Safresh1 int set=0; 343f2a19305Safresh1 regex_charset cs; 344f2a19305Safresh1 345*5486feefSafresh1 STATIC_ASSERT_STMT(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags) * CHARBITS); 346f2a19305Safresh1 347f2a19305Safresh1 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) { 348f2a19305Safresh1 if (flags & (1U<<bit)) { 349f2a19305Safresh1 if ((1U<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */ 350f2a19305Safresh1 continue; 351f2a19305Safresh1 } 352f2a19305Safresh1 if (!set++ && lead) 353f2a19305Safresh1 Perl_re_printf( aTHX_ "%s", lead); 354f2a19305Safresh1 Perl_re_printf( aTHX_ "%s ", PL_reg_extflags_name[bit]); 355f2a19305Safresh1 } 356f2a19305Safresh1 } 357f2a19305Safresh1 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) { 358f2a19305Safresh1 if (!set++ && lead) { 359f2a19305Safresh1 Perl_re_printf( aTHX_ "%s", lead); 360f2a19305Safresh1 } 361f2a19305Safresh1 switch (cs) { 362f2a19305Safresh1 case REGEX_UNICODE_CHARSET: 363f2a19305Safresh1 Perl_re_printf( aTHX_ "UNICODE"); 364f2a19305Safresh1 break; 365f2a19305Safresh1 case REGEX_LOCALE_CHARSET: 366f2a19305Safresh1 Perl_re_printf( aTHX_ "LOCALE"); 367f2a19305Safresh1 break; 368f2a19305Safresh1 case REGEX_ASCII_RESTRICTED_CHARSET: 369f2a19305Safresh1 Perl_re_printf( aTHX_ "ASCII-RESTRICTED"); 370f2a19305Safresh1 break; 371f2a19305Safresh1 case REGEX_ASCII_MORE_RESTRICTED_CHARSET: 372f2a19305Safresh1 Perl_re_printf( aTHX_ "ASCII-MORE_RESTRICTED"); 373f2a19305Safresh1 break; 374f2a19305Safresh1 default: 375f2a19305Safresh1 Perl_re_printf( aTHX_ "UNKNOWN CHARACTER SET"); 376f2a19305Safresh1 break; 377f2a19305Safresh1 } 378f2a19305Safresh1 } 379f2a19305Safresh1 if (lead) { 380f2a19305Safresh1 if (set) 381f2a19305Safresh1 Perl_re_printf( aTHX_ "\n"); 382f2a19305Safresh1 else 383f2a19305Safresh1 Perl_re_printf( aTHX_ "%s[none-set]\n", lead); 384f2a19305Safresh1 } 385f2a19305Safresh1 } 386f2a19305Safresh1 #endif 387f2a19305Safresh1 388f2a19305Safresh1 void 389f2a19305Safresh1 Perl_regdump(pTHX_ const regexp *r) 390f2a19305Safresh1 { 391f2a19305Safresh1 #ifdef DEBUGGING 392f2a19305Safresh1 int i; 393f2a19305Safresh1 SV * const sv = sv_newmortal(); 394f2a19305Safresh1 SV *dsv= sv_newmortal(); 395f2a19305Safresh1 RXi_GET_DECL(r, ri); 396f2a19305Safresh1 DECLARE_AND_GET_RE_DEBUG_FLAGS; 397f2a19305Safresh1 398f2a19305Safresh1 PERL_ARGS_ASSERT_REGDUMP; 399f2a19305Safresh1 400f2a19305Safresh1 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0); 401f2a19305Safresh1 402f2a19305Safresh1 /* Header fields of interest. */ 403f2a19305Safresh1 for (i = 0; i < 2; i++) { 404f2a19305Safresh1 if (r->substrs->data[i].substr) { 405f2a19305Safresh1 RE_PV_QUOTED_DECL(s, 0, dsv, 406f2a19305Safresh1 SvPVX_const(r->substrs->data[i].substr), 407f2a19305Safresh1 RE_SV_DUMPLEN(r->substrs->data[i].substr), 408f2a19305Safresh1 PL_dump_re_max_len); 409f2a19305Safresh1 Perl_re_printf( aTHX_ 410f2a19305Safresh1 "%s %s%s at %" IVdf "..%" UVuf " ", 411f2a19305Safresh1 i ? "floating" : "anchored", 412f2a19305Safresh1 s, 413f2a19305Safresh1 RE_SV_TAIL(r->substrs->data[i].substr), 414f2a19305Safresh1 (IV)r->substrs->data[i].min_offset, 415f2a19305Safresh1 (UV)r->substrs->data[i].max_offset); 416f2a19305Safresh1 } 417f2a19305Safresh1 else if (r->substrs->data[i].utf8_substr) { 418f2a19305Safresh1 RE_PV_QUOTED_DECL(s, 1, dsv, 419f2a19305Safresh1 SvPVX_const(r->substrs->data[i].utf8_substr), 420f2a19305Safresh1 RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr), 421f2a19305Safresh1 30); 422f2a19305Safresh1 Perl_re_printf( aTHX_ 423f2a19305Safresh1 "%s utf8 %s%s at %" IVdf "..%" UVuf " ", 424f2a19305Safresh1 i ? "floating" : "anchored", 425f2a19305Safresh1 s, 426f2a19305Safresh1 RE_SV_TAIL(r->substrs->data[i].utf8_substr), 427f2a19305Safresh1 (IV)r->substrs->data[i].min_offset, 428f2a19305Safresh1 (UV)r->substrs->data[i].max_offset); 429f2a19305Safresh1 } 430f2a19305Safresh1 } 431f2a19305Safresh1 432f2a19305Safresh1 if (r->check_substr || r->check_utf8) 433f2a19305Safresh1 Perl_re_printf( aTHX_ 434f2a19305Safresh1 (const char *) 435f2a19305Safresh1 ( r->check_substr == r->substrs->data[1].substr 436f2a19305Safresh1 && r->check_utf8 == r->substrs->data[1].utf8_substr 437f2a19305Safresh1 ? "(checking floating" : "(checking anchored")); 438f2a19305Safresh1 if (r->intflags & PREGf_NOSCAN) 439f2a19305Safresh1 Perl_re_printf( aTHX_ " noscan"); 440f2a19305Safresh1 if (r->extflags & RXf_CHECK_ALL) 441f2a19305Safresh1 Perl_re_printf( aTHX_ " isall"); 442f2a19305Safresh1 if (r->check_substr || r->check_utf8) 443f2a19305Safresh1 Perl_re_printf( aTHX_ ") "); 444f2a19305Safresh1 445f2a19305Safresh1 if (ri->regstclass) { 446f2a19305Safresh1 regprop(r, sv, ri->regstclass, NULL, NULL); 447f2a19305Safresh1 Perl_re_printf( aTHX_ "stclass %s ", SvPVX_const(sv)); 448f2a19305Safresh1 } 449f2a19305Safresh1 if (r->intflags & PREGf_ANCH) { 450f2a19305Safresh1 Perl_re_printf( aTHX_ "anchored"); 451f2a19305Safresh1 if (r->intflags & PREGf_ANCH_MBOL) 452f2a19305Safresh1 Perl_re_printf( aTHX_ "(MBOL)"); 453f2a19305Safresh1 if (r->intflags & PREGf_ANCH_SBOL) 454f2a19305Safresh1 Perl_re_printf( aTHX_ "(SBOL)"); 455f2a19305Safresh1 if (r->intflags & PREGf_ANCH_GPOS) 456f2a19305Safresh1 Perl_re_printf( aTHX_ "(GPOS)"); 457f2a19305Safresh1 Perl_re_printf( aTHX_ " "); 458f2a19305Safresh1 } 459f2a19305Safresh1 if (r->intflags & PREGf_GPOS_SEEN) 460f2a19305Safresh1 Perl_re_printf( aTHX_ "GPOS:%" UVuf " ", (UV)r->gofs); 461f2a19305Safresh1 if (r->intflags & PREGf_SKIP) 462f2a19305Safresh1 Perl_re_printf( aTHX_ "plus "); 463f2a19305Safresh1 if (r->intflags & PREGf_IMPLICIT) 464f2a19305Safresh1 Perl_re_printf( aTHX_ "implicit "); 465f2a19305Safresh1 Perl_re_printf( aTHX_ "minlen %" IVdf " ", (IV)r->minlen); 466f2a19305Safresh1 if (r->extflags & RXf_EVAL_SEEN) 467f2a19305Safresh1 Perl_re_printf( aTHX_ "with eval "); 468f2a19305Safresh1 Perl_re_printf( aTHX_ "\n"); 469f2a19305Safresh1 DEBUG_FLAGS_r({ 470f2a19305Safresh1 regdump_extflags("r->extflags: ", r->extflags); 471f2a19305Safresh1 regdump_intflags("r->intflags: ", r->intflags); 472f2a19305Safresh1 }); 473f2a19305Safresh1 #else 474f2a19305Safresh1 PERL_ARGS_ASSERT_REGDUMP; 475f2a19305Safresh1 PERL_UNUSED_CONTEXT; 476f2a19305Safresh1 PERL_UNUSED_ARG(r); 477f2a19305Safresh1 #endif /* DEBUGGING */ 478f2a19305Safresh1 } 479f2a19305Safresh1 480f2a19305Safresh1 /* Should be synchronized with ANYOF_ #defines in regcomp.h */ 481f2a19305Safresh1 #ifdef DEBUGGING 482f2a19305Safresh1 483f2a19305Safresh1 # if CC_WORDCHAR_ != 0 || CC_DIGIT_ != 1 || CC_ALPHA_ != 2 \ 484f2a19305Safresh1 || CC_LOWER_ != 3 || CC_UPPER_ != 4 || CC_PUNCT_ != 5 \ 485f2a19305Safresh1 || CC_PRINT_ != 6 || CC_ALPHANUMERIC_ != 7 || CC_GRAPH_ != 8 \ 486f2a19305Safresh1 || CC_CASED_ != 9 || CC_SPACE_ != 10 || CC_BLANK_ != 11 \ 487f2a19305Safresh1 || CC_XDIGIT_ != 12 || CC_CNTRL_ != 13 || CC_ASCII_ != 14 \ 488f2a19305Safresh1 || CC_VERTSPACE_ != 15 489f2a19305Safresh1 # error Need to adjust order of anyofs[] 490f2a19305Safresh1 # endif 491f2a19305Safresh1 static const char * const anyofs[] = { 492f2a19305Safresh1 "\\w", 493f2a19305Safresh1 "\\W", 494f2a19305Safresh1 "\\d", 495f2a19305Safresh1 "\\D", 496f2a19305Safresh1 "[:alpha:]", 497f2a19305Safresh1 "[:^alpha:]", 498f2a19305Safresh1 "[:lower:]", 499f2a19305Safresh1 "[:^lower:]", 500f2a19305Safresh1 "[:upper:]", 501f2a19305Safresh1 "[:^upper:]", 502f2a19305Safresh1 "[:punct:]", 503f2a19305Safresh1 "[:^punct:]", 504f2a19305Safresh1 "[:print:]", 505f2a19305Safresh1 "[:^print:]", 506f2a19305Safresh1 "[:alnum:]", 507f2a19305Safresh1 "[:^alnum:]", 508f2a19305Safresh1 "[:graph:]", 509f2a19305Safresh1 "[:^graph:]", 510f2a19305Safresh1 "[:cased:]", 511f2a19305Safresh1 "[:^cased:]", 512f2a19305Safresh1 "\\s", 513f2a19305Safresh1 "\\S", 514f2a19305Safresh1 "[:blank:]", 515f2a19305Safresh1 "[:^blank:]", 516f2a19305Safresh1 "[:xdigit:]", 517f2a19305Safresh1 "[:^xdigit:]", 518f2a19305Safresh1 "[:cntrl:]", 519f2a19305Safresh1 "[:^cntrl:]", 520f2a19305Safresh1 "[:ascii:]", 521f2a19305Safresh1 "[:^ascii:]", 522f2a19305Safresh1 "\\v", 523f2a19305Safresh1 "\\V" 524f2a19305Safresh1 }; 525f2a19305Safresh1 #endif 526f2a19305Safresh1 527f2a19305Safresh1 /* 528f2a19305Safresh1 - regprop - printable representation of opcode, with run time support 529f2a19305Safresh1 */ 530f2a19305Safresh1 531f2a19305Safresh1 void 532f2a19305Safresh1 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state) 533f2a19305Safresh1 { 534f2a19305Safresh1 #ifdef DEBUGGING 535f2a19305Safresh1 U8 k; 536f2a19305Safresh1 const U8 op = OP(o); 537f2a19305Safresh1 RXi_GET_DECL(prog, progi); 538f2a19305Safresh1 DECLARE_AND_GET_RE_DEBUG_FLAGS; 539f2a19305Safresh1 540f2a19305Safresh1 PERL_ARGS_ASSERT_REGPROP; 541f2a19305Safresh1 542f2a19305Safresh1 SvPVCLEAR(sv); 543f2a19305Safresh1 544f2a19305Safresh1 if (op > REGNODE_MAX) { /* regnode.type is unsigned */ 545f2a19305Safresh1 if (pRExC_state) { /* This gives more info, if we have it */ 546f2a19305Safresh1 FAIL3("panic: corrupted regexp opcode %d > %d", 547f2a19305Safresh1 (int)op, (int)REGNODE_MAX); 548f2a19305Safresh1 } 549f2a19305Safresh1 else { 550f2a19305Safresh1 Perl_croak(aTHX_ "panic: corrupted regexp opcode %d > %d", 551f2a19305Safresh1 (int)op, (int)REGNODE_MAX); 552f2a19305Safresh1 } 553f2a19305Safresh1 } 554f2a19305Safresh1 sv_catpv(sv, REGNODE_NAME(op)); /* Take off const! */ 555f2a19305Safresh1 556f2a19305Safresh1 k = REGNODE_TYPE(op); 557f2a19305Safresh1 if (op == BRANCH) { 558f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, " (buf:%" IVdf "/%" IVdf ")", (IV)ARG1a(o),(IV)ARG1b(o)); 559f2a19305Safresh1 } 560f2a19305Safresh1 else if (op == BRANCHJ) { 561f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, " (buf:%" IVdf "/%" IVdf ")", (IV)ARG2a(o),(IV)ARG2b(o)); 562f2a19305Safresh1 } 563f2a19305Safresh1 else if (k == EXACT) { 564f2a19305Safresh1 sv_catpvs(sv, " "); 565f2a19305Safresh1 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT) 566f2a19305Safresh1 * is a crude hack but it may be the best for now since 567f2a19305Safresh1 * we have no flag "this EXACTish node was UTF-8" 568f2a19305Safresh1 * --jhi */ 569f2a19305Safresh1 pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len, 570f2a19305Safresh1 PL_colors[0], PL_colors[1], 571f2a19305Safresh1 PERL_PV_ESCAPE_UNI_DETECT | 572f2a19305Safresh1 PERL_PV_ESCAPE_NONASCII | 573f2a19305Safresh1 PERL_PV_PRETTY_ELLIPSES | 574f2a19305Safresh1 PERL_PV_PRETTY_LTGT | 575f2a19305Safresh1 PERL_PV_PRETTY_NOCLEAR 576f2a19305Safresh1 ); 577f2a19305Safresh1 } else if (k == TRIE) { 578f2a19305Safresh1 /* print the details of the trie in dumpuntil instead, as 579f2a19305Safresh1 * progi->data isn't available here */ 580f2a19305Safresh1 const U32 n = ARG1u(o); 581f2a19305Safresh1 const reg_ac_data * const ac = IS_TRIE_AC(op) ? 582f2a19305Safresh1 (reg_ac_data *)progi->data->data[n] : 583f2a19305Safresh1 NULL; 584f2a19305Safresh1 const reg_trie_data * const trie 585f2a19305Safresh1 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie]; 586f2a19305Safresh1 587f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, "-%s", REGNODE_NAME(FLAGS(o))); 588f2a19305Safresh1 DEBUG_TRIE_COMPILE_r({ 589f2a19305Safresh1 if (trie->jump) 590f2a19305Safresh1 sv_catpvs(sv, "(JUMP)"); 591f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, 592f2a19305Safresh1 "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">", 593f2a19305Safresh1 (UV)trie->startstate, 594f2a19305Safresh1 (IV)trie->statecount-1, /* -1 because of the unused 0 element */ 595f2a19305Safresh1 (UV)trie->wordcount, 596f2a19305Safresh1 (UV)trie->minlen, 597f2a19305Safresh1 (UV)trie->maxlen, 598f2a19305Safresh1 (UV)TRIE_CHARCOUNT(trie), 599f2a19305Safresh1 (UV)trie->uniquecharcount 600f2a19305Safresh1 ); 601f2a19305Safresh1 }); 602f2a19305Safresh1 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) { 603f2a19305Safresh1 sv_catpvs(sv, "["); 604f2a19305Safresh1 (void) put_charclass_bitmap_innards(sv, 605f2a19305Safresh1 ((IS_ANYOF_TRIE(op)) 606f2a19305Safresh1 ? ANYOF_BITMAP(o) 607f2a19305Safresh1 : TRIE_BITMAP(trie)), 608f2a19305Safresh1 NULL, 609f2a19305Safresh1 NULL, 610f2a19305Safresh1 NULL, 611f2a19305Safresh1 0, 612f2a19305Safresh1 FALSE 613f2a19305Safresh1 ); 614f2a19305Safresh1 sv_catpvs(sv, "]"); 615f2a19305Safresh1 } 616f2a19305Safresh1 if (trie->before_paren || trie->after_paren) 617f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, " (buf:%" IVdf "/%" IVdf ")", 618f2a19305Safresh1 (IV)trie->before_paren,(IV)trie->after_paren); 619f2a19305Safresh1 } else if (k == CURLY) { 620f2a19305Safresh1 U32 lo = ARG1i(o), hi = ARG2i(o); 621f2a19305Safresh1 if (ARG3u(o)) /* check both ARG3a and ARG3b at the same time */ 622f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, "<%d:%d>", ARG3a(o),ARG3b(o)); /* paren before, paren after */ 623f2a19305Safresh1 if (op == CURLYM || op == CURLYN || op == CURLYX) 624f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, "[%d]", FLAGS(o)); /* Parenth number */ 625f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo); 626f2a19305Safresh1 if (hi == REG_INFTY) 627f2a19305Safresh1 sv_catpvs(sv, "INFTY"); 628f2a19305Safresh1 else 629f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi); 630f2a19305Safresh1 sv_catpvs(sv, "}"); 631f2a19305Safresh1 } 632f2a19305Safresh1 else if (k == WHILEM && FLAGS(o)) /* Ordinal/of */ 633f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", FLAGS(o) & 0xf, FLAGS(o)>>4); 634f2a19305Safresh1 else if (k == REF || k == OPEN || k == CLOSE 635f2a19305Safresh1 || k == GROUPP || op == ACCEPT) 636f2a19305Safresh1 { 637f2a19305Safresh1 AV *name_list= NULL; 638f2a19305Safresh1 U32 parno= (op == ACCEPT) ? ARG2u(o) : 639f2a19305Safresh1 (op == OPEN || op == CLOSE) ? PARNO(o) : 640f2a19305Safresh1 ARG1u(o); 641f2a19305Safresh1 if ( RXp_PAREN_NAMES(prog) ) { 642f2a19305Safresh1 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); 643f2a19305Safresh1 } else if ( pRExC_state ) { 644f2a19305Safresh1 name_list= RExC_paren_name_list; 645f2a19305Safresh1 } 646f2a19305Safresh1 if ( name_list ) { 647f2a19305Safresh1 if ( k != REF || (op < REFN)) { 648f2a19305Safresh1 UV logical_parno = parno; 649f2a19305Safresh1 if (prog->parno_to_logical) 650f2a19305Safresh1 logical_parno = prog->parno_to_logical[parno]; 651f2a19305Safresh1 652f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)logical_parno); /* Parenth number */ 653f2a19305Safresh1 if (parno != logical_parno) 654f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, "/%" UVuf, (UV)parno); /* Parenth number */ 655f2a19305Safresh1 656f2a19305Safresh1 SV **name= av_fetch_simple(name_list, parno, 0 ); 657f2a19305Safresh1 if (name) 658f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name)); 659f2a19305Safresh1 } 660f2a19305Safresh1 else 661f2a19305Safresh1 if (parno > 0) { 662f2a19305Safresh1 /* parno must always be larger than 0 for this block 663f2a19305Safresh1 * as it represents a slot into the data array, which 664f2a19305Safresh1 * has the 0 slot reserved for a placeholder so any valid 665f2a19305Safresh1 * index into it is always true, eg non-zero 666f2a19305Safresh1 * see the '%' "what" type and the implementation of 667f2a19305Safresh1 * S_reg_add_data() 668f2a19305Safresh1 */ 669f2a19305Safresh1 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]); 670f2a19305Safresh1 I32 *nums=(I32*)SvPVX(sv_dat); 671f2a19305Safresh1 SV **name= av_fetch_simple(name_list, nums[0], 0 ); 672f2a19305Safresh1 I32 n; 673f2a19305Safresh1 if (name) { 674f2a19305Safresh1 for ( n=0; n<SvIVX(sv_dat); n++ ) { 675f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf, 676f2a19305Safresh1 (n ? "," : ""), (IV)nums[n]); 677f2a19305Safresh1 } 678f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name)); 679f2a19305Safresh1 } 680f2a19305Safresh1 } 681f2a19305Safresh1 } else if (parno>0) { 682f2a19305Safresh1 UV logical_parno = parno; 683f2a19305Safresh1 if (prog->parno_to_logical) 684f2a19305Safresh1 logical_parno = prog->parno_to_logical[parno]; 685f2a19305Safresh1 686f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)logical_parno); /* Parenth number */ 687f2a19305Safresh1 if (logical_parno != parno) 688f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, "/%" UVuf, (UV)parno); /* Parenth number */ 689f2a19305Safresh1 690f2a19305Safresh1 } 691f2a19305Safresh1 if ( k == REF ) { 692f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, " <%" IVdf ">", (IV)ARG2i(o)); 693f2a19305Safresh1 } 694f2a19305Safresh1 if ( k == REF && reginfo) { 695f2a19305Safresh1 U32 n = ARG1u(o); /* which paren pair */ 696f2a19305Safresh1 I32 ln = RXp_OFFS_START(prog,n); 697f2a19305Safresh1 if (RXp_LASTPAREN(prog) < n || ln == -1 || RXp_OFFS_END(prog,n) == -1) 698f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, ": FAIL"); 699f2a19305Safresh1 else if (ln == RXp_OFFS_END(prog,n)) 700f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING"); 701f2a19305Safresh1 else { 702f2a19305Safresh1 const char *s = reginfo->strbeg + ln; 703f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, ": "); 704f2a19305Safresh1 Perl_pv_pretty( aTHX_ sv, s, RXp_OFFS_END(prog,n) - RXp_OFFS_START(prog,n), 32, 0, 0, 705f2a19305Safresh1 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE ); 706f2a19305Safresh1 } 707f2a19305Safresh1 } 708f2a19305Safresh1 } else if (k == GOSUB) { 709f2a19305Safresh1 AV *name_list= NULL; 710f2a19305Safresh1 IV parno = ARG1u(o); 711f2a19305Safresh1 IV logical_parno = (parno && prog->parno_to_logical) 712f2a19305Safresh1 ? prog->parno_to_logical[parno] 713f2a19305Safresh1 : parno; 714f2a19305Safresh1 if ( RXp_PAREN_NAMES(prog) ) { 715f2a19305Safresh1 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); 716f2a19305Safresh1 } else if ( pRExC_state ) { 717f2a19305Safresh1 name_list= RExC_paren_name_list; 718f2a19305Safresh1 } 719f2a19305Safresh1 720f2a19305Safresh1 /* Paren and offset */ 721f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, "%" IVdf, logical_parno); 722f2a19305Safresh1 if (logical_parno != parno) 723f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, "/%" IVdf, parno); 724f2a19305Safresh1 725f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, "[%+d:%d]", (int)ARG2i(o), 726f2a19305Safresh1 (int)((o + (int)ARG2i(o)) - progi->program) ); 727f2a19305Safresh1 if (name_list) { 728f2a19305Safresh1 SV **name= av_fetch_simple(name_list, ARG1u(o), 0 ); 729f2a19305Safresh1 if (name) 730f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name)); 731f2a19305Safresh1 } 732f2a19305Safresh1 } 733f2a19305Safresh1 else if (k == LOGICAL) 734f2a19305Safresh1 /* 2: embedded, otherwise 1 */ 735f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, "[%d]", FLAGS(o)); 736f2a19305Safresh1 else if (k == ANYOF || k == ANYOFH || k == ANYOFR) { 737f2a19305Safresh1 U8 flags; 738f2a19305Safresh1 char * bitmap; 739f2a19305Safresh1 U8 do_sep = 0; /* Do we need to separate various components of the 740f2a19305Safresh1 output? */ 741f2a19305Safresh1 /* Set if there is still an unresolved user-defined property */ 742f2a19305Safresh1 SV *unresolved = NULL; 743f2a19305Safresh1 744f2a19305Safresh1 /* Things that are ignored except when the runtime locale is UTF-8 */ 745f2a19305Safresh1 SV *only_utf8_locale_invlist = NULL; 746f2a19305Safresh1 747f2a19305Safresh1 /* Code points that don't fit in the bitmap */ 748f2a19305Safresh1 SV *nonbitmap_invlist = NULL; 749f2a19305Safresh1 750f2a19305Safresh1 /* And things that aren't in the bitmap, but are small enough to be */ 751f2a19305Safresh1 SV* bitmap_range_not_in_bitmap = NULL; 752f2a19305Safresh1 753f2a19305Safresh1 bool inverted; 754f2a19305Safresh1 755f2a19305Safresh1 if (k != ANYOF) { 756f2a19305Safresh1 flags = 0; 757f2a19305Safresh1 bitmap = NULL; 758f2a19305Safresh1 } 759f2a19305Safresh1 else { 760f2a19305Safresh1 flags = ANYOF_FLAGS(o); 761f2a19305Safresh1 bitmap = ANYOF_BITMAP(o); 762f2a19305Safresh1 } 763f2a19305Safresh1 764f2a19305Safresh1 if (op == ANYOFL || op == ANYOFPOSIXL) { 765f2a19305Safresh1 if ((flags & ANYOFL_UTF8_LOCALE_REQD)) { 766f2a19305Safresh1 sv_catpvs(sv, "{utf8-locale-reqd}"); 767f2a19305Safresh1 } 768f2a19305Safresh1 if (flags & ANYOFL_FOLD) { 769f2a19305Safresh1 sv_catpvs(sv, "{i}"); 770f2a19305Safresh1 } 771f2a19305Safresh1 } 772f2a19305Safresh1 773f2a19305Safresh1 inverted = flags & ANYOF_INVERT; 774f2a19305Safresh1 775f2a19305Safresh1 /* If there is stuff outside the bitmap, get it */ 776f2a19305Safresh1 if (k == ANYOFR) { 777f2a19305Safresh1 778f2a19305Safresh1 /* For a single range, split into the parts inside vs outside the 779f2a19305Safresh1 * bitmap. */ 780f2a19305Safresh1 UV start = ANYOFRbase(o); 781f2a19305Safresh1 UV end = ANYOFRbase(o) + ANYOFRdelta(o); 782f2a19305Safresh1 783f2a19305Safresh1 if (start < NUM_ANYOF_CODE_POINTS) { 784f2a19305Safresh1 if (end < NUM_ANYOF_CODE_POINTS) { 785f2a19305Safresh1 bitmap_range_not_in_bitmap 786f2a19305Safresh1 = _add_range_to_invlist(bitmap_range_not_in_bitmap, 787f2a19305Safresh1 start, end); 788f2a19305Safresh1 } 789f2a19305Safresh1 else { 790f2a19305Safresh1 bitmap_range_not_in_bitmap 791f2a19305Safresh1 = _add_range_to_invlist(bitmap_range_not_in_bitmap, 792f2a19305Safresh1 start, NUM_ANYOF_CODE_POINTS); 793f2a19305Safresh1 start = NUM_ANYOF_CODE_POINTS; 794f2a19305Safresh1 } 795f2a19305Safresh1 } 796f2a19305Safresh1 797f2a19305Safresh1 if (start >= NUM_ANYOF_CODE_POINTS) { 798f2a19305Safresh1 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist, 799f2a19305Safresh1 ANYOFRbase(o), 800f2a19305Safresh1 ANYOFRbase(o) + ANYOFRdelta(o)); 801f2a19305Safresh1 } 802f2a19305Safresh1 } 803f2a19305Safresh1 else if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(o)) { 804f2a19305Safresh1 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist, 805f2a19305Safresh1 NUM_ANYOF_CODE_POINTS, 806f2a19305Safresh1 UV_MAX); 807f2a19305Safresh1 } 808f2a19305Safresh1 else if (ANYOF_HAS_AUX(o)) { 809f2a19305Safresh1 (void) GET_REGCLASS_AUX_DATA(prog, o, FALSE, 810f2a19305Safresh1 &unresolved, 811f2a19305Safresh1 &only_utf8_locale_invlist, 812f2a19305Safresh1 &nonbitmap_invlist); 813f2a19305Safresh1 814f2a19305Safresh1 /* The aux data may contain stuff that could fit in the bitmap. 815f2a19305Safresh1 * This could come from a user-defined property being finally 816f2a19305Safresh1 * resolved when this call was done; or much more likely because 817f2a19305Safresh1 * there are matches that require UTF-8 to be valid, and so aren't 818f2a19305Safresh1 * in the bitmap (or ANYOFR). This is teased apart later */ 819f2a19305Safresh1 _invlist_intersection(nonbitmap_invlist, 820f2a19305Safresh1 PL_InBitmap, 821f2a19305Safresh1 &bitmap_range_not_in_bitmap); 822f2a19305Safresh1 /* Leave just the things that don't fit into the bitmap */ 823f2a19305Safresh1 _invlist_subtract(nonbitmap_invlist, 824f2a19305Safresh1 PL_InBitmap, 825f2a19305Safresh1 &nonbitmap_invlist); 826f2a19305Safresh1 } 827f2a19305Safresh1 828f2a19305Safresh1 /* Ready to start outputting. First, the initial left bracket */ 829f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); 830f2a19305Safresh1 831f2a19305Safresh1 if ( bitmap 832f2a19305Safresh1 || bitmap_range_not_in_bitmap 833f2a19305Safresh1 || only_utf8_locale_invlist 834f2a19305Safresh1 || unresolved) 835f2a19305Safresh1 { 836f2a19305Safresh1 /* Then all the things that could fit in the bitmap */ 837f2a19305Safresh1 do_sep = put_charclass_bitmap_innards( 838f2a19305Safresh1 sv, 839f2a19305Safresh1 bitmap, 840f2a19305Safresh1 bitmap_range_not_in_bitmap, 841f2a19305Safresh1 only_utf8_locale_invlist, 842f2a19305Safresh1 o, 843f2a19305Safresh1 flags, 844f2a19305Safresh1 845f2a19305Safresh1 /* Can't try inverting for a 846f2a19305Safresh1 * better display if there 847f2a19305Safresh1 * are things that haven't 848f2a19305Safresh1 * been resolved */ 849f2a19305Safresh1 (unresolved != NULL || k == ANYOFR)); 850f2a19305Safresh1 SvREFCNT_dec(bitmap_range_not_in_bitmap); 851f2a19305Safresh1 852f2a19305Safresh1 /* If there are user-defined properties which haven't been defined 853f2a19305Safresh1 * yet, output them. If the result is not to be inverted, it is 854f2a19305Safresh1 * clearest to output them in a separate [] from the bitmap range 855f2a19305Safresh1 * stuff. If the result is to be complemented, we have to show 856f2a19305Safresh1 * everything in one [], as the inversion applies to the whole 857f2a19305Safresh1 * thing. Use {braces} to separate them from anything in the 858f2a19305Safresh1 * bitmap and anything above the bitmap. */ 859f2a19305Safresh1 if (unresolved) { 860f2a19305Safresh1 if (inverted) { 861f2a19305Safresh1 if (! do_sep) { /* If didn't output anything in the bitmap 862f2a19305Safresh1 */ 863f2a19305Safresh1 sv_catpvs(sv, "^"); 864f2a19305Safresh1 } 865f2a19305Safresh1 sv_catpvs(sv, "{"); 866f2a19305Safresh1 } 867f2a19305Safresh1 else if (do_sep) { 868f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], 869f2a19305Safresh1 PL_colors[0]); 870f2a19305Safresh1 } 871f2a19305Safresh1 sv_catsv(sv, unresolved); 872f2a19305Safresh1 if (inverted) { 873f2a19305Safresh1 sv_catpvs(sv, "}"); 874f2a19305Safresh1 } 875f2a19305Safresh1 do_sep = ! inverted; 876f2a19305Safresh1 } 877f2a19305Safresh1 else if ( do_sep == 2 878f2a19305Safresh1 && ! nonbitmap_invlist 879f2a19305Safresh1 && ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(o)) 880f2a19305Safresh1 { 881f2a19305Safresh1 /* Here, the display shows the class as inverted, and 882f2a19305Safresh1 * everything above the lower display should also match, but 883f2a19305Safresh1 * there is no indication of that. Add this range so the code 884f2a19305Safresh1 * below will add it to the display */ 885f2a19305Safresh1 _invlist_union_complement_2nd(nonbitmap_invlist, 886f2a19305Safresh1 PL_InBitmap, 887f2a19305Safresh1 &nonbitmap_invlist); 888f2a19305Safresh1 } 889f2a19305Safresh1 } 890f2a19305Safresh1 891f2a19305Safresh1 /* And, finally, add the above-the-bitmap stuff */ 892f2a19305Safresh1 if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) { 893f2a19305Safresh1 SV* contents; 894f2a19305Safresh1 895f2a19305Safresh1 /* See if truncation size is overridden */ 896f2a19305Safresh1 const STRLEN dump_len = (PL_dump_re_max_len > 256) 897f2a19305Safresh1 ? PL_dump_re_max_len 898f2a19305Safresh1 : 256; 899f2a19305Safresh1 900f2a19305Safresh1 /* This is output in a separate [] */ 901f2a19305Safresh1 if (do_sep) { 902f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]); 903f2a19305Safresh1 } 904f2a19305Safresh1 905f2a19305Safresh1 /* And, for easy of understanding, it is shown in the 906f2a19305Safresh1 * uncomplemented form if possible. The one exception being if 907f2a19305Safresh1 * there are unresolved items, where the inversion has to be 908f2a19305Safresh1 * delayed until runtime */ 909f2a19305Safresh1 if (inverted && ! unresolved) { 910f2a19305Safresh1 _invlist_invert(nonbitmap_invlist); 911f2a19305Safresh1 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist); 912f2a19305Safresh1 } 913f2a19305Safresh1 914f2a19305Safresh1 contents = invlist_contents(nonbitmap_invlist, 915f2a19305Safresh1 FALSE /* output suitable for catsv */ 916f2a19305Safresh1 ); 917f2a19305Safresh1 918f2a19305Safresh1 /* If the output is shorter than the permissible maximum, just do it. */ 919f2a19305Safresh1 if (SvCUR(contents) <= dump_len) { 920f2a19305Safresh1 sv_catsv(sv, contents); 921f2a19305Safresh1 } 922f2a19305Safresh1 else { 923f2a19305Safresh1 const char * contents_string = SvPVX(contents); 924f2a19305Safresh1 STRLEN i = dump_len; 925f2a19305Safresh1 926f2a19305Safresh1 /* Otherwise, start at the permissible max and work back to the 927f2a19305Safresh1 * first break possibility */ 928f2a19305Safresh1 while (i > 0 && contents_string[i] != ' ') { 929f2a19305Safresh1 i--; 930f2a19305Safresh1 } 931f2a19305Safresh1 if (i == 0) { /* Fail-safe. Use the max if we couldn't 932f2a19305Safresh1 find a legal break */ 933f2a19305Safresh1 i = dump_len; 934f2a19305Safresh1 } 935f2a19305Safresh1 936f2a19305Safresh1 sv_catpvn(sv, contents_string, i); 937f2a19305Safresh1 sv_catpvs(sv, "..."); 938f2a19305Safresh1 } 939f2a19305Safresh1 940f2a19305Safresh1 SvREFCNT_dec_NN(contents); 941f2a19305Safresh1 SvREFCNT_dec_NN(nonbitmap_invlist); 942f2a19305Safresh1 } 943f2a19305Safresh1 944f2a19305Safresh1 /* And finally the matching, closing ']' */ 945f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); 946f2a19305Safresh1 947f2a19305Safresh1 if (op == ANYOFHs) { 948f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, " (Leading UTF-8 bytes=%s", _byte_dump_string((U8 *) ((struct regnode_anyofhs *) o)->string, FLAGS(o), 1)); 949f2a19305Safresh1 } 950f2a19305Safresh1 else if (REGNODE_TYPE(op) != ANYOF) { 951f2a19305Safresh1 U8 lowest = (op != ANYOFHr) 952f2a19305Safresh1 ? FLAGS(o) 953f2a19305Safresh1 : LOWEST_ANYOF_HRx_BYTE(FLAGS(o)); 954f2a19305Safresh1 U8 highest = (op == ANYOFHr) 955f2a19305Safresh1 ? HIGHEST_ANYOF_HRx_BYTE(FLAGS(o)) 956f2a19305Safresh1 : (op == ANYOFH || op == ANYOFR) 957f2a19305Safresh1 ? 0xFF 958f2a19305Safresh1 : lowest; 959f2a19305Safresh1 #ifndef EBCDIC 960f2a19305Safresh1 if (op != ANYOFR || ! isASCII(ANYOFRbase(o) + ANYOFRdelta(o))) 961f2a19305Safresh1 #endif 962f2a19305Safresh1 { 963f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest); 964f2a19305Safresh1 if (lowest != highest) { 965f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, "-%02X", highest); 966f2a19305Safresh1 } 967f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, ")"); 968f2a19305Safresh1 } 969f2a19305Safresh1 } 970f2a19305Safresh1 971f2a19305Safresh1 SvREFCNT_dec(unresolved); 972f2a19305Safresh1 } 973f2a19305Safresh1 else if (k == ANYOFM) { 974f2a19305Safresh1 SV * cp_list = get_ANYOFM_contents(o); 975f2a19305Safresh1 976f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); 977f2a19305Safresh1 if (op == NANYOFM) { 978f2a19305Safresh1 _invlist_invert(cp_list); 979f2a19305Safresh1 } 980f2a19305Safresh1 981f2a19305Safresh1 put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, 0, TRUE); 982f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); 983f2a19305Safresh1 984f2a19305Safresh1 SvREFCNT_dec(cp_list); 985f2a19305Safresh1 } 986f2a19305Safresh1 else if (k == ANYOFHbbm) { 987f2a19305Safresh1 SV * cp_list = get_ANYOFHbbm_contents(o); 988f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); 989f2a19305Safresh1 990f2a19305Safresh1 sv_catsv(sv, invlist_contents(cp_list, 991f2a19305Safresh1 FALSE /* output suitable for catsv */ 992f2a19305Safresh1 )); 993f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); 994f2a19305Safresh1 995f2a19305Safresh1 SvREFCNT_dec(cp_list); 996f2a19305Safresh1 } 997f2a19305Safresh1 else if (k == POSIXD || k == NPOSIXD) { 998f2a19305Safresh1 U8 index = FLAGS(o) * 2; 999f2a19305Safresh1 if (index < C_ARRAY_LENGTH(anyofs)) { 1000f2a19305Safresh1 if (*anyofs[index] != '[') { 1001f2a19305Safresh1 sv_catpvs(sv, "["); 1002f2a19305Safresh1 } 1003f2a19305Safresh1 sv_catpv(sv, anyofs[index]); 1004f2a19305Safresh1 if (*anyofs[index] != '[') { 1005f2a19305Safresh1 sv_catpvs(sv, "]"); 1006f2a19305Safresh1 } 1007f2a19305Safresh1 } 1008f2a19305Safresh1 else { 1009f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); 1010f2a19305Safresh1 } 1011f2a19305Safresh1 } 1012f2a19305Safresh1 else if (k == BOUND || k == NBOUND) { 1013f2a19305Safresh1 /* Must be synced with order of 'bound_type' in regcomp.h */ 1014f2a19305Safresh1 const char * const bounds[] = { 1015f2a19305Safresh1 "", /* Traditional */ 1016f2a19305Safresh1 "{gcb}", 1017f2a19305Safresh1 "{lb}", 1018f2a19305Safresh1 "{sb}", 1019f2a19305Safresh1 "{wb}" 1020f2a19305Safresh1 }; 1021f2a19305Safresh1 assert(FLAGS(o) < C_ARRAY_LENGTH(bounds)); 1022f2a19305Safresh1 sv_catpv(sv, bounds[FLAGS(o)]); 1023f2a19305Safresh1 } 1024f2a19305Safresh1 else if (k == BRANCHJ && (op == UNLESSM || op == IFMATCH)) { 1025f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, "[%d", -(FLAGS(o))); 1026f2a19305Safresh1 if (NEXT_OFF(o)) { 1027f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, "..-%d", FLAGS(o) - NEXT_OFF(o)); 1028f2a19305Safresh1 } 1029f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, "]"); 1030f2a19305Safresh1 } 1031f2a19305Safresh1 else if (op == SBOL) 1032f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, " /%s/", FLAGS(o) ? "\\A" : "^"); 1033f2a19305Safresh1 else if (op == EVAL) { 1034f2a19305Safresh1 if (FLAGS(o) & EVAL_OPTIMISTIC_FLAG) 1035f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, " optimistic"); 1036f2a19305Safresh1 } 1037f2a19305Safresh1 1038f2a19305Safresh1 /* add on the verb argument if there is one */ 1039f2a19305Safresh1 if ( ( k == VERB || op == ACCEPT || op == OPFAIL ) && FLAGS(o)) { 1040f2a19305Safresh1 if ( ARG1u(o) ) 1041f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, ":%" SVf, 1042f2a19305Safresh1 SVfARG((MUTABLE_SV(progi->data->data[ ARG1u( o ) ])))); 1043f2a19305Safresh1 else 1044f2a19305Safresh1 sv_catpvs(sv, ":NULL"); 1045f2a19305Safresh1 } 1046f2a19305Safresh1 #else 1047f2a19305Safresh1 PERL_UNUSED_CONTEXT; 1048f2a19305Safresh1 PERL_UNUSED_ARG(sv); 1049f2a19305Safresh1 PERL_UNUSED_ARG(o); 1050f2a19305Safresh1 PERL_UNUSED_ARG(prog); 1051f2a19305Safresh1 PERL_UNUSED_ARG(reginfo); 1052f2a19305Safresh1 PERL_UNUSED_ARG(pRExC_state); 1053f2a19305Safresh1 #endif /* DEBUGGING */ 1054f2a19305Safresh1 } 1055f2a19305Safresh1 1056f2a19305Safresh1 1057*5486feefSafresh1 #ifdef DEBUGGING 1058f2a19305Safresh1 STATIC void 1059f2a19305Safresh1 S_put_code_point(pTHX_ SV *sv, UV c) 1060f2a19305Safresh1 { 1061f2a19305Safresh1 PERL_ARGS_ASSERT_PUT_CODE_POINT; 1062f2a19305Safresh1 1063f2a19305Safresh1 if (c > 255) { 1064f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c); 1065f2a19305Safresh1 } 1066f2a19305Safresh1 else if (isPRINT(c)) { 1067f2a19305Safresh1 const char string = (char) c; 1068f2a19305Safresh1 1069f2a19305Safresh1 /* We use {phrase} as metanotation in the class, so also escape literal 1070f2a19305Safresh1 * braces */ 1071f2a19305Safresh1 if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}') 1072f2a19305Safresh1 sv_catpvs(sv, "\\"); 1073f2a19305Safresh1 sv_catpvn(sv, &string, 1); 1074f2a19305Safresh1 } 1075f2a19305Safresh1 else if (isMNEMONIC_CNTRL(c)) { 1076f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c)); 1077f2a19305Safresh1 } 1078f2a19305Safresh1 else { 1079f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c); 1080f2a19305Safresh1 } 1081f2a19305Safresh1 } 1082f2a19305Safresh1 1083f2a19305Safresh1 STATIC void 1084f2a19305Safresh1 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) 1085f2a19305Safresh1 { 1086f2a19305Safresh1 /* Appends to 'sv' a displayable version of the range of code points from 1087f2a19305Safresh1 * 'start' to 'end'. Mnemonics (like '\r') are used for the few controls 1088f2a19305Safresh1 * that have them, when they occur at the beginning or end of the range. 1089f2a19305Safresh1 * It uses hex to output the remaining code points, unless 'allow_literals' 1090f2a19305Safresh1 * is true, in which case the printable ASCII ones are output as-is (though 1091f2a19305Safresh1 * some of these will be escaped by put_code_point()). 1092f2a19305Safresh1 * 1093f2a19305Safresh1 * NOTE: This is designed only for printing ranges of code points that fit 1094f2a19305Safresh1 * inside an ANYOF bitmap. Higher code points are simply suppressed 1095f2a19305Safresh1 */ 1096f2a19305Safresh1 1097f2a19305Safresh1 const unsigned int min_range_count = 3; 1098f2a19305Safresh1 1099f2a19305Safresh1 assert(start <= end); 1100f2a19305Safresh1 1101f2a19305Safresh1 PERL_ARGS_ASSERT_PUT_RANGE; 1102f2a19305Safresh1 1103f2a19305Safresh1 while (start <= end) { 1104f2a19305Safresh1 UV this_end; 1105f2a19305Safresh1 const char * format; 1106f2a19305Safresh1 1107f2a19305Safresh1 if ( end - start < min_range_count 1108f2a19305Safresh1 && (end - start <= 2 || (isPRINT_A(start) && isPRINT_A(end)))) 1109f2a19305Safresh1 { 1110f2a19305Safresh1 /* Output a range of 1 or 2 chars individually, or longer ranges 1111f2a19305Safresh1 * when printable */ 1112f2a19305Safresh1 for (; start <= end; start++) { 1113f2a19305Safresh1 put_code_point(sv, start); 1114f2a19305Safresh1 } 1115f2a19305Safresh1 break; 1116f2a19305Safresh1 } 1117f2a19305Safresh1 1118f2a19305Safresh1 /* If permitted by the input options, and there is a possibility that 1119f2a19305Safresh1 * this range contains a printable literal, look to see if there is 1120f2a19305Safresh1 * one. */ 1121f2a19305Safresh1 if (allow_literals && start <= MAX_PRINT_A) { 1122f2a19305Safresh1 1123f2a19305Safresh1 /* If the character at the beginning of the range isn't an ASCII 1124f2a19305Safresh1 * printable, effectively split the range into two parts: 1125f2a19305Safresh1 * 1) the portion before the first such printable, 1126f2a19305Safresh1 * 2) the rest 1127f2a19305Safresh1 * and output them separately. */ 1128f2a19305Safresh1 if (! isPRINT_A(start)) { 1129f2a19305Safresh1 UV temp_end = start + 1; 1130f2a19305Safresh1 1131f2a19305Safresh1 /* There is no point looking beyond the final possible 1132f2a19305Safresh1 * printable, in MAX_PRINT_A */ 1133f2a19305Safresh1 UV max = MIN(end, MAX_PRINT_A); 1134f2a19305Safresh1 1135f2a19305Safresh1 while (temp_end <= max && ! isPRINT_A(temp_end)) { 1136f2a19305Safresh1 temp_end++; 1137f2a19305Safresh1 } 1138f2a19305Safresh1 1139f2a19305Safresh1 /* Here, temp_end points to one beyond the first printable if 1140f2a19305Safresh1 * found, or to one beyond 'max' if not. If none found, make 1141f2a19305Safresh1 * sure that we use the entire range */ 1142f2a19305Safresh1 if (temp_end > MAX_PRINT_A) { 1143f2a19305Safresh1 temp_end = end + 1; 1144f2a19305Safresh1 } 1145f2a19305Safresh1 1146f2a19305Safresh1 /* Output the first part of the split range: the part that 1147f2a19305Safresh1 * doesn't have printables, with the parameter set to not look 1148f2a19305Safresh1 * for literals (otherwise we would infinitely recurse) */ 1149f2a19305Safresh1 put_range(sv, start, temp_end - 1, FALSE); 1150f2a19305Safresh1 1151f2a19305Safresh1 /* The 2nd part of the range (if any) starts here. */ 1152f2a19305Safresh1 start = temp_end; 1153f2a19305Safresh1 1154f2a19305Safresh1 /* We do a continue, instead of dropping down, because even if 1155f2a19305Safresh1 * the 2nd part is non-empty, it could be so short that we want 1156f2a19305Safresh1 * to output it as individual characters, as tested for at the 1157f2a19305Safresh1 * top of this loop. */ 1158f2a19305Safresh1 continue; 1159f2a19305Safresh1 } 1160f2a19305Safresh1 1161f2a19305Safresh1 /* Here, 'start' is a printable ASCII. If it is an alphanumeric, 1162f2a19305Safresh1 * output a sub-range of just the digits or letters, then process 1163f2a19305Safresh1 * the remaining portion as usual. */ 1164f2a19305Safresh1 if (isALPHANUMERIC_A(start)) { 1165f2a19305Safresh1 UV mask = (isDIGIT_A(start)) 1166f2a19305Safresh1 ? CC_DIGIT_ 1167f2a19305Safresh1 : isUPPER_A(start) 1168f2a19305Safresh1 ? CC_UPPER_ 1169f2a19305Safresh1 : CC_LOWER_; 1170f2a19305Safresh1 UV temp_end = start + 1; 1171f2a19305Safresh1 1172f2a19305Safresh1 /* Find the end of the sub-range that includes just the 1173f2a19305Safresh1 * characters in the same class as the first character in it */ 1174f2a19305Safresh1 while (temp_end <= end && generic_isCC_A_(temp_end, mask)) { 1175f2a19305Safresh1 temp_end++; 1176f2a19305Safresh1 } 1177f2a19305Safresh1 temp_end--; 1178f2a19305Safresh1 1179f2a19305Safresh1 /* For short ranges, don't duplicate the code above to output 1180f2a19305Safresh1 * them; just call recursively */ 1181f2a19305Safresh1 if (temp_end - start < min_range_count) { 1182f2a19305Safresh1 put_range(sv, start, temp_end, FALSE); 1183f2a19305Safresh1 } 1184f2a19305Safresh1 else { /* Output as a range */ 1185f2a19305Safresh1 put_code_point(sv, start); 1186f2a19305Safresh1 sv_catpvs(sv, "-"); 1187f2a19305Safresh1 put_code_point(sv, temp_end); 1188f2a19305Safresh1 } 1189f2a19305Safresh1 start = temp_end + 1; 1190f2a19305Safresh1 continue; 1191f2a19305Safresh1 } 1192f2a19305Safresh1 1193f2a19305Safresh1 /* We output any other printables as individual characters */ 1194f2a19305Safresh1 if (isPUNCT_A(start) || isSPACE_A(start)) { 1195f2a19305Safresh1 while (start <= end && (isPUNCT_A(start) 1196f2a19305Safresh1 || isSPACE_A(start))) 1197f2a19305Safresh1 { 1198f2a19305Safresh1 put_code_point(sv, start); 1199f2a19305Safresh1 start++; 1200f2a19305Safresh1 } 1201f2a19305Safresh1 continue; 1202f2a19305Safresh1 } 1203f2a19305Safresh1 } /* End of looking for literals */ 1204f2a19305Safresh1 1205f2a19305Safresh1 /* Here is not to output as a literal. Some control characters have 1206f2a19305Safresh1 * mnemonic names. Split off any of those at the beginning and end of 1207f2a19305Safresh1 * the range to print mnemonically. It isn't possible for many of 1208f2a19305Safresh1 * these to be in a row, so this won't overwhelm with output */ 1209f2a19305Safresh1 if ( start <= end 1210f2a19305Safresh1 && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end))) 1211f2a19305Safresh1 { 1212f2a19305Safresh1 while (isMNEMONIC_CNTRL(start) && start <= end) { 1213f2a19305Safresh1 put_code_point(sv, start); 1214f2a19305Safresh1 start++; 1215f2a19305Safresh1 } 1216f2a19305Safresh1 1217f2a19305Safresh1 /* If this didn't take care of the whole range ... */ 1218f2a19305Safresh1 if (start <= end) { 1219f2a19305Safresh1 1220f2a19305Safresh1 /* Look backwards from the end to find the final non-mnemonic 1221f2a19305Safresh1 * */ 1222f2a19305Safresh1 UV temp_end = end; 1223f2a19305Safresh1 while (isMNEMONIC_CNTRL(temp_end)) { 1224f2a19305Safresh1 temp_end--; 1225f2a19305Safresh1 } 1226f2a19305Safresh1 1227f2a19305Safresh1 /* And separately output the interior range that doesn't start 1228f2a19305Safresh1 * or end with mnemonics */ 1229f2a19305Safresh1 put_range(sv, start, temp_end, FALSE); 1230f2a19305Safresh1 1231f2a19305Safresh1 /* Then output the mnemonic trailing controls */ 1232f2a19305Safresh1 start = temp_end + 1; 1233f2a19305Safresh1 while (start <= end) { 1234f2a19305Safresh1 put_code_point(sv, start); 1235f2a19305Safresh1 start++; 1236f2a19305Safresh1 } 1237f2a19305Safresh1 break; 1238f2a19305Safresh1 } 1239f2a19305Safresh1 } 1240f2a19305Safresh1 1241f2a19305Safresh1 /* As a final resort, output the range or subrange as hex. */ 1242f2a19305Safresh1 1243f2a19305Safresh1 if (start >= NUM_ANYOF_CODE_POINTS) { 1244f2a19305Safresh1 this_end = end; 1245f2a19305Safresh1 } 1246f2a19305Safresh1 else { /* Have to split range at the bitmap boundary */ 1247f2a19305Safresh1 this_end = (end < NUM_ANYOF_CODE_POINTS) 1248f2a19305Safresh1 ? end 1249f2a19305Safresh1 : NUM_ANYOF_CODE_POINTS - 1; 1250f2a19305Safresh1 } 1251f2a19305Safresh1 #if NUM_ANYOF_CODE_POINTS > 256 1252f2a19305Safresh1 format = (this_end < 256) 1253f2a19305Safresh1 ? "\\x%02" UVXf "-\\x%02" UVXf 1254f2a19305Safresh1 : "\\x{%04" UVXf "}-\\x{%04" UVXf "}"; 1255f2a19305Safresh1 #else 1256f2a19305Safresh1 format = "\\x%02" UVXf "-\\x%02" UVXf; 1257f2a19305Safresh1 #endif 1258f2a19305Safresh1 GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral); 1259f2a19305Safresh1 Perl_sv_catpvf(aTHX_ sv, format, start, this_end); 1260f2a19305Safresh1 GCC_DIAG_RESTORE_STMT; 1261f2a19305Safresh1 break; 1262f2a19305Safresh1 } 1263f2a19305Safresh1 } 1264f2a19305Safresh1 1265f2a19305Safresh1 STATIC void 1266f2a19305Safresh1 S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist) 1267f2a19305Safresh1 { 1268f2a19305Safresh1 /* Concatenate onto the PV in 'sv' a displayable form of the inversion list 1269f2a19305Safresh1 * 'invlist' */ 1270f2a19305Safresh1 1271f2a19305Safresh1 UV start, end; 1272f2a19305Safresh1 bool allow_literals = TRUE; 1273f2a19305Safresh1 1274f2a19305Safresh1 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST; 1275f2a19305Safresh1 1276f2a19305Safresh1 /* Generally, it is more readable if printable characters are output as 1277f2a19305Safresh1 * literals, but if a range (nearly) spans all of them, it's best to output 1278f2a19305Safresh1 * it as a single range. This code will use a single range if all but 2 1279f2a19305Safresh1 * ASCII printables are in it */ 1280f2a19305Safresh1 invlist_iterinit(invlist); 1281f2a19305Safresh1 while (invlist_iternext(invlist, &start, &end)) { 1282f2a19305Safresh1 1283f2a19305Safresh1 /* If the range starts beyond the final printable, it doesn't have any 1284f2a19305Safresh1 * in it */ 1285f2a19305Safresh1 if (start > MAX_PRINT_A) { 1286f2a19305Safresh1 break; 1287f2a19305Safresh1 } 1288f2a19305Safresh1 1289f2a19305Safresh1 /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span 1290f2a19305Safresh1 * all but two, the range must start and end no later than 2 from 1291f2a19305Safresh1 * either end */ 1292f2a19305Safresh1 if (start < ' ' + 2 && end > MAX_PRINT_A - 2) { 1293f2a19305Safresh1 if (end > MAX_PRINT_A) { 1294f2a19305Safresh1 end = MAX_PRINT_A; 1295f2a19305Safresh1 } 1296f2a19305Safresh1 if (start < ' ') { 1297f2a19305Safresh1 start = ' '; 1298f2a19305Safresh1 } 1299f2a19305Safresh1 if (end - start >= MAX_PRINT_A - ' ' - 2) { 1300f2a19305Safresh1 allow_literals = FALSE; 1301f2a19305Safresh1 } 1302f2a19305Safresh1 break; 1303f2a19305Safresh1 } 1304f2a19305Safresh1 } 1305f2a19305Safresh1 invlist_iterfinish(invlist); 1306f2a19305Safresh1 1307f2a19305Safresh1 /* Here we have figured things out. Output each range */ 1308f2a19305Safresh1 invlist_iterinit(invlist); 1309f2a19305Safresh1 while (invlist_iternext(invlist, &start, &end)) { 1310f2a19305Safresh1 if (start >= NUM_ANYOF_CODE_POINTS) { 1311f2a19305Safresh1 break; 1312f2a19305Safresh1 } 1313f2a19305Safresh1 put_range(sv, start, end, allow_literals); 1314f2a19305Safresh1 } 1315f2a19305Safresh1 invlist_iterfinish(invlist); 1316f2a19305Safresh1 1317f2a19305Safresh1 return; 1318f2a19305Safresh1 } 1319f2a19305Safresh1 1320f2a19305Safresh1 STATIC SV* 1321f2a19305Safresh1 S_put_charclass_bitmap_innards_common(pTHX_ 1322f2a19305Safresh1 SV* invlist, /* The bitmap */ 1323f2a19305Safresh1 SV* posixes, /* Under /l, things like [:word:], \S */ 1324f2a19305Safresh1 SV* only_utf8, /* Under /d, matches iff the target is UTF-8 */ 1325f2a19305Safresh1 SV* not_utf8, /* /d, matches iff the target isn't UTF-8 */ 1326f2a19305Safresh1 SV* only_utf8_locale, /* Under /l, matches if the locale is UTF-8 */ 1327f2a19305Safresh1 const bool invert /* Is the result to be inverted? */ 1328f2a19305Safresh1 ) 1329f2a19305Safresh1 { 1330f2a19305Safresh1 /* Create and return an SV containing a displayable version of the bitmap 1331f2a19305Safresh1 * and associated information determined by the input parameters. If the 1332f2a19305Safresh1 * output would have been only the inversion indicator '^', NULL is instead 1333f2a19305Safresh1 * returned. */ 1334f2a19305Safresh1 1335f2a19305Safresh1 SV * output; 1336f2a19305Safresh1 1337f2a19305Safresh1 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON; 1338f2a19305Safresh1 1339f2a19305Safresh1 if (invert) { 1340f2a19305Safresh1 output = newSVpvs("^"); 1341f2a19305Safresh1 } 1342f2a19305Safresh1 else { 1343f2a19305Safresh1 output = newSVpvs(""); 1344f2a19305Safresh1 } 1345f2a19305Safresh1 1346f2a19305Safresh1 /* First, the code points in the bitmap that are unconditionally there */ 1347f2a19305Safresh1 put_charclass_bitmap_innards_invlist(output, invlist); 1348f2a19305Safresh1 1349f2a19305Safresh1 /* Traditionally, these have been placed after the main code points */ 1350f2a19305Safresh1 if (posixes) { 1351f2a19305Safresh1 sv_catsv(output, posixes); 1352f2a19305Safresh1 } 1353f2a19305Safresh1 1354f2a19305Safresh1 if (only_utf8 && _invlist_len(only_utf8)) { 1355f2a19305Safresh1 Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]); 1356f2a19305Safresh1 put_charclass_bitmap_innards_invlist(output, only_utf8); 1357f2a19305Safresh1 } 1358f2a19305Safresh1 1359f2a19305Safresh1 if (not_utf8 && _invlist_len(not_utf8)) { 1360f2a19305Safresh1 Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]); 1361f2a19305Safresh1 put_charclass_bitmap_innards_invlist(output, not_utf8); 1362f2a19305Safresh1 } 1363f2a19305Safresh1 1364f2a19305Safresh1 if (only_utf8_locale && _invlist_len(only_utf8_locale)) { 1365f2a19305Safresh1 Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]); 1366f2a19305Safresh1 put_charclass_bitmap_innards_invlist(output, only_utf8_locale); 1367f2a19305Safresh1 1368f2a19305Safresh1 /* This is the only list in this routine that can legally contain code 1369f2a19305Safresh1 * points outside the bitmap range. The call just above to 1370f2a19305Safresh1 * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so 1371f2a19305Safresh1 * output them here. There's about a half-dozen possible, and none in 1372f2a19305Safresh1 * contiguous ranges longer than 2 */ 1373f2a19305Safresh1 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) { 1374f2a19305Safresh1 UV start, end; 1375f2a19305Safresh1 SV* above_bitmap = NULL; 1376f2a19305Safresh1 1377f2a19305Safresh1 _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap); 1378f2a19305Safresh1 1379f2a19305Safresh1 invlist_iterinit(above_bitmap); 1380f2a19305Safresh1 while (invlist_iternext(above_bitmap, &start, &end)) { 1381f2a19305Safresh1 UV i; 1382f2a19305Safresh1 1383f2a19305Safresh1 for (i = start; i <= end; i++) { 1384f2a19305Safresh1 put_code_point(output, i); 1385f2a19305Safresh1 } 1386f2a19305Safresh1 } 1387f2a19305Safresh1 invlist_iterfinish(above_bitmap); 1388f2a19305Safresh1 SvREFCNT_dec_NN(above_bitmap); 1389f2a19305Safresh1 } 1390f2a19305Safresh1 } 1391f2a19305Safresh1 1392f2a19305Safresh1 if (invert && SvCUR(output) == 1) { 1393f2a19305Safresh1 return NULL; 1394f2a19305Safresh1 } 1395f2a19305Safresh1 1396f2a19305Safresh1 return output; 1397f2a19305Safresh1 } 1398f2a19305Safresh1 1399f2a19305Safresh1 STATIC U8 1400f2a19305Safresh1 S_put_charclass_bitmap_innards(pTHX_ SV *sv, 1401f2a19305Safresh1 char *bitmap, 1402f2a19305Safresh1 SV *nonbitmap_invlist, 1403f2a19305Safresh1 SV *only_utf8_locale_invlist, 1404f2a19305Safresh1 const regnode * const node, 1405f2a19305Safresh1 const U8 flags, 1406f2a19305Safresh1 const bool force_as_is_display) 1407f2a19305Safresh1 { 1408f2a19305Safresh1 /* Appends to 'sv' a displayable version of the innards of the bracketed 1409f2a19305Safresh1 * character class defined by the other arguments: 1410f2a19305Safresh1 * 'bitmap' points to the bitmap, or NULL if to ignore that. 1411f2a19305Safresh1 * 'nonbitmap_invlist' is an inversion list of the code points that are in 1412f2a19305Safresh1 * the bitmap range, but for some reason aren't in the bitmap; NULL if 1413f2a19305Safresh1 * none. The reasons for this could be that they require some 1414f2a19305Safresh1 * condition such as the target string being or not being in UTF-8 1415f2a19305Safresh1 * (under /d), or because they came from a user-defined property that 1416f2a19305Safresh1 * was not resolved at the time of the regex compilation (under /u) 1417f2a19305Safresh1 * 'only_utf8_locale_invlist' is an inversion list of the code points that 1418f2a19305Safresh1 * are valid only if the runtime locale is a UTF-8 one; NULL if none 1419f2a19305Safresh1 * 'node' is the regex pattern ANYOF node. It is needed only when the 1420f2a19305Safresh1 * above two parameters are not null, and is passed so that this 1421f2a19305Safresh1 * routine can tease apart the various reasons for them. 1422f2a19305Safresh1 * 'flags' is the flags field of 'node' 1423f2a19305Safresh1 * 'force_as_is_display' is TRUE if this routine should definitely NOT try 1424f2a19305Safresh1 * to invert things to see if that leads to a cleaner display. If 1425f2a19305Safresh1 * FALSE, this routine is free to use its judgment about doing this. 1426f2a19305Safresh1 * 1427f2a19305Safresh1 * It returns 0 if nothing was actually output. (It may be that 1428f2a19305Safresh1 * the bitmap, etc is empty.) 1429f2a19305Safresh1 * 1 if the output wasn't inverted (didn't begin with a '^') 1430f2a19305Safresh1 * 2 if the output was inverted (did begin with a '^') 1431f2a19305Safresh1 * 1432f2a19305Safresh1 * When called for outputting the bitmap of a non-ANYOF node, just pass the 1433f2a19305Safresh1 * bitmap, with the succeeding parameters set to NULL, and the final one to 1434f2a19305Safresh1 * FALSE. 1435f2a19305Safresh1 */ 1436f2a19305Safresh1 1437f2a19305Safresh1 /* In general, it tries to display the 'cleanest' representation of the 1438f2a19305Safresh1 * innards, choosing whether to display them inverted or not, regardless of 1439f2a19305Safresh1 * whether the class itself is to be inverted. However, there are some 1440f2a19305Safresh1 * cases where it can't try inverting, as what actually matches isn't known 1441f2a19305Safresh1 * until runtime, and hence the inversion isn't either. */ 1442f2a19305Safresh1 1443f2a19305Safresh1 bool inverting_allowed = ! force_as_is_display; 1444f2a19305Safresh1 1445f2a19305Safresh1 int i; 1446f2a19305Safresh1 STRLEN orig_sv_cur = SvCUR(sv); 1447f2a19305Safresh1 1448f2a19305Safresh1 SV* invlist; /* Inversion list we accumulate of code points that 1449f2a19305Safresh1 are unconditionally matched */ 1450f2a19305Safresh1 SV* only_utf8 = NULL; /* Under /d, list of matches iff the target is 1451f2a19305Safresh1 UTF-8 */ 1452f2a19305Safresh1 SV* not_utf8 = NULL; /* /d, list of matches iff the target isn't UTF-8 1453f2a19305Safresh1 */ 1454f2a19305Safresh1 SV* posixes = NULL; /* Under /l, string of things like [:word:], \D */ 1455f2a19305Safresh1 SV* only_utf8_locale = NULL; /* Under /l, list of matches if the locale 1456f2a19305Safresh1 is UTF-8 */ 1457f2a19305Safresh1 1458f2a19305Safresh1 SV* as_is_display; /* The output string when we take the inputs 1459f2a19305Safresh1 literally */ 1460f2a19305Safresh1 SV* inverted_display; /* The output string when we invert the inputs */ 1461f2a19305Safresh1 1462f2a19305Safresh1 bool invert = cBOOL(flags & ANYOF_INVERT); /* Is the input to be inverted 1463f2a19305Safresh1 to match? */ 1464f2a19305Safresh1 /* We are biased in favor of displaying things without them being inverted, 1465f2a19305Safresh1 * as that is generally easier to understand */ 1466f2a19305Safresh1 const int bias = 5; 1467f2a19305Safresh1 1468f2a19305Safresh1 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS; 1469f2a19305Safresh1 1470f2a19305Safresh1 /* Start off with whatever code points are passed in. (We clone, so we 1471f2a19305Safresh1 * don't change the caller's list) */ 1472f2a19305Safresh1 if (nonbitmap_invlist) { 1473f2a19305Safresh1 assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS); 1474f2a19305Safresh1 invlist = invlist_clone(nonbitmap_invlist, NULL); 1475f2a19305Safresh1 } 1476f2a19305Safresh1 else { /* Worst case size is every other code point is matched */ 1477f2a19305Safresh1 invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2); 1478f2a19305Safresh1 } 1479f2a19305Safresh1 1480f2a19305Safresh1 if (flags) { 1481f2a19305Safresh1 if (OP(node) == ANYOFD) { 1482f2a19305Safresh1 1483f2a19305Safresh1 /* This flag indicates that the code points below 0x100 in the 1484f2a19305Safresh1 * nonbitmap list are precisely the ones that match only when the 1485f2a19305Safresh1 * target is UTF-8 (they should all be non-ASCII). */ 1486f2a19305Safresh1 if (flags & ANYOF_HAS_EXTRA_RUNTIME_MATCHES) { 1487f2a19305Safresh1 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8); 1488f2a19305Safresh1 _invlist_subtract(invlist, only_utf8, &invlist); 1489f2a19305Safresh1 } 1490f2a19305Safresh1 1491f2a19305Safresh1 /* And this flag for matching all non-ASCII 0xFF and below */ 1492f2a19305Safresh1 if (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared) { 1493f2a19305Safresh1 not_utf8 = invlist_clone(PL_UpperLatin1, NULL); 1494f2a19305Safresh1 } 1495f2a19305Safresh1 } 1496f2a19305Safresh1 else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) { 1497f2a19305Safresh1 1498f2a19305Safresh1 /* If either of these flags are set, what matches isn't 1499f2a19305Safresh1 * determinable except during execution, so don't know enough here 1500f2a19305Safresh1 * to invert */ 1501f2a19305Safresh1 if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) { 1502f2a19305Safresh1 inverting_allowed = FALSE; 1503f2a19305Safresh1 } 1504f2a19305Safresh1 1505f2a19305Safresh1 /* What the posix classes match also varies at runtime, so these 1506f2a19305Safresh1 * will be output symbolically. */ 1507f2a19305Safresh1 if (ANYOF_POSIXL_TEST_ANY_SET(node)) { 1508f2a19305Safresh1 int i; 1509f2a19305Safresh1 1510f2a19305Safresh1 posixes = newSVpvs(""); 1511f2a19305Safresh1 for (i = 0; i < ANYOF_POSIXL_MAX; i++) { 1512f2a19305Safresh1 if (ANYOF_POSIXL_TEST(node, i)) { 1513f2a19305Safresh1 sv_catpv(posixes, anyofs[i]); 1514f2a19305Safresh1 } 1515f2a19305Safresh1 } 1516f2a19305Safresh1 } 1517f2a19305Safresh1 } 1518f2a19305Safresh1 } 1519f2a19305Safresh1 1520f2a19305Safresh1 /* Accumulate the bit map into the unconditional match list */ 1521f2a19305Safresh1 if (bitmap) { 1522f2a19305Safresh1 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) { 1523f2a19305Safresh1 if (BITMAP_TEST(bitmap, i)) { 1524f2a19305Safresh1 int start = i++; 1525f2a19305Safresh1 for (; 1526f2a19305Safresh1 i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i); 1527f2a19305Safresh1 i++) 1528f2a19305Safresh1 { /* empty */ } 1529f2a19305Safresh1 invlist = _add_range_to_invlist(invlist, start, i-1); 1530f2a19305Safresh1 } 1531f2a19305Safresh1 } 1532f2a19305Safresh1 } 1533f2a19305Safresh1 1534f2a19305Safresh1 /* Make sure that the conditional match lists don't have anything in them 1535f2a19305Safresh1 * that match unconditionally; otherwise the output is quite confusing. 1536f2a19305Safresh1 * This could happen if the code that populates these misses some 1537f2a19305Safresh1 * duplication. */ 1538f2a19305Safresh1 if (only_utf8) { 1539f2a19305Safresh1 _invlist_subtract(only_utf8, invlist, &only_utf8); 1540f2a19305Safresh1 } 1541f2a19305Safresh1 if (not_utf8) { 1542f2a19305Safresh1 _invlist_subtract(not_utf8, invlist, ¬_utf8); 1543f2a19305Safresh1 } 1544f2a19305Safresh1 1545f2a19305Safresh1 if (only_utf8_locale_invlist) { 1546f2a19305Safresh1 1547f2a19305Safresh1 /* Since this list is passed in, we have to make a copy before 1548f2a19305Safresh1 * modifying it */ 1549f2a19305Safresh1 only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL); 1550f2a19305Safresh1 1551f2a19305Safresh1 _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale); 1552f2a19305Safresh1 1553f2a19305Safresh1 /* And, it can get really weird for us to try outputting an inverted 1554f2a19305Safresh1 * form of this list when it has things above the bitmap, so don't even 1555f2a19305Safresh1 * try */ 1556f2a19305Safresh1 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) { 1557f2a19305Safresh1 inverting_allowed = FALSE; 1558f2a19305Safresh1 } 1559f2a19305Safresh1 } 1560f2a19305Safresh1 1561f2a19305Safresh1 /* Calculate what the output would be if we take the input as-is */ 1562f2a19305Safresh1 as_is_display = put_charclass_bitmap_innards_common(invlist, 1563f2a19305Safresh1 posixes, 1564f2a19305Safresh1 only_utf8, 1565f2a19305Safresh1 not_utf8, 1566f2a19305Safresh1 only_utf8_locale, 1567f2a19305Safresh1 invert); 1568f2a19305Safresh1 1569f2a19305Safresh1 /* If have to take the output as-is, just do that */ 1570f2a19305Safresh1 if (! inverting_allowed) { 1571f2a19305Safresh1 if (as_is_display) { 1572f2a19305Safresh1 sv_catsv(sv, as_is_display); 1573f2a19305Safresh1 SvREFCNT_dec_NN(as_is_display); 1574f2a19305Safresh1 } 1575f2a19305Safresh1 } 1576f2a19305Safresh1 else { /* But otherwise, create the output again on the inverted input, and 1577f2a19305Safresh1 use whichever version is shorter */ 1578f2a19305Safresh1 1579f2a19305Safresh1 int inverted_bias, as_is_bias; 1580f2a19305Safresh1 1581f2a19305Safresh1 /* We will apply our bias to whichever of the results doesn't have 1582f2a19305Safresh1 * the '^' */ 1583f2a19305Safresh1 bool trial_invert; 1584f2a19305Safresh1 if (invert) { 1585f2a19305Safresh1 trial_invert = FALSE; 1586f2a19305Safresh1 as_is_bias = bias; 1587f2a19305Safresh1 inverted_bias = 0; 1588f2a19305Safresh1 } 1589f2a19305Safresh1 else { 1590f2a19305Safresh1 trial_invert = TRUE; 1591f2a19305Safresh1 as_is_bias = 0; 1592f2a19305Safresh1 inverted_bias = bias; 1593f2a19305Safresh1 } 1594f2a19305Safresh1 1595f2a19305Safresh1 /* Now invert each of the lists that contribute to the output, 1596f2a19305Safresh1 * excluding from the result things outside the possible range */ 1597f2a19305Safresh1 1598f2a19305Safresh1 /* For the unconditional inversion list, we have to add in all the 1599f2a19305Safresh1 * conditional code points, so that when inverted, they will be gone 1600f2a19305Safresh1 * from it */ 1601f2a19305Safresh1 _invlist_union(only_utf8, invlist, &invlist); 1602f2a19305Safresh1 _invlist_union(not_utf8, invlist, &invlist); 1603f2a19305Safresh1 _invlist_union(only_utf8_locale, invlist, &invlist); 1604f2a19305Safresh1 _invlist_invert(invlist); 1605f2a19305Safresh1 _invlist_intersection(invlist, PL_InBitmap, &invlist); 1606f2a19305Safresh1 1607f2a19305Safresh1 if (only_utf8) { 1608f2a19305Safresh1 _invlist_invert(only_utf8); 1609f2a19305Safresh1 _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8); 1610f2a19305Safresh1 } 1611f2a19305Safresh1 else if (not_utf8) { 1612f2a19305Safresh1 1613f2a19305Safresh1 /* If a code point matches iff the target string is not in UTF-8, 1614f2a19305Safresh1 * then complementing the result has it not match iff not in UTF-8, 1615f2a19305Safresh1 * which is the same thing as matching iff it is UTF-8. */ 1616f2a19305Safresh1 only_utf8 = not_utf8; 1617f2a19305Safresh1 not_utf8 = NULL; 1618f2a19305Safresh1 } 1619f2a19305Safresh1 1620f2a19305Safresh1 if (only_utf8_locale) { 1621f2a19305Safresh1 _invlist_invert(only_utf8_locale); 1622f2a19305Safresh1 _invlist_intersection(only_utf8_locale, 1623f2a19305Safresh1 PL_InBitmap, 1624f2a19305Safresh1 &only_utf8_locale); 1625f2a19305Safresh1 } 1626f2a19305Safresh1 1627f2a19305Safresh1 inverted_display = put_charclass_bitmap_innards_common( 1628f2a19305Safresh1 invlist, 1629f2a19305Safresh1 posixes, 1630f2a19305Safresh1 only_utf8, 1631f2a19305Safresh1 not_utf8, 1632f2a19305Safresh1 only_utf8_locale, trial_invert); 1633f2a19305Safresh1 1634f2a19305Safresh1 /* Use the shortest representation, taking into account our bias 1635f2a19305Safresh1 * against showing it inverted */ 1636f2a19305Safresh1 if ( inverted_display 1637f2a19305Safresh1 && ( ! as_is_display 1638f2a19305Safresh1 || ( SvCUR(inverted_display) + inverted_bias 1639f2a19305Safresh1 < SvCUR(as_is_display) + as_is_bias))) 1640f2a19305Safresh1 { 1641f2a19305Safresh1 sv_catsv(sv, inverted_display); 1642f2a19305Safresh1 invert = ! invert; 1643f2a19305Safresh1 } 1644f2a19305Safresh1 else if (as_is_display) { 1645f2a19305Safresh1 sv_catsv(sv, as_is_display); 1646f2a19305Safresh1 } 1647f2a19305Safresh1 1648f2a19305Safresh1 SvREFCNT_dec(as_is_display); 1649f2a19305Safresh1 SvREFCNT_dec(inverted_display); 1650f2a19305Safresh1 } 1651f2a19305Safresh1 1652f2a19305Safresh1 SvREFCNT_dec_NN(invlist); 1653f2a19305Safresh1 SvREFCNT_dec(only_utf8); 1654f2a19305Safresh1 SvREFCNT_dec(not_utf8); 1655f2a19305Safresh1 SvREFCNT_dec(posixes); 1656f2a19305Safresh1 SvREFCNT_dec(only_utf8_locale); 1657f2a19305Safresh1 1658f2a19305Safresh1 U8 did_output_something = (bool) (SvCUR(sv) > orig_sv_cur); 1659f2a19305Safresh1 if (did_output_something) { 1660f2a19305Safresh1 /* Distinguish between non and inverted cases */ 1661f2a19305Safresh1 did_output_something += invert; 1662f2a19305Safresh1 } 1663f2a19305Safresh1 1664f2a19305Safresh1 return did_output_something; 1665f2a19305Safresh1 } 1666f2a19305Safresh1 #endif /* DEBUGGING */ 1667