xref: /openbsd-src/gnu/usr.bin/perl/regcomp_debug.c (revision 5486feefcc8cb79b19e014ab332cc5dfd05b3b33)
1f2a19305Safresh1 #ifdef PERL_EXT_RE_BUILD
2f2a19305Safresh1 #include "re_top.h"
3f2a19305Safresh1 #endif
4f2a19305Safresh1 
5f2a19305Safresh1 #include "EXTERN.h"
6f2a19305Safresh1 #define PERL_IN_REGEX_ENGINE
7f2a19305Safresh1 #define PERL_IN_REGCOMP_ANY
8f2a19305Safresh1 #define PERL_IN_REGCOMP_DEBUG_C
9f2a19305Safresh1 #include "perl.h"
10f2a19305Safresh1 
11f2a19305Safresh1 #ifdef PERL_IN_XSUB_RE
12f2a19305Safresh1 #  include "re_comp.h"
13f2a19305Safresh1 #else
14f2a19305Safresh1 #  include "regcomp.h"
15f2a19305Safresh1 #endif
16f2a19305Safresh1 
17f2a19305Safresh1 #include "invlist_inline.h"
18f2a19305Safresh1 #include "unicode_constants.h"
19f2a19305Safresh1 #include "regcomp_internal.h"
20f2a19305Safresh1 
21*5486feefSafresh1 #ifdef PERL_RE_BUILD_DEBUG
22f2a19305Safresh1 int
23f2a19305Safresh1 Perl_re_printf(pTHX_ const char *fmt, ...)
24f2a19305Safresh1 {
25f2a19305Safresh1     va_list ap;
26f2a19305Safresh1     int result;
27f2a19305Safresh1     PerlIO *f= Perl_debug_log;
28f2a19305Safresh1     PERL_ARGS_ASSERT_RE_PRINTF;
29f2a19305Safresh1     va_start(ap, fmt);
30f2a19305Safresh1     result = PerlIO_vprintf(f, fmt, ap);
31f2a19305Safresh1     va_end(ap);
32f2a19305Safresh1     return result;
33f2a19305Safresh1 }
34f2a19305Safresh1 
35f2a19305Safresh1 int
36f2a19305Safresh1 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
37f2a19305Safresh1 {
38f2a19305Safresh1     va_list ap;
39f2a19305Safresh1     int result;
40f2a19305Safresh1     PerlIO *f= Perl_debug_log;
41f2a19305Safresh1     PERL_ARGS_ASSERT_RE_INDENTF;
42f2a19305Safresh1     va_start(ap, depth);
43f2a19305Safresh1     PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
44f2a19305Safresh1     result = PerlIO_vprintf(f, fmt, ap);
45f2a19305Safresh1     va_end(ap);
46f2a19305Safresh1     return result;
47f2a19305Safresh1 }
48f2a19305Safresh1 
49f2a19305Safresh1 void
50f2a19305Safresh1 Perl_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
51f2a19305Safresh1                                     const char *close_str)
52f2a19305Safresh1 {
53f2a19305Safresh1     PERL_ARGS_ASSERT_DEBUG_SHOW_STUDY_FLAGS;
54f2a19305Safresh1     if (!flags)
55f2a19305Safresh1         return;
56f2a19305Safresh1 
57f2a19305Safresh1     Perl_re_printf( aTHX_  "%s", open_str);
58f2a19305Safresh1     DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
59f2a19305Safresh1     DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
60f2a19305Safresh1     DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
61f2a19305Safresh1     DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
62f2a19305Safresh1     DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
63f2a19305Safresh1     DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
64f2a19305Safresh1     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
65f2a19305Safresh1     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
66f2a19305Safresh1     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
67f2a19305Safresh1     DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
68f2a19305Safresh1     DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
69f2a19305Safresh1     DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
70f2a19305Safresh1     DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
71f2a19305Safresh1     DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
72f2a19305Safresh1     DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
73f2a19305Safresh1     Perl_re_printf( aTHX_  "%s", close_str);
74f2a19305Safresh1 }
75f2a19305Safresh1 
76f2a19305Safresh1 void
77f2a19305Safresh1 Perl_debug_studydata(pTHX_ const char *where, scan_data_t *data,
78f2a19305Safresh1                     U32 depth, int is_inf,
79f2a19305Safresh1                     SSize_t min, SSize_t stopmin, SSize_t delta)
80f2a19305Safresh1 {
81f2a19305Safresh1     PERL_ARGS_ASSERT_DEBUG_STUDYDATA;
82f2a19305Safresh1     DECLARE_AND_GET_RE_DEBUG_FLAGS;
83f2a19305Safresh1 
84f2a19305Safresh1     DEBUG_OPTIMISE_MORE_r({
85f2a19305Safresh1         if (!data) {
86f2a19305Safresh1             Perl_re_indentf(aTHX_  "%s: NO DATA",
87f2a19305Safresh1                 depth,
88f2a19305Safresh1                 where);
89f2a19305Safresh1             return;
90f2a19305Safresh1         }
91f2a19305Safresh1         Perl_re_indentf(aTHX_  "%s: M/S/D: %" IVdf "/%" IVdf "/%" IVdf " Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
92f2a19305Safresh1             depth,
93f2a19305Safresh1             where,
94f2a19305Safresh1             min, stopmin, delta,
95f2a19305Safresh1             (IV)data->pos_min,
96f2a19305Safresh1             (IV)data->pos_delta,
97f2a19305Safresh1             (UV)data->flags
98f2a19305Safresh1         );
99f2a19305Safresh1 
100f2a19305Safresh1         Perl_debug_show_study_flags(aTHX_ data->flags," [","]");
101f2a19305Safresh1 
102f2a19305Safresh1         Perl_re_printf( aTHX_
103f2a19305Safresh1             " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
104f2a19305Safresh1             (IV)data->whilem_c,
105f2a19305Safresh1             (IV)(data->last_closep ? *((data)->last_closep) : -1),
106f2a19305Safresh1             is_inf ? "INF " : ""
107f2a19305Safresh1         );
108f2a19305Safresh1 
109f2a19305Safresh1         if (data->last_found) {
110f2a19305Safresh1             int i;
111f2a19305Safresh1             Perl_re_printf(aTHX_
112f2a19305Safresh1                 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
113f2a19305Safresh1                     SvPVX_const(data->last_found),
114f2a19305Safresh1                     (IV)data->last_end,
115f2a19305Safresh1                     (IV)data->last_start_min,
116f2a19305Safresh1                     (IV)data->last_start_max
117f2a19305Safresh1             );
118f2a19305Safresh1 
119f2a19305Safresh1             for (i = 0; i < 2; i++) {
120f2a19305Safresh1                 Perl_re_printf(aTHX_
121f2a19305Safresh1                     " %s%s: '%s' @ %" IVdf "/%" IVdf,
122f2a19305Safresh1                     data->cur_is_floating == i ? "*" : "",
123f2a19305Safresh1                     i ? "Float" : "Fixed",
124f2a19305Safresh1                     SvPVX_const(data->substrs[i].str),
125f2a19305Safresh1                     (IV)data->substrs[i].min_offset,
126f2a19305Safresh1                     (IV)data->substrs[i].max_offset
127f2a19305Safresh1                 );
128f2a19305Safresh1                 Perl_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
129f2a19305Safresh1             }
130f2a19305Safresh1         }
131f2a19305Safresh1 
132f2a19305Safresh1         Perl_re_printf( aTHX_ "\n");
133f2a19305Safresh1     });
134f2a19305Safresh1 }
135f2a19305Safresh1 
136f2a19305Safresh1 
137f2a19305Safresh1 void
138f2a19305Safresh1 Perl_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
139f2a19305Safresh1                 regnode *scan, U32 depth, U32 flags)
140f2a19305Safresh1 {
141f2a19305Safresh1     PERL_ARGS_ASSERT_DEBUG_PEEP;
142f2a19305Safresh1     DECLARE_AND_GET_RE_DEBUG_FLAGS;
143f2a19305Safresh1 
144f2a19305Safresh1     DEBUG_OPTIMISE_r({
145f2a19305Safresh1         regnode *Next;
146f2a19305Safresh1 
147f2a19305Safresh1         if (!scan)
148f2a19305Safresh1             return;
149f2a19305Safresh1         Next = regnext(scan);
150f2a19305Safresh1         regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
151f2a19305Safresh1         Perl_re_indentf( aTHX_   "%s>%3d: %s (%d)",
152f2a19305Safresh1             depth,
153f2a19305Safresh1             str,
154f2a19305Safresh1             REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
155f2a19305Safresh1             Next ? (REG_NODE_NUM(Next)) : 0 );
156f2a19305Safresh1         Perl_debug_show_study_flags(aTHX_ flags," [ ","]");
157f2a19305Safresh1         Perl_re_printf( aTHX_  "\n");
158f2a19305Safresh1    });
159f2a19305Safresh1 }
160f2a19305Safresh1 
161*5486feefSafresh1 const regnode *
162*5486feefSafresh1 Perl_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
163*5486feefSafresh1             const regnode *last, const regnode *plast,
164*5486feefSafresh1             SV* sv, I32 indent, U32 depth)
165*5486feefSafresh1 {
166*5486feefSafresh1     const regnode *next;
167*5486feefSafresh1     const regnode *optstart= NULL;
168*5486feefSafresh1 
169*5486feefSafresh1     RXi_GET_DECL(r, ri);
170*5486feefSafresh1     DECLARE_AND_GET_RE_DEBUG_FLAGS;
171*5486feefSafresh1 
172*5486feefSafresh1     PERL_ARGS_ASSERT_DUMPUNTIL;
173*5486feefSafresh1 
174*5486feefSafresh1 #ifdef DEBUG_DUMPUNTIL
175*5486feefSafresh1     Perl_re_printf( aTHX_  "--- %d : %d - %d - %d\n", indent, node-start,
176*5486feefSafresh1         last ? last-start : 0, plast ? plast-start : 0);
177*5486feefSafresh1 #endif
178*5486feefSafresh1 
179*5486feefSafresh1     if (plast && plast < last)
180*5486feefSafresh1         last= plast;
181*5486feefSafresh1 
182*5486feefSafresh1     while (node && (!last || node < last)) {
183*5486feefSafresh1         const U8 op = OP(node);
184*5486feefSafresh1 
185*5486feefSafresh1         if (op == CLOSE || op == SRCLOSE || op == WHILEM)
186*5486feefSafresh1             indent--;
187*5486feefSafresh1         next = regnext((regnode *)node);
188*5486feefSafresh1         const regnode *after = regnode_after((regnode *)node,0);
189*5486feefSafresh1 
190*5486feefSafresh1         /* Where, what. */
191*5486feefSafresh1         if (op == OPTIMIZED) {
192*5486feefSafresh1             if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
193*5486feefSafresh1                 optstart = node;
194*5486feefSafresh1             else
195*5486feefSafresh1                 goto after_print;
196*5486feefSafresh1         } else
197*5486feefSafresh1             CLEAR_OPTSTART;
198*5486feefSafresh1 
199*5486feefSafresh1         regprop(r, sv, node, NULL, NULL);
200*5486feefSafresh1         Perl_re_printf( aTHX_  "%4" IVdf ":%*s%s", (IV)(node - start),
201*5486feefSafresh1                       (int)(2*indent + 1), "", SvPVX_const(sv));
202*5486feefSafresh1 
203*5486feefSafresh1         if (op != OPTIMIZED) {
204*5486feefSafresh1             if (next == NULL)           /* Next ptr. */
205*5486feefSafresh1                 Perl_re_printf( aTHX_  " (0)");
206*5486feefSafresh1             else if (REGNODE_TYPE(op) == BRANCH
207*5486feefSafresh1                      && REGNODE_TYPE(OP(next)) != BRANCH )
208*5486feefSafresh1                 Perl_re_printf( aTHX_  " (FAIL)");
209*5486feefSafresh1             else
210*5486feefSafresh1                 Perl_re_printf( aTHX_  " (%" IVdf ")", (IV)(next - start));
211*5486feefSafresh1             Perl_re_printf( aTHX_ "\n");
212*5486feefSafresh1         }
213*5486feefSafresh1 
214*5486feefSafresh1       after_print:
215*5486feefSafresh1         if (REGNODE_TYPE(op) == BRANCHJ) {
216*5486feefSafresh1             assert(next);
217*5486feefSafresh1             const regnode *nnode = (OP(next) == LONGJMP
218*5486feefSafresh1                                    ? regnext((regnode *)next)
219*5486feefSafresh1                                    : next);
220*5486feefSafresh1             if (last && nnode > last)
221*5486feefSafresh1                 nnode = last;
222*5486feefSafresh1             DUMPUNTIL(after, nnode);
223*5486feefSafresh1         }
224*5486feefSafresh1         else if (REGNODE_TYPE(op) == BRANCH) {
225*5486feefSafresh1             assert(next);
226*5486feefSafresh1             DUMPUNTIL(after, next);
227*5486feefSafresh1         }
228*5486feefSafresh1         else if ( REGNODE_TYPE(op)  == TRIE ) {
229*5486feefSafresh1             const regnode *this_trie = node;
230*5486feefSafresh1             const U32 n = ARG1u(node);
231*5486feefSafresh1             const reg_ac_data * const ac = op>=AHOCORASICK ?
232*5486feefSafresh1                (reg_ac_data *)ri->data->data[n] :
233*5486feefSafresh1                NULL;
234*5486feefSafresh1             const reg_trie_data * const trie =
235*5486feefSafresh1                 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
236*5486feefSafresh1 #ifdef DEBUGGING
237*5486feefSafresh1             AV *const trie_words
238*5486feefSafresh1                            = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
239*5486feefSafresh1 #endif
240*5486feefSafresh1             const regnode *nextbranch= NULL;
241*5486feefSafresh1             I32 word_idx;
242*5486feefSafresh1             SvPVCLEAR(sv);
243*5486feefSafresh1             for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
244*5486feefSafresh1                 SV ** const elem_ptr = av_fetch_simple(trie_words, word_idx, 0);
245*5486feefSafresh1 
246*5486feefSafresh1                 Perl_re_indentf( aTHX_  "%s ",
247*5486feefSafresh1                     indent+3,
248*5486feefSafresh1                     elem_ptr
249*5486feefSafresh1                     ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
250*5486feefSafresh1                                 SvCUR(*elem_ptr), PL_dump_re_max_len,
251*5486feefSafresh1                                 PL_colors[0], PL_colors[1],
252*5486feefSafresh1                                 (SvUTF8(*elem_ptr)
253*5486feefSafresh1                                  ? PERL_PV_ESCAPE_UNI
254*5486feefSafresh1                                  : 0)
255*5486feefSafresh1                                 | PERL_PV_PRETTY_ELLIPSES
256*5486feefSafresh1                                 | PERL_PV_PRETTY_LTGT
257*5486feefSafresh1                             )
258*5486feefSafresh1                     : "???"
259*5486feefSafresh1                 );
260*5486feefSafresh1                 if (trie->jump) {
261*5486feefSafresh1                     U16 dist= trie->jump[word_idx+1];
262*5486feefSafresh1                     Perl_re_printf( aTHX_  "(%" UVuf ")\n",
263*5486feefSafresh1                                (UV)((dist ? this_trie + dist : next) - start));
264*5486feefSafresh1                     if (dist) {
265*5486feefSafresh1                         if (!nextbranch)
266*5486feefSafresh1                             nextbranch= this_trie + trie->jump[0];
267*5486feefSafresh1                         DUMPUNTIL(this_trie + dist, nextbranch);
268*5486feefSafresh1                     }
269*5486feefSafresh1                     if (nextbranch && REGNODE_TYPE(OP(nextbranch))==BRANCH)
270*5486feefSafresh1                         nextbranch= regnext((regnode *)nextbranch);
271*5486feefSafresh1                 } else {
272*5486feefSafresh1                     Perl_re_printf( aTHX_  "\n");
273*5486feefSafresh1                 }
274*5486feefSafresh1             }
275*5486feefSafresh1             if (last && next > last)
276*5486feefSafresh1                 node= last;
277*5486feefSafresh1             else
278*5486feefSafresh1                 node= next;
279*5486feefSafresh1         }
280*5486feefSafresh1         else if ( op == CURLY ) {   /* "next" might be very big: optimizer */
281*5486feefSafresh1             DUMPUNTIL(after, after + 1); /* +1 is NOT a REGNODE_AFTER */
282*5486feefSafresh1         }
283*5486feefSafresh1         else if (REGNODE_TYPE(op) == CURLY && op != CURLYX) {
284*5486feefSafresh1             assert(next);
285*5486feefSafresh1             DUMPUNTIL(after, next);
286*5486feefSafresh1         }
287*5486feefSafresh1         else if ( op == PLUS || op == STAR) {
288*5486feefSafresh1             DUMPUNTIL(after, after + 1); /* +1 NOT a REGNODE_AFTER */
289*5486feefSafresh1         }
290*5486feefSafresh1         else if (REGNODE_TYPE(op) == EXACT || op == ANYOFHs) {
291*5486feefSafresh1             /* Literal string, where present. */
292*5486feefSafresh1             node = (const regnode *)REGNODE_AFTER_varies(node);
293*5486feefSafresh1         }
294*5486feefSafresh1         else {
295*5486feefSafresh1             node = REGNODE_AFTER_opcode(node,op);
296*5486feefSafresh1         }
297*5486feefSafresh1         if (op == CURLYX || op == OPEN || op == SROPEN)
298*5486feefSafresh1             indent++;
299*5486feefSafresh1         if (REGNODE_TYPE(op) == END)
300*5486feefSafresh1             break;
301*5486feefSafresh1     }
302*5486feefSafresh1     CLEAR_OPTSTART;
303*5486feefSafresh1 #ifdef DEBUG_DUMPUNTIL
304*5486feefSafresh1     Perl_re_printf( aTHX_  "--- %d\n", (int)indent);
305*5486feefSafresh1 #endif
306*5486feefSafresh1     return node;
307*5486feefSafresh1 }
308*5486feefSafresh1 
309*5486feefSafresh1 #endif  /* PERL_RE_BUILD_DEBUG */
310f2a19305Safresh1 
311f2a19305Safresh1 /*
312f2a19305Safresh1  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
313f2a19305Safresh1  */
314f2a19305Safresh1 #ifdef DEBUGGING
315f2a19305Safresh1 static void
316f2a19305Safresh1 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
317f2a19305Safresh1 {
318f2a19305Safresh1     int bit;
319f2a19305Safresh1     int set=0;
320f2a19305Safresh1 
321*5486feefSafresh1     STATIC_ASSERT_STMT(REG_INTFLAGS_NAME_SIZE <= sizeof(flags) * CHARBITS);
322f2a19305Safresh1 
323*5486feefSafresh1     for (bit=0; bit < REG_INTFLAGS_NAME_SIZE; bit++) {
324f2a19305Safresh1         if (flags & (1<<bit)) {
325f2a19305Safresh1             if (!set++ && lead)
326f2a19305Safresh1                 Perl_re_printf( aTHX_  "%s", lead);
327f2a19305Safresh1             Perl_re_printf( aTHX_  "%s ", PL_reg_intflags_name[bit]);
328f2a19305Safresh1         }
329f2a19305Safresh1     }
330f2a19305Safresh1     if (lead)  {
331f2a19305Safresh1         if (set)
332f2a19305Safresh1             Perl_re_printf( aTHX_  "\n");
333f2a19305Safresh1         else
334f2a19305Safresh1             Perl_re_printf( aTHX_  "%s[none-set]\n", lead);
335f2a19305Safresh1     }
336f2a19305Safresh1 }
337f2a19305Safresh1 
338f2a19305Safresh1 static void
339f2a19305Safresh1 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
340f2a19305Safresh1 {
341f2a19305Safresh1     int bit;
342f2a19305Safresh1     int set=0;
343f2a19305Safresh1     regex_charset cs;
344f2a19305Safresh1 
345*5486feefSafresh1     STATIC_ASSERT_STMT(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags) * CHARBITS);
346f2a19305Safresh1 
347f2a19305Safresh1     for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
348f2a19305Safresh1         if (flags & (1U<<bit)) {
349f2a19305Safresh1             if ((1U<<bit) & RXf_PMf_CHARSET) {  /* Output separately, below */
350f2a19305Safresh1                 continue;
351f2a19305Safresh1             }
352f2a19305Safresh1             if (!set++ && lead)
353f2a19305Safresh1                 Perl_re_printf( aTHX_  "%s", lead);
354f2a19305Safresh1             Perl_re_printf( aTHX_  "%s ", PL_reg_extflags_name[bit]);
355f2a19305Safresh1         }
356f2a19305Safresh1     }
357f2a19305Safresh1     if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
358f2a19305Safresh1             if (!set++ && lead) {
359f2a19305Safresh1                 Perl_re_printf( aTHX_  "%s", lead);
360f2a19305Safresh1             }
361f2a19305Safresh1             switch (cs) {
362f2a19305Safresh1                 case REGEX_UNICODE_CHARSET:
363f2a19305Safresh1                     Perl_re_printf( aTHX_  "UNICODE");
364f2a19305Safresh1                     break;
365f2a19305Safresh1                 case REGEX_LOCALE_CHARSET:
366f2a19305Safresh1                     Perl_re_printf( aTHX_  "LOCALE");
367f2a19305Safresh1                     break;
368f2a19305Safresh1                 case REGEX_ASCII_RESTRICTED_CHARSET:
369f2a19305Safresh1                     Perl_re_printf( aTHX_  "ASCII-RESTRICTED");
370f2a19305Safresh1                     break;
371f2a19305Safresh1                 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
372f2a19305Safresh1                     Perl_re_printf( aTHX_  "ASCII-MORE_RESTRICTED");
373f2a19305Safresh1                     break;
374f2a19305Safresh1                 default:
375f2a19305Safresh1                     Perl_re_printf( aTHX_  "UNKNOWN CHARACTER SET");
376f2a19305Safresh1                     break;
377f2a19305Safresh1             }
378f2a19305Safresh1     }
379f2a19305Safresh1     if (lead)  {
380f2a19305Safresh1         if (set)
381f2a19305Safresh1             Perl_re_printf( aTHX_  "\n");
382f2a19305Safresh1         else
383f2a19305Safresh1             Perl_re_printf( aTHX_  "%s[none-set]\n", lead);
384f2a19305Safresh1     }
385f2a19305Safresh1 }
386f2a19305Safresh1 #endif
387f2a19305Safresh1 
388f2a19305Safresh1 void
389f2a19305Safresh1 Perl_regdump(pTHX_ const regexp *r)
390f2a19305Safresh1 {
391f2a19305Safresh1 #ifdef DEBUGGING
392f2a19305Safresh1     int i;
393f2a19305Safresh1     SV * const sv = sv_newmortal();
394f2a19305Safresh1     SV *dsv= sv_newmortal();
395f2a19305Safresh1     RXi_GET_DECL(r, ri);
396f2a19305Safresh1     DECLARE_AND_GET_RE_DEBUG_FLAGS;
397f2a19305Safresh1 
398f2a19305Safresh1     PERL_ARGS_ASSERT_REGDUMP;
399f2a19305Safresh1 
400f2a19305Safresh1     (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
401f2a19305Safresh1 
402f2a19305Safresh1     /* Header fields of interest. */
403f2a19305Safresh1     for (i = 0; i < 2; i++) {
404f2a19305Safresh1         if (r->substrs->data[i].substr) {
405f2a19305Safresh1             RE_PV_QUOTED_DECL(s, 0, dsv,
406f2a19305Safresh1                             SvPVX_const(r->substrs->data[i].substr),
407f2a19305Safresh1                             RE_SV_DUMPLEN(r->substrs->data[i].substr),
408f2a19305Safresh1                             PL_dump_re_max_len);
409f2a19305Safresh1             Perl_re_printf( aTHX_
410f2a19305Safresh1                           "%s %s%s at %" IVdf "..%" UVuf " ",
411f2a19305Safresh1                           i ? "floating" : "anchored",
412f2a19305Safresh1                           s,
413f2a19305Safresh1                           RE_SV_TAIL(r->substrs->data[i].substr),
414f2a19305Safresh1                           (IV)r->substrs->data[i].min_offset,
415f2a19305Safresh1                           (UV)r->substrs->data[i].max_offset);
416f2a19305Safresh1         }
417f2a19305Safresh1         else if (r->substrs->data[i].utf8_substr) {
418f2a19305Safresh1             RE_PV_QUOTED_DECL(s, 1, dsv,
419f2a19305Safresh1                             SvPVX_const(r->substrs->data[i].utf8_substr),
420f2a19305Safresh1                             RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr),
421f2a19305Safresh1                             30);
422f2a19305Safresh1             Perl_re_printf( aTHX_
423f2a19305Safresh1                           "%s utf8 %s%s at %" IVdf "..%" UVuf " ",
424f2a19305Safresh1                           i ? "floating" : "anchored",
425f2a19305Safresh1                           s,
426f2a19305Safresh1                           RE_SV_TAIL(r->substrs->data[i].utf8_substr),
427f2a19305Safresh1                           (IV)r->substrs->data[i].min_offset,
428f2a19305Safresh1                           (UV)r->substrs->data[i].max_offset);
429f2a19305Safresh1         }
430f2a19305Safresh1     }
431f2a19305Safresh1 
432f2a19305Safresh1     if (r->check_substr || r->check_utf8)
433f2a19305Safresh1         Perl_re_printf( aTHX_
434f2a19305Safresh1                       (const char *)
435f2a19305Safresh1                       (   r->check_substr == r->substrs->data[1].substr
436f2a19305Safresh1                        && r->check_utf8   == r->substrs->data[1].utf8_substr
437f2a19305Safresh1                        ? "(checking floating" : "(checking anchored"));
438f2a19305Safresh1     if (r->intflags & PREGf_NOSCAN)
439f2a19305Safresh1         Perl_re_printf( aTHX_  " noscan");
440f2a19305Safresh1     if (r->extflags & RXf_CHECK_ALL)
441f2a19305Safresh1         Perl_re_printf( aTHX_  " isall");
442f2a19305Safresh1     if (r->check_substr || r->check_utf8)
443f2a19305Safresh1         Perl_re_printf( aTHX_  ") ");
444f2a19305Safresh1 
445f2a19305Safresh1     if (ri->regstclass) {
446f2a19305Safresh1         regprop(r, sv, ri->regstclass, NULL, NULL);
447f2a19305Safresh1         Perl_re_printf( aTHX_  "stclass %s ", SvPVX_const(sv));
448f2a19305Safresh1     }
449f2a19305Safresh1     if (r->intflags & PREGf_ANCH) {
450f2a19305Safresh1         Perl_re_printf( aTHX_  "anchored");
451f2a19305Safresh1         if (r->intflags & PREGf_ANCH_MBOL)
452f2a19305Safresh1             Perl_re_printf( aTHX_  "(MBOL)");
453f2a19305Safresh1         if (r->intflags & PREGf_ANCH_SBOL)
454f2a19305Safresh1             Perl_re_printf( aTHX_  "(SBOL)");
455f2a19305Safresh1         if (r->intflags & PREGf_ANCH_GPOS)
456f2a19305Safresh1             Perl_re_printf( aTHX_  "(GPOS)");
457f2a19305Safresh1         Perl_re_printf( aTHX_ " ");
458f2a19305Safresh1     }
459f2a19305Safresh1     if (r->intflags & PREGf_GPOS_SEEN)
460f2a19305Safresh1         Perl_re_printf( aTHX_  "GPOS:%" UVuf " ", (UV)r->gofs);
461f2a19305Safresh1     if (r->intflags & PREGf_SKIP)
462f2a19305Safresh1         Perl_re_printf( aTHX_  "plus ");
463f2a19305Safresh1     if (r->intflags & PREGf_IMPLICIT)
464f2a19305Safresh1         Perl_re_printf( aTHX_  "implicit ");
465f2a19305Safresh1     Perl_re_printf( aTHX_  "minlen %" IVdf " ", (IV)r->minlen);
466f2a19305Safresh1     if (r->extflags & RXf_EVAL_SEEN)
467f2a19305Safresh1         Perl_re_printf( aTHX_  "with eval ");
468f2a19305Safresh1     Perl_re_printf( aTHX_  "\n");
469f2a19305Safresh1     DEBUG_FLAGS_r({
470f2a19305Safresh1         regdump_extflags("r->extflags: ", r->extflags);
471f2a19305Safresh1         regdump_intflags("r->intflags: ", r->intflags);
472f2a19305Safresh1     });
473f2a19305Safresh1 #else
474f2a19305Safresh1     PERL_ARGS_ASSERT_REGDUMP;
475f2a19305Safresh1     PERL_UNUSED_CONTEXT;
476f2a19305Safresh1     PERL_UNUSED_ARG(r);
477f2a19305Safresh1 #endif  /* DEBUGGING */
478f2a19305Safresh1 }
479f2a19305Safresh1 
480f2a19305Safresh1 /* Should be synchronized with ANYOF_ #defines in regcomp.h */
481f2a19305Safresh1 #ifdef DEBUGGING
482f2a19305Safresh1 
483f2a19305Safresh1 #  if   CC_WORDCHAR_ != 0 || CC_DIGIT_ != 1        || CC_ALPHA_ != 2    \
484f2a19305Safresh1      || CC_LOWER_ != 3    || CC_UPPER_ != 4        || CC_PUNCT_ != 5    \
485f2a19305Safresh1      || CC_PRINT_ != 6    || CC_ALPHANUMERIC_ != 7 || CC_GRAPH_ != 8    \
486f2a19305Safresh1      || CC_CASED_ != 9    || CC_SPACE_ != 10       || CC_BLANK_ != 11   \
487f2a19305Safresh1      || CC_XDIGIT_ != 12  || CC_CNTRL_ != 13       || CC_ASCII_ != 14   \
488f2a19305Safresh1      || CC_VERTSPACE_ != 15
489f2a19305Safresh1 #   error Need to adjust order of anyofs[]
490f2a19305Safresh1 #  endif
491f2a19305Safresh1 static const char * const anyofs[] = {
492f2a19305Safresh1     "\\w",
493f2a19305Safresh1     "\\W",
494f2a19305Safresh1     "\\d",
495f2a19305Safresh1     "\\D",
496f2a19305Safresh1     "[:alpha:]",
497f2a19305Safresh1     "[:^alpha:]",
498f2a19305Safresh1     "[:lower:]",
499f2a19305Safresh1     "[:^lower:]",
500f2a19305Safresh1     "[:upper:]",
501f2a19305Safresh1     "[:^upper:]",
502f2a19305Safresh1     "[:punct:]",
503f2a19305Safresh1     "[:^punct:]",
504f2a19305Safresh1     "[:print:]",
505f2a19305Safresh1     "[:^print:]",
506f2a19305Safresh1     "[:alnum:]",
507f2a19305Safresh1     "[:^alnum:]",
508f2a19305Safresh1     "[:graph:]",
509f2a19305Safresh1     "[:^graph:]",
510f2a19305Safresh1     "[:cased:]",
511f2a19305Safresh1     "[:^cased:]",
512f2a19305Safresh1     "\\s",
513f2a19305Safresh1     "\\S",
514f2a19305Safresh1     "[:blank:]",
515f2a19305Safresh1     "[:^blank:]",
516f2a19305Safresh1     "[:xdigit:]",
517f2a19305Safresh1     "[:^xdigit:]",
518f2a19305Safresh1     "[:cntrl:]",
519f2a19305Safresh1     "[:^cntrl:]",
520f2a19305Safresh1     "[:ascii:]",
521f2a19305Safresh1     "[:^ascii:]",
522f2a19305Safresh1     "\\v",
523f2a19305Safresh1     "\\V"
524f2a19305Safresh1 };
525f2a19305Safresh1 #endif
526f2a19305Safresh1 
527f2a19305Safresh1 /*
528f2a19305Safresh1 - regprop - printable representation of opcode, with run time support
529f2a19305Safresh1 */
530f2a19305Safresh1 
531f2a19305Safresh1 void
532f2a19305Safresh1 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
533f2a19305Safresh1 {
534f2a19305Safresh1 #ifdef DEBUGGING
535f2a19305Safresh1     U8 k;
536f2a19305Safresh1     const U8 op = OP(o);
537f2a19305Safresh1     RXi_GET_DECL(prog, progi);
538f2a19305Safresh1     DECLARE_AND_GET_RE_DEBUG_FLAGS;
539f2a19305Safresh1 
540f2a19305Safresh1     PERL_ARGS_ASSERT_REGPROP;
541f2a19305Safresh1 
542f2a19305Safresh1     SvPVCLEAR(sv);
543f2a19305Safresh1 
544f2a19305Safresh1     if (op > REGNODE_MAX) {          /* regnode.type is unsigned */
545f2a19305Safresh1         if (pRExC_state) {  /* This gives more info, if we have it */
546f2a19305Safresh1             FAIL3("panic: corrupted regexp opcode %d > %d",
547f2a19305Safresh1                   (int)op, (int)REGNODE_MAX);
548f2a19305Safresh1         }
549f2a19305Safresh1         else {
550f2a19305Safresh1             Perl_croak(aTHX_ "panic: corrupted regexp opcode %d > %d",
551f2a19305Safresh1                              (int)op, (int)REGNODE_MAX);
552f2a19305Safresh1         }
553f2a19305Safresh1     }
554f2a19305Safresh1     sv_catpv(sv, REGNODE_NAME(op)); /* Take off const! */
555f2a19305Safresh1 
556f2a19305Safresh1     k = REGNODE_TYPE(op);
557f2a19305Safresh1     if (op == BRANCH) {
558f2a19305Safresh1         Perl_sv_catpvf(aTHX_ sv, " (buf:%" IVdf "/%" IVdf ")", (IV)ARG1a(o),(IV)ARG1b(o));
559f2a19305Safresh1     }
560f2a19305Safresh1     else if (op == BRANCHJ) {
561f2a19305Safresh1         Perl_sv_catpvf(aTHX_ sv, " (buf:%" IVdf "/%" IVdf ")", (IV)ARG2a(o),(IV)ARG2b(o));
562f2a19305Safresh1     }
563f2a19305Safresh1     else if (k == EXACT) {
564f2a19305Safresh1         sv_catpvs(sv, " ");
565f2a19305Safresh1         /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
566f2a19305Safresh1          * is a crude hack but it may be the best for now since
567f2a19305Safresh1          * we have no flag "this EXACTish node was UTF-8"
568f2a19305Safresh1          * --jhi */
569f2a19305Safresh1         pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len,
570f2a19305Safresh1                   PL_colors[0], PL_colors[1],
571f2a19305Safresh1                   PERL_PV_ESCAPE_UNI_DETECT |
572f2a19305Safresh1                   PERL_PV_ESCAPE_NONASCII   |
573f2a19305Safresh1                   PERL_PV_PRETTY_ELLIPSES   |
574f2a19305Safresh1                   PERL_PV_PRETTY_LTGT       |
575f2a19305Safresh1                   PERL_PV_PRETTY_NOCLEAR
576f2a19305Safresh1                   );
577f2a19305Safresh1     } else if (k == TRIE) {
578f2a19305Safresh1         /* print the details of the trie in dumpuntil instead, as
579f2a19305Safresh1          * progi->data isn't available here */
580f2a19305Safresh1         const U32 n = ARG1u(o);
581f2a19305Safresh1         const reg_ac_data * const ac = IS_TRIE_AC(op) ?
582f2a19305Safresh1                (reg_ac_data *)progi->data->data[n] :
583f2a19305Safresh1                NULL;
584f2a19305Safresh1         const reg_trie_data * const trie
585f2a19305Safresh1             = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
586f2a19305Safresh1 
587f2a19305Safresh1         Perl_sv_catpvf(aTHX_ sv, "-%s", REGNODE_NAME(FLAGS(o)));
588f2a19305Safresh1         DEBUG_TRIE_COMPILE_r({
589f2a19305Safresh1           if (trie->jump)
590f2a19305Safresh1             sv_catpvs(sv, "(JUMP)");
591f2a19305Safresh1           Perl_sv_catpvf(aTHX_ sv,
592f2a19305Safresh1             "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">",
593f2a19305Safresh1             (UV)trie->startstate,
594f2a19305Safresh1             (IV)trie->statecount-1, /* -1 because of the unused 0 element */
595f2a19305Safresh1             (UV)trie->wordcount,
596f2a19305Safresh1             (UV)trie->minlen,
597f2a19305Safresh1             (UV)trie->maxlen,
598f2a19305Safresh1             (UV)TRIE_CHARCOUNT(trie),
599f2a19305Safresh1             (UV)trie->uniquecharcount
600f2a19305Safresh1           );
601f2a19305Safresh1         });
602f2a19305Safresh1         if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
603f2a19305Safresh1             sv_catpvs(sv, "[");
604f2a19305Safresh1             (void) put_charclass_bitmap_innards(sv,
605f2a19305Safresh1                                                 ((IS_ANYOF_TRIE(op))
606f2a19305Safresh1                                                  ? ANYOF_BITMAP(o)
607f2a19305Safresh1                                                  : TRIE_BITMAP(trie)),
608f2a19305Safresh1                                                 NULL,
609f2a19305Safresh1                                                 NULL,
610f2a19305Safresh1                                                 NULL,
611f2a19305Safresh1                                                 0,
612f2a19305Safresh1                                                 FALSE
613f2a19305Safresh1                                                );
614f2a19305Safresh1             sv_catpvs(sv, "]");
615f2a19305Safresh1         }
616f2a19305Safresh1         if (trie->before_paren || trie->after_paren)
617f2a19305Safresh1             Perl_sv_catpvf(aTHX_ sv, " (buf:%" IVdf "/%" IVdf ")",
618f2a19305Safresh1                     (IV)trie->before_paren,(IV)trie->after_paren);
619f2a19305Safresh1     } else if (k == CURLY) {
620f2a19305Safresh1         U32 lo = ARG1i(o), hi = ARG2i(o);
621f2a19305Safresh1         if (ARG3u(o)) /* check both ARG3a and ARG3b at the same time */
622f2a19305Safresh1             Perl_sv_catpvf(aTHX_ sv, "<%d:%d>", ARG3a(o),ARG3b(o)); /* paren before, paren after */
623f2a19305Safresh1         if (op == CURLYM || op == CURLYN || op == CURLYX)
624f2a19305Safresh1             Perl_sv_catpvf(aTHX_ sv, "[%d]", FLAGS(o)); /* Parenth number */
625f2a19305Safresh1         Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
626f2a19305Safresh1         if (hi == REG_INFTY)
627f2a19305Safresh1             sv_catpvs(sv, "INFTY");
628f2a19305Safresh1         else
629f2a19305Safresh1             Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
630f2a19305Safresh1         sv_catpvs(sv, "}");
631f2a19305Safresh1     }
632f2a19305Safresh1     else if (k == WHILEM && FLAGS(o))                   /* Ordinal/of */
633f2a19305Safresh1         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", FLAGS(o) & 0xf, FLAGS(o)>>4);
634f2a19305Safresh1     else if (k == REF || k == OPEN || k == CLOSE
635f2a19305Safresh1              || k == GROUPP || op == ACCEPT)
636f2a19305Safresh1     {
637f2a19305Safresh1         AV *name_list= NULL;
638f2a19305Safresh1         U32 parno= (op == ACCEPT)              ? ARG2u(o) :
639f2a19305Safresh1                    (op == OPEN || op == CLOSE) ? PARNO(o) :
640f2a19305Safresh1                                                  ARG1u(o);
641f2a19305Safresh1         if ( RXp_PAREN_NAMES(prog) ) {
642f2a19305Safresh1             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
643f2a19305Safresh1         } else if ( pRExC_state ) {
644f2a19305Safresh1             name_list= RExC_paren_name_list;
645f2a19305Safresh1         }
646f2a19305Safresh1         if ( name_list ) {
647f2a19305Safresh1             if ( k != REF || (op < REFN)) {
648f2a19305Safresh1                 UV logical_parno = parno;
649f2a19305Safresh1                 if (prog->parno_to_logical)
650f2a19305Safresh1                     logical_parno = prog->parno_to_logical[parno];
651f2a19305Safresh1 
652f2a19305Safresh1                 Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)logical_parno);     /* Parenth number */
653f2a19305Safresh1                 if (parno != logical_parno)
654f2a19305Safresh1                     Perl_sv_catpvf(aTHX_ sv, "/%" UVuf, (UV)parno);        /* Parenth number */
655f2a19305Safresh1 
656f2a19305Safresh1                 SV **name= av_fetch_simple(name_list, parno, 0 );
657f2a19305Safresh1                 if (name)
658f2a19305Safresh1                     Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
659f2a19305Safresh1             }
660f2a19305Safresh1             else
661f2a19305Safresh1             if (parno > 0) {
662f2a19305Safresh1                 /* parno must always be larger than 0 for this block
663f2a19305Safresh1                  * as it represents a slot into the data array, which
664f2a19305Safresh1                  * has the 0 slot reserved for a placeholder so any valid
665f2a19305Safresh1                  * index into it is always true, eg non-zero
666f2a19305Safresh1                  * see the '%' "what" type and the implementation of
667f2a19305Safresh1                  * S_reg_add_data()
668f2a19305Safresh1                  */
669f2a19305Safresh1                 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
670f2a19305Safresh1                 I32 *nums=(I32*)SvPVX(sv_dat);
671f2a19305Safresh1                 SV **name= av_fetch_simple(name_list, nums[0], 0 );
672f2a19305Safresh1                 I32 n;
673f2a19305Safresh1                 if (name) {
674f2a19305Safresh1                     for ( n=0; n<SvIVX(sv_dat); n++ ) {
675f2a19305Safresh1                         Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf,
676f2a19305Safresh1                                     (n ? "," : ""), (IV)nums[n]);
677f2a19305Safresh1                     }
678f2a19305Safresh1                     Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
679f2a19305Safresh1                 }
680f2a19305Safresh1             }
681f2a19305Safresh1         } else if (parno>0) {
682f2a19305Safresh1             UV logical_parno = parno;
683f2a19305Safresh1             if (prog->parno_to_logical)
684f2a19305Safresh1                 logical_parno = prog->parno_to_logical[parno];
685f2a19305Safresh1 
686f2a19305Safresh1             Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)logical_parno);     /* Parenth number */
687f2a19305Safresh1             if (logical_parno != parno)
688f2a19305Safresh1                 Perl_sv_catpvf(aTHX_ sv, "/%" UVuf, (UV)parno);     /* Parenth number */
689f2a19305Safresh1 
690f2a19305Safresh1         }
691f2a19305Safresh1         if ( k == REF ) {
692f2a19305Safresh1             Perl_sv_catpvf(aTHX_ sv, " <%" IVdf ">", (IV)ARG2i(o));
693f2a19305Safresh1         }
694f2a19305Safresh1         if ( k == REF && reginfo) {
695f2a19305Safresh1             U32 n = ARG1u(o);  /* which paren pair */
696f2a19305Safresh1             I32 ln = RXp_OFFS_START(prog,n);
697f2a19305Safresh1             if (RXp_LASTPAREN(prog) < n || ln == -1 || RXp_OFFS_END(prog,n) == -1)
698f2a19305Safresh1                 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
699f2a19305Safresh1             else if (ln == RXp_OFFS_END(prog,n))
700f2a19305Safresh1                 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
701f2a19305Safresh1             else {
702f2a19305Safresh1                 const char *s = reginfo->strbeg + ln;
703f2a19305Safresh1                 Perl_sv_catpvf(aTHX_ sv, ": ");
704f2a19305Safresh1                 Perl_pv_pretty( aTHX_ sv, s, RXp_OFFS_END(prog,n) - RXp_OFFS_START(prog,n), 32, 0, 0,
705f2a19305Safresh1                     PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
706f2a19305Safresh1             }
707f2a19305Safresh1         }
708f2a19305Safresh1     } else if (k == GOSUB) {
709f2a19305Safresh1         AV *name_list= NULL;
710f2a19305Safresh1         IV parno = ARG1u(o);
711f2a19305Safresh1         IV logical_parno = (parno && prog->parno_to_logical)
712f2a19305Safresh1                          ? prog->parno_to_logical[parno]
713f2a19305Safresh1                          : parno;
714f2a19305Safresh1         if ( RXp_PAREN_NAMES(prog) ) {
715f2a19305Safresh1             name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
716f2a19305Safresh1         } else if ( pRExC_state ) {
717f2a19305Safresh1             name_list= RExC_paren_name_list;
718f2a19305Safresh1         }
719f2a19305Safresh1 
720f2a19305Safresh1         /* Paren and offset */
721f2a19305Safresh1         Perl_sv_catpvf(aTHX_ sv, "%" IVdf, logical_parno);
722f2a19305Safresh1         if (logical_parno != parno)
723f2a19305Safresh1             Perl_sv_catpvf(aTHX_ sv, "/%" IVdf, parno);
724f2a19305Safresh1 
725f2a19305Safresh1         Perl_sv_catpvf(aTHX_ sv, "[%+d:%d]", (int)ARG2i(o),
726f2a19305Safresh1                 (int)((o + (int)ARG2i(o)) - progi->program) );
727f2a19305Safresh1         if (name_list) {
728f2a19305Safresh1             SV **name= av_fetch_simple(name_list, ARG1u(o), 0 );
729f2a19305Safresh1             if (name)
730f2a19305Safresh1                 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
731f2a19305Safresh1         }
732f2a19305Safresh1     }
733f2a19305Safresh1     else if (k == LOGICAL)
734f2a19305Safresh1         /* 2: embedded, otherwise 1 */
735f2a19305Safresh1         Perl_sv_catpvf(aTHX_ sv, "[%d]", FLAGS(o));
736f2a19305Safresh1     else if (k == ANYOF || k == ANYOFH || k == ANYOFR) {
737f2a19305Safresh1         U8 flags;
738f2a19305Safresh1         char * bitmap;
739f2a19305Safresh1         U8 do_sep = 0;    /* Do we need to separate various components of the
740f2a19305Safresh1                              output? */
741f2a19305Safresh1         /* Set if there is still an unresolved user-defined property */
742f2a19305Safresh1         SV *unresolved                = NULL;
743f2a19305Safresh1 
744f2a19305Safresh1         /* Things that are ignored except when the runtime locale is UTF-8 */
745f2a19305Safresh1         SV *only_utf8_locale_invlist = NULL;
746f2a19305Safresh1 
747f2a19305Safresh1         /* Code points that don't fit in the bitmap */
748f2a19305Safresh1         SV *nonbitmap_invlist = NULL;
749f2a19305Safresh1 
750f2a19305Safresh1         /* And things that aren't in the bitmap, but are small enough to be */
751f2a19305Safresh1         SV* bitmap_range_not_in_bitmap = NULL;
752f2a19305Safresh1 
753f2a19305Safresh1         bool inverted;
754f2a19305Safresh1 
755f2a19305Safresh1         if (k != ANYOF) {
756f2a19305Safresh1             flags = 0;
757f2a19305Safresh1             bitmap = NULL;
758f2a19305Safresh1         }
759f2a19305Safresh1         else {
760f2a19305Safresh1             flags = ANYOF_FLAGS(o);
761f2a19305Safresh1             bitmap = ANYOF_BITMAP(o);
762f2a19305Safresh1         }
763f2a19305Safresh1 
764f2a19305Safresh1         if (op == ANYOFL || op == ANYOFPOSIXL) {
765f2a19305Safresh1             if ((flags & ANYOFL_UTF8_LOCALE_REQD)) {
766f2a19305Safresh1                 sv_catpvs(sv, "{utf8-locale-reqd}");
767f2a19305Safresh1             }
768f2a19305Safresh1             if (flags & ANYOFL_FOLD) {
769f2a19305Safresh1                 sv_catpvs(sv, "{i}");
770f2a19305Safresh1             }
771f2a19305Safresh1         }
772f2a19305Safresh1 
773f2a19305Safresh1         inverted = flags & ANYOF_INVERT;
774f2a19305Safresh1 
775f2a19305Safresh1         /* If there is stuff outside the bitmap, get it */
776f2a19305Safresh1         if (k == ANYOFR) {
777f2a19305Safresh1 
778f2a19305Safresh1             /* For a single range, split into the parts inside vs outside the
779f2a19305Safresh1              * bitmap. */
780f2a19305Safresh1             UV start = ANYOFRbase(o);
781f2a19305Safresh1             UV end   = ANYOFRbase(o) + ANYOFRdelta(o);
782f2a19305Safresh1 
783f2a19305Safresh1             if (start < NUM_ANYOF_CODE_POINTS) {
784f2a19305Safresh1                 if (end < NUM_ANYOF_CODE_POINTS) {
785f2a19305Safresh1                     bitmap_range_not_in_bitmap
786f2a19305Safresh1                           = _add_range_to_invlist(bitmap_range_not_in_bitmap,
787f2a19305Safresh1                                                   start, end);
788f2a19305Safresh1                 }
789f2a19305Safresh1                 else {
790f2a19305Safresh1                     bitmap_range_not_in_bitmap
791f2a19305Safresh1                           = _add_range_to_invlist(bitmap_range_not_in_bitmap,
792f2a19305Safresh1                                                   start, NUM_ANYOF_CODE_POINTS);
793f2a19305Safresh1                     start = NUM_ANYOF_CODE_POINTS;
794f2a19305Safresh1                 }
795f2a19305Safresh1             }
796f2a19305Safresh1 
797f2a19305Safresh1             if (start >= NUM_ANYOF_CODE_POINTS) {
798f2a19305Safresh1                 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
799f2a19305Safresh1                                                 ANYOFRbase(o),
800f2a19305Safresh1                                                 ANYOFRbase(o) + ANYOFRdelta(o));
801f2a19305Safresh1             }
802f2a19305Safresh1         }
803f2a19305Safresh1         else if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(o)) {
804f2a19305Safresh1             nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
805f2a19305Safresh1                                                       NUM_ANYOF_CODE_POINTS,
806f2a19305Safresh1                                                       UV_MAX);
807f2a19305Safresh1         }
808f2a19305Safresh1         else if (ANYOF_HAS_AUX(o)) {
809f2a19305Safresh1                 (void) GET_REGCLASS_AUX_DATA(prog, o, FALSE,
810f2a19305Safresh1                                                 &unresolved,
811f2a19305Safresh1                                                 &only_utf8_locale_invlist,
812f2a19305Safresh1                                                 &nonbitmap_invlist);
813f2a19305Safresh1 
814f2a19305Safresh1             /* The aux data may contain stuff that could fit in the bitmap.
815f2a19305Safresh1              * This could come from a user-defined property being finally
816f2a19305Safresh1              * resolved when this call was done; or much more likely because
817f2a19305Safresh1              * there are matches that require UTF-8 to be valid, and so aren't
818f2a19305Safresh1              * in the bitmap (or ANYOFR).  This is teased apart later */
819f2a19305Safresh1             _invlist_intersection(nonbitmap_invlist,
820f2a19305Safresh1                                   PL_InBitmap,
821f2a19305Safresh1                                   &bitmap_range_not_in_bitmap);
822f2a19305Safresh1             /* Leave just the things that don't fit into the bitmap */
823f2a19305Safresh1             _invlist_subtract(nonbitmap_invlist,
824f2a19305Safresh1                               PL_InBitmap,
825f2a19305Safresh1                               &nonbitmap_invlist);
826f2a19305Safresh1         }
827f2a19305Safresh1 
828f2a19305Safresh1         /* Ready to start outputting.  First, the initial left bracket */
829f2a19305Safresh1         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
830f2a19305Safresh1 
831f2a19305Safresh1         if (   bitmap
832f2a19305Safresh1             || bitmap_range_not_in_bitmap
833f2a19305Safresh1             || only_utf8_locale_invlist
834f2a19305Safresh1             || unresolved)
835f2a19305Safresh1         {
836f2a19305Safresh1             /* Then all the things that could fit in the bitmap */
837f2a19305Safresh1             do_sep = put_charclass_bitmap_innards(
838f2a19305Safresh1                                     sv,
839f2a19305Safresh1                                     bitmap,
840f2a19305Safresh1                                     bitmap_range_not_in_bitmap,
841f2a19305Safresh1                                     only_utf8_locale_invlist,
842f2a19305Safresh1                                     o,
843f2a19305Safresh1                                     flags,
844f2a19305Safresh1 
845f2a19305Safresh1                                     /* Can't try inverting for a
846f2a19305Safresh1                                                    * better display if there
847f2a19305Safresh1                                                    * are things that haven't
848f2a19305Safresh1                                                    * been resolved */
849f2a19305Safresh1                                     (unresolved != NULL || k == ANYOFR));
850f2a19305Safresh1             SvREFCNT_dec(bitmap_range_not_in_bitmap);
851f2a19305Safresh1 
852f2a19305Safresh1             /* If there are user-defined properties which haven't been defined
853f2a19305Safresh1              * yet, output them.  If the result is not to be inverted, it is
854f2a19305Safresh1              * clearest to output them in a separate [] from the bitmap range
855f2a19305Safresh1              * stuff.  If the result is to be complemented, we have to show
856f2a19305Safresh1              * everything in one [], as the inversion applies to the whole
857f2a19305Safresh1              * thing.  Use {braces} to separate them from anything in the
858f2a19305Safresh1              * bitmap and anything above the bitmap. */
859f2a19305Safresh1             if (unresolved) {
860f2a19305Safresh1                 if (inverted) {
861f2a19305Safresh1                     if (! do_sep) { /* If didn't output anything in the bitmap
862f2a19305Safresh1                                      */
863f2a19305Safresh1                         sv_catpvs(sv, "^");
864f2a19305Safresh1                     }
865f2a19305Safresh1                     sv_catpvs(sv, "{");
866f2a19305Safresh1                 }
867f2a19305Safresh1                 else if (do_sep) {
868f2a19305Safresh1                     Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1],
869f2a19305Safresh1                                                       PL_colors[0]);
870f2a19305Safresh1                 }
871f2a19305Safresh1                 sv_catsv(sv, unresolved);
872f2a19305Safresh1                 if (inverted) {
873f2a19305Safresh1                     sv_catpvs(sv, "}");
874f2a19305Safresh1                 }
875f2a19305Safresh1                 do_sep = ! inverted;
876f2a19305Safresh1             }
877f2a19305Safresh1             else if (     do_sep == 2
878f2a19305Safresh1                      && ! nonbitmap_invlist
879f2a19305Safresh1                      &&   ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(o))
880f2a19305Safresh1             {
881f2a19305Safresh1                 /* Here, the display shows the class as inverted, and
882f2a19305Safresh1                  * everything above the lower display should also match, but
883f2a19305Safresh1                  * there is no indication of that.  Add this range so the code
884f2a19305Safresh1                  * below will add it to the display */
885f2a19305Safresh1                 _invlist_union_complement_2nd(nonbitmap_invlist,
886f2a19305Safresh1                                               PL_InBitmap,
887f2a19305Safresh1                                               &nonbitmap_invlist);
888f2a19305Safresh1             }
889f2a19305Safresh1         }
890f2a19305Safresh1 
891f2a19305Safresh1         /* And, finally, add the above-the-bitmap stuff */
892f2a19305Safresh1         if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
893f2a19305Safresh1             SV* contents;
894f2a19305Safresh1 
895f2a19305Safresh1             /* See if truncation size is overridden */
896f2a19305Safresh1             const STRLEN dump_len = (PL_dump_re_max_len > 256)
897f2a19305Safresh1                                     ? PL_dump_re_max_len
898f2a19305Safresh1                                     : 256;
899f2a19305Safresh1 
900f2a19305Safresh1             /* This is output in a separate [] */
901f2a19305Safresh1             if (do_sep) {
902f2a19305Safresh1                 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
903f2a19305Safresh1             }
904f2a19305Safresh1 
905f2a19305Safresh1             /* And, for easy of understanding, it is shown in the
906f2a19305Safresh1              * uncomplemented form if possible.  The one exception being if
907f2a19305Safresh1              * there are unresolved items, where the inversion has to be
908f2a19305Safresh1              * delayed until runtime */
909f2a19305Safresh1             if (inverted && ! unresolved) {
910f2a19305Safresh1                 _invlist_invert(nonbitmap_invlist);
911f2a19305Safresh1                 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
912f2a19305Safresh1             }
913f2a19305Safresh1 
914f2a19305Safresh1             contents = invlist_contents(nonbitmap_invlist,
915f2a19305Safresh1                                         FALSE /* output suitable for catsv */
916f2a19305Safresh1                                        );
917f2a19305Safresh1 
918f2a19305Safresh1             /* If the output is shorter than the permissible maximum, just do it. */
919f2a19305Safresh1             if (SvCUR(contents) <= dump_len) {
920f2a19305Safresh1                 sv_catsv(sv, contents);
921f2a19305Safresh1             }
922f2a19305Safresh1             else {
923f2a19305Safresh1                 const char * contents_string = SvPVX(contents);
924f2a19305Safresh1                 STRLEN i = dump_len;
925f2a19305Safresh1 
926f2a19305Safresh1                 /* Otherwise, start at the permissible max and work back to the
927f2a19305Safresh1                  * first break possibility */
928f2a19305Safresh1                 while (i > 0 && contents_string[i] != ' ') {
929f2a19305Safresh1                     i--;
930f2a19305Safresh1                 }
931f2a19305Safresh1                 if (i == 0) {       /* Fail-safe.  Use the max if we couldn't
932f2a19305Safresh1                                        find a legal break */
933f2a19305Safresh1                     i = dump_len;
934f2a19305Safresh1                 }
935f2a19305Safresh1 
936f2a19305Safresh1                 sv_catpvn(sv, contents_string, i);
937f2a19305Safresh1                 sv_catpvs(sv, "...");
938f2a19305Safresh1             }
939f2a19305Safresh1 
940f2a19305Safresh1             SvREFCNT_dec_NN(contents);
941f2a19305Safresh1             SvREFCNT_dec_NN(nonbitmap_invlist);
942f2a19305Safresh1         }
943f2a19305Safresh1 
944f2a19305Safresh1         /* And finally the matching, closing ']' */
945f2a19305Safresh1         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
946f2a19305Safresh1 
947f2a19305Safresh1         if (op == ANYOFHs) {
948f2a19305Safresh1             Perl_sv_catpvf(aTHX_ sv, " (Leading UTF-8 bytes=%s", _byte_dump_string((U8 *) ((struct regnode_anyofhs *) o)->string, FLAGS(o), 1));
949f2a19305Safresh1         }
950f2a19305Safresh1         else if (REGNODE_TYPE(op) != ANYOF) {
951f2a19305Safresh1             U8 lowest = (op != ANYOFHr)
952f2a19305Safresh1                          ? FLAGS(o)
953f2a19305Safresh1                          : LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
954f2a19305Safresh1             U8 highest = (op == ANYOFHr)
955f2a19305Safresh1                          ? HIGHEST_ANYOF_HRx_BYTE(FLAGS(o))
956f2a19305Safresh1                          : (op == ANYOFH || op == ANYOFR)
957f2a19305Safresh1                            ? 0xFF
958f2a19305Safresh1                            : lowest;
959f2a19305Safresh1 #ifndef EBCDIC
960f2a19305Safresh1             if (op != ANYOFR || ! isASCII(ANYOFRbase(o) + ANYOFRdelta(o)))
961f2a19305Safresh1 #endif
962f2a19305Safresh1             {
963f2a19305Safresh1                 Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
964f2a19305Safresh1                 if (lowest != highest) {
965f2a19305Safresh1                     Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
966f2a19305Safresh1                 }
967f2a19305Safresh1                 Perl_sv_catpvf(aTHX_ sv, ")");
968f2a19305Safresh1             }
969f2a19305Safresh1         }
970f2a19305Safresh1 
971f2a19305Safresh1         SvREFCNT_dec(unresolved);
972f2a19305Safresh1     }
973f2a19305Safresh1     else if (k == ANYOFM) {
974f2a19305Safresh1         SV * cp_list = get_ANYOFM_contents(o);
975f2a19305Safresh1 
976f2a19305Safresh1         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
977f2a19305Safresh1         if (op == NANYOFM) {
978f2a19305Safresh1             _invlist_invert(cp_list);
979f2a19305Safresh1         }
980f2a19305Safresh1 
981f2a19305Safresh1         put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, 0, TRUE);
982f2a19305Safresh1         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
983f2a19305Safresh1 
984f2a19305Safresh1         SvREFCNT_dec(cp_list);
985f2a19305Safresh1     }
986f2a19305Safresh1     else if (k == ANYOFHbbm) {
987f2a19305Safresh1         SV * cp_list = get_ANYOFHbbm_contents(o);
988f2a19305Safresh1         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
989f2a19305Safresh1 
990f2a19305Safresh1         sv_catsv(sv, invlist_contents(cp_list,
991f2a19305Safresh1                                       FALSE /* output suitable for catsv */
992f2a19305Safresh1                                      ));
993f2a19305Safresh1         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
994f2a19305Safresh1 
995f2a19305Safresh1         SvREFCNT_dec(cp_list);
996f2a19305Safresh1     }
997f2a19305Safresh1     else if (k == POSIXD || k == NPOSIXD) {
998f2a19305Safresh1         U8 index = FLAGS(o) * 2;
999f2a19305Safresh1         if (index < C_ARRAY_LENGTH(anyofs)) {
1000f2a19305Safresh1             if (*anyofs[index] != '[')  {
1001f2a19305Safresh1                 sv_catpvs(sv, "[");
1002f2a19305Safresh1             }
1003f2a19305Safresh1             sv_catpv(sv, anyofs[index]);
1004f2a19305Safresh1             if (*anyofs[index] != '[')  {
1005f2a19305Safresh1                 sv_catpvs(sv, "]");
1006f2a19305Safresh1             }
1007f2a19305Safresh1         }
1008f2a19305Safresh1         else {
1009f2a19305Safresh1             Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
1010f2a19305Safresh1         }
1011f2a19305Safresh1     }
1012f2a19305Safresh1     else if (k == BOUND || k == NBOUND) {
1013f2a19305Safresh1         /* Must be synced with order of 'bound_type' in regcomp.h */
1014f2a19305Safresh1         const char * const bounds[] = {
1015f2a19305Safresh1             "",      /* Traditional */
1016f2a19305Safresh1             "{gcb}",
1017f2a19305Safresh1             "{lb}",
1018f2a19305Safresh1             "{sb}",
1019f2a19305Safresh1             "{wb}"
1020f2a19305Safresh1         };
1021f2a19305Safresh1         assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
1022f2a19305Safresh1         sv_catpv(sv, bounds[FLAGS(o)]);
1023f2a19305Safresh1     }
1024f2a19305Safresh1     else if (k == BRANCHJ && (op == UNLESSM || op == IFMATCH)) {
1025f2a19305Safresh1         Perl_sv_catpvf(aTHX_ sv, "[%d", -(FLAGS(o)));
1026f2a19305Safresh1         if (NEXT_OFF(o)) {
1027f2a19305Safresh1             Perl_sv_catpvf(aTHX_ sv, "..-%d", FLAGS(o) - NEXT_OFF(o));
1028f2a19305Safresh1         }
1029f2a19305Safresh1         Perl_sv_catpvf(aTHX_ sv, "]");
1030f2a19305Safresh1     }
1031f2a19305Safresh1     else if (op == SBOL)
1032f2a19305Safresh1         Perl_sv_catpvf(aTHX_ sv, " /%s/", FLAGS(o) ? "\\A" : "^");
1033f2a19305Safresh1     else if (op == EVAL) {
1034f2a19305Safresh1         if (FLAGS(o) & EVAL_OPTIMISTIC_FLAG)
1035f2a19305Safresh1             Perl_sv_catpvf(aTHX_ sv, " optimistic");
1036f2a19305Safresh1     }
1037f2a19305Safresh1 
1038f2a19305Safresh1     /* add on the verb argument if there is one */
1039f2a19305Safresh1     if ( ( k == VERB || op == ACCEPT || op == OPFAIL ) && FLAGS(o)) {
1040f2a19305Safresh1         if ( ARG1u(o) )
1041f2a19305Safresh1             Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
1042f2a19305Safresh1                        SVfARG((MUTABLE_SV(progi->data->data[ ARG1u( o ) ]))));
1043f2a19305Safresh1         else
1044f2a19305Safresh1             sv_catpvs(sv, ":NULL");
1045f2a19305Safresh1     }
1046f2a19305Safresh1 #else
1047f2a19305Safresh1     PERL_UNUSED_CONTEXT;
1048f2a19305Safresh1     PERL_UNUSED_ARG(sv);
1049f2a19305Safresh1     PERL_UNUSED_ARG(o);
1050f2a19305Safresh1     PERL_UNUSED_ARG(prog);
1051f2a19305Safresh1     PERL_UNUSED_ARG(reginfo);
1052f2a19305Safresh1     PERL_UNUSED_ARG(pRExC_state);
1053f2a19305Safresh1 #endif  /* DEBUGGING */
1054f2a19305Safresh1 }
1055f2a19305Safresh1 
1056f2a19305Safresh1 
1057*5486feefSafresh1 #ifdef DEBUGGING
1058f2a19305Safresh1 STATIC void
1059f2a19305Safresh1 S_put_code_point(pTHX_ SV *sv, UV c)
1060f2a19305Safresh1 {
1061f2a19305Safresh1     PERL_ARGS_ASSERT_PUT_CODE_POINT;
1062f2a19305Safresh1 
1063f2a19305Safresh1     if (c > 255) {
1064f2a19305Safresh1         Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c);
1065f2a19305Safresh1     }
1066f2a19305Safresh1     else if (isPRINT(c)) {
1067f2a19305Safresh1         const char string = (char) c;
1068f2a19305Safresh1 
1069f2a19305Safresh1         /* We use {phrase} as metanotation in the class, so also escape literal
1070f2a19305Safresh1          * braces */
1071f2a19305Safresh1         if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
1072f2a19305Safresh1             sv_catpvs(sv, "\\");
1073f2a19305Safresh1         sv_catpvn(sv, &string, 1);
1074f2a19305Safresh1     }
1075f2a19305Safresh1     else if (isMNEMONIC_CNTRL(c)) {
1076f2a19305Safresh1         Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
1077f2a19305Safresh1     }
1078f2a19305Safresh1     else {
1079f2a19305Safresh1         Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
1080f2a19305Safresh1     }
1081f2a19305Safresh1 }
1082f2a19305Safresh1 
1083f2a19305Safresh1 STATIC void
1084f2a19305Safresh1 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
1085f2a19305Safresh1 {
1086f2a19305Safresh1     /* Appends to 'sv' a displayable version of the range of code points from
1087f2a19305Safresh1      * 'start' to 'end'.  Mnemonics (like '\r') are used for the few controls
1088f2a19305Safresh1      * that have them, when they occur at the beginning or end of the range.
1089f2a19305Safresh1      * It uses hex to output the remaining code points, unless 'allow_literals'
1090f2a19305Safresh1      * is true, in which case the printable ASCII ones are output as-is (though
1091f2a19305Safresh1      * some of these will be escaped by put_code_point()).
1092f2a19305Safresh1      *
1093f2a19305Safresh1      * NOTE:  This is designed only for printing ranges of code points that fit
1094f2a19305Safresh1      *        inside an ANYOF bitmap.  Higher code points are simply suppressed
1095f2a19305Safresh1      */
1096f2a19305Safresh1 
1097f2a19305Safresh1     const unsigned int min_range_count = 3;
1098f2a19305Safresh1 
1099f2a19305Safresh1     assert(start <= end);
1100f2a19305Safresh1 
1101f2a19305Safresh1     PERL_ARGS_ASSERT_PUT_RANGE;
1102f2a19305Safresh1 
1103f2a19305Safresh1     while (start <= end) {
1104f2a19305Safresh1         UV this_end;
1105f2a19305Safresh1         const char * format;
1106f2a19305Safresh1 
1107f2a19305Safresh1         if (    end - start < min_range_count
1108f2a19305Safresh1             && (end - start <= 2 || (isPRINT_A(start) && isPRINT_A(end))))
1109f2a19305Safresh1         {
1110f2a19305Safresh1             /* Output a range of 1 or 2 chars individually, or longer ranges
1111f2a19305Safresh1              * when printable */
1112f2a19305Safresh1             for (; start <= end; start++) {
1113f2a19305Safresh1                 put_code_point(sv, start);
1114f2a19305Safresh1             }
1115f2a19305Safresh1             break;
1116f2a19305Safresh1         }
1117f2a19305Safresh1 
1118f2a19305Safresh1         /* If permitted by the input options, and there is a possibility that
1119f2a19305Safresh1          * this range contains a printable literal, look to see if there is
1120f2a19305Safresh1          * one. */
1121f2a19305Safresh1         if (allow_literals && start <= MAX_PRINT_A) {
1122f2a19305Safresh1 
1123f2a19305Safresh1             /* If the character at the beginning of the range isn't an ASCII
1124f2a19305Safresh1              * printable, effectively split the range into two parts:
1125f2a19305Safresh1              *  1) the portion before the first such printable,
1126f2a19305Safresh1              *  2) the rest
1127f2a19305Safresh1              * and output them separately. */
1128f2a19305Safresh1             if (! isPRINT_A(start)) {
1129f2a19305Safresh1                 UV temp_end = start + 1;
1130f2a19305Safresh1 
1131f2a19305Safresh1                 /* There is no point looking beyond the final possible
1132f2a19305Safresh1                  * printable, in MAX_PRINT_A */
1133f2a19305Safresh1                 UV max = MIN(end, MAX_PRINT_A);
1134f2a19305Safresh1 
1135f2a19305Safresh1                 while (temp_end <= max && ! isPRINT_A(temp_end)) {
1136f2a19305Safresh1                     temp_end++;
1137f2a19305Safresh1                 }
1138f2a19305Safresh1 
1139f2a19305Safresh1                 /* Here, temp_end points to one beyond the first printable if
1140f2a19305Safresh1                  * found, or to one beyond 'max' if not.  If none found, make
1141f2a19305Safresh1                  * sure that we use the entire range */
1142f2a19305Safresh1                 if (temp_end > MAX_PRINT_A) {
1143f2a19305Safresh1                     temp_end = end + 1;
1144f2a19305Safresh1                 }
1145f2a19305Safresh1 
1146f2a19305Safresh1                 /* Output the first part of the split range: the part that
1147f2a19305Safresh1                  * doesn't have printables, with the parameter set to not look
1148f2a19305Safresh1                  * for literals (otherwise we would infinitely recurse) */
1149f2a19305Safresh1                 put_range(sv, start, temp_end - 1, FALSE);
1150f2a19305Safresh1 
1151f2a19305Safresh1                 /* The 2nd part of the range (if any) starts here. */
1152f2a19305Safresh1                 start = temp_end;
1153f2a19305Safresh1 
1154f2a19305Safresh1                 /* We do a continue, instead of dropping down, because even if
1155f2a19305Safresh1                  * the 2nd part is non-empty, it could be so short that we want
1156f2a19305Safresh1                  * to output it as individual characters, as tested for at the
1157f2a19305Safresh1                  * top of this loop.  */
1158f2a19305Safresh1                 continue;
1159f2a19305Safresh1             }
1160f2a19305Safresh1 
1161f2a19305Safresh1             /* Here, 'start' is a printable ASCII.  If it is an alphanumeric,
1162f2a19305Safresh1              * output a sub-range of just the digits or letters, then process
1163f2a19305Safresh1              * the remaining portion as usual. */
1164f2a19305Safresh1             if (isALPHANUMERIC_A(start)) {
1165f2a19305Safresh1                 UV mask = (isDIGIT_A(start))
1166f2a19305Safresh1                            ? CC_DIGIT_
1167f2a19305Safresh1                              : isUPPER_A(start)
1168f2a19305Safresh1                                ? CC_UPPER_
1169f2a19305Safresh1                                : CC_LOWER_;
1170f2a19305Safresh1                 UV temp_end = start + 1;
1171f2a19305Safresh1 
1172f2a19305Safresh1                 /* Find the end of the sub-range that includes just the
1173f2a19305Safresh1                  * characters in the same class as the first character in it */
1174f2a19305Safresh1                 while (temp_end <= end && generic_isCC_A_(temp_end, mask)) {
1175f2a19305Safresh1                     temp_end++;
1176f2a19305Safresh1                 }
1177f2a19305Safresh1                 temp_end--;
1178f2a19305Safresh1 
1179f2a19305Safresh1                 /* For short ranges, don't duplicate the code above to output
1180f2a19305Safresh1                  * them; just call recursively */
1181f2a19305Safresh1                 if (temp_end - start < min_range_count) {
1182f2a19305Safresh1                     put_range(sv, start, temp_end, FALSE);
1183f2a19305Safresh1                 }
1184f2a19305Safresh1                 else {  /* Output as a range */
1185f2a19305Safresh1                     put_code_point(sv, start);
1186f2a19305Safresh1                     sv_catpvs(sv, "-");
1187f2a19305Safresh1                     put_code_point(sv, temp_end);
1188f2a19305Safresh1                 }
1189f2a19305Safresh1                 start = temp_end + 1;
1190f2a19305Safresh1                 continue;
1191f2a19305Safresh1             }
1192f2a19305Safresh1 
1193f2a19305Safresh1             /* We output any other printables as individual characters */
1194f2a19305Safresh1             if (isPUNCT_A(start) || isSPACE_A(start)) {
1195f2a19305Safresh1                 while (start <= end && (isPUNCT_A(start)
1196f2a19305Safresh1                                         || isSPACE_A(start)))
1197f2a19305Safresh1                 {
1198f2a19305Safresh1                     put_code_point(sv, start);
1199f2a19305Safresh1                     start++;
1200f2a19305Safresh1                 }
1201f2a19305Safresh1                 continue;
1202f2a19305Safresh1             }
1203f2a19305Safresh1         } /* End of looking for literals */
1204f2a19305Safresh1 
1205f2a19305Safresh1         /* Here is not to output as a literal.  Some control characters have
1206f2a19305Safresh1          * mnemonic names.  Split off any of those at the beginning and end of
1207f2a19305Safresh1          * the range to print mnemonically.  It isn't possible for many of
1208f2a19305Safresh1          * these to be in a row, so this won't overwhelm with output */
1209f2a19305Safresh1         if (   start <= end
1210f2a19305Safresh1             && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
1211f2a19305Safresh1         {
1212f2a19305Safresh1             while (isMNEMONIC_CNTRL(start) && start <= end) {
1213f2a19305Safresh1                 put_code_point(sv, start);
1214f2a19305Safresh1                 start++;
1215f2a19305Safresh1             }
1216f2a19305Safresh1 
1217f2a19305Safresh1             /* If this didn't take care of the whole range ... */
1218f2a19305Safresh1             if (start <= end) {
1219f2a19305Safresh1 
1220f2a19305Safresh1                 /* Look backwards from the end to find the final non-mnemonic
1221f2a19305Safresh1                  * */
1222f2a19305Safresh1                 UV temp_end = end;
1223f2a19305Safresh1                 while (isMNEMONIC_CNTRL(temp_end)) {
1224f2a19305Safresh1                     temp_end--;
1225f2a19305Safresh1                 }
1226f2a19305Safresh1 
1227f2a19305Safresh1                 /* And separately output the interior range that doesn't start
1228f2a19305Safresh1                  * or end with mnemonics */
1229f2a19305Safresh1                 put_range(sv, start, temp_end, FALSE);
1230f2a19305Safresh1 
1231f2a19305Safresh1                 /* Then output the mnemonic trailing controls */
1232f2a19305Safresh1                 start = temp_end + 1;
1233f2a19305Safresh1                 while (start <= end) {
1234f2a19305Safresh1                     put_code_point(sv, start);
1235f2a19305Safresh1                     start++;
1236f2a19305Safresh1                 }
1237f2a19305Safresh1                 break;
1238f2a19305Safresh1             }
1239f2a19305Safresh1         }
1240f2a19305Safresh1 
1241f2a19305Safresh1         /* As a final resort, output the range or subrange as hex. */
1242f2a19305Safresh1 
1243f2a19305Safresh1         if (start >= NUM_ANYOF_CODE_POINTS) {
1244f2a19305Safresh1             this_end = end;
1245f2a19305Safresh1         }
1246f2a19305Safresh1         else {  /* Have to split range at the bitmap boundary */
1247f2a19305Safresh1             this_end = (end < NUM_ANYOF_CODE_POINTS)
1248f2a19305Safresh1                         ? end
1249f2a19305Safresh1                         : NUM_ANYOF_CODE_POINTS - 1;
1250f2a19305Safresh1         }
1251f2a19305Safresh1 #if NUM_ANYOF_CODE_POINTS > 256
1252f2a19305Safresh1         format = (this_end < 256)
1253f2a19305Safresh1                  ? "\\x%02" UVXf "-\\x%02" UVXf
1254f2a19305Safresh1                  : "\\x{%04" UVXf "}-\\x{%04" UVXf "}";
1255f2a19305Safresh1 #else
1256f2a19305Safresh1         format = "\\x%02" UVXf "-\\x%02" UVXf;
1257f2a19305Safresh1 #endif
1258f2a19305Safresh1         GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
1259f2a19305Safresh1         Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
1260f2a19305Safresh1         GCC_DIAG_RESTORE_STMT;
1261f2a19305Safresh1         break;
1262f2a19305Safresh1     }
1263f2a19305Safresh1 }
1264f2a19305Safresh1 
1265f2a19305Safresh1 STATIC void
1266f2a19305Safresh1 S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
1267f2a19305Safresh1 {
1268f2a19305Safresh1     /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
1269f2a19305Safresh1      * 'invlist' */
1270f2a19305Safresh1 
1271f2a19305Safresh1     UV start, end;
1272f2a19305Safresh1     bool allow_literals = TRUE;
1273f2a19305Safresh1 
1274f2a19305Safresh1     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
1275f2a19305Safresh1 
1276f2a19305Safresh1     /* Generally, it is more readable if printable characters are output as
1277f2a19305Safresh1      * literals, but if a range (nearly) spans all of them, it's best to output
1278f2a19305Safresh1      * it as a single range.  This code will use a single range if all but 2
1279f2a19305Safresh1      * ASCII printables are in it */
1280f2a19305Safresh1     invlist_iterinit(invlist);
1281f2a19305Safresh1     while (invlist_iternext(invlist, &start, &end)) {
1282f2a19305Safresh1 
1283f2a19305Safresh1         /* If the range starts beyond the final printable, it doesn't have any
1284f2a19305Safresh1          * in it */
1285f2a19305Safresh1         if (start > MAX_PRINT_A) {
1286f2a19305Safresh1             break;
1287f2a19305Safresh1         }
1288f2a19305Safresh1 
1289f2a19305Safresh1         /* In both ASCII and EBCDIC, a SPACE is the lowest printable.  To span
1290f2a19305Safresh1          * all but two, the range must start and end no later than 2 from
1291f2a19305Safresh1          * either end */
1292f2a19305Safresh1         if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
1293f2a19305Safresh1             if (end > MAX_PRINT_A) {
1294f2a19305Safresh1                 end = MAX_PRINT_A;
1295f2a19305Safresh1             }
1296f2a19305Safresh1             if (start < ' ') {
1297f2a19305Safresh1                 start = ' ';
1298f2a19305Safresh1             }
1299f2a19305Safresh1             if (end - start >= MAX_PRINT_A - ' ' - 2) {
1300f2a19305Safresh1                 allow_literals = FALSE;
1301f2a19305Safresh1             }
1302f2a19305Safresh1             break;
1303f2a19305Safresh1         }
1304f2a19305Safresh1     }
1305f2a19305Safresh1     invlist_iterfinish(invlist);
1306f2a19305Safresh1 
1307f2a19305Safresh1     /* Here we have figured things out.  Output each range */
1308f2a19305Safresh1     invlist_iterinit(invlist);
1309f2a19305Safresh1     while (invlist_iternext(invlist, &start, &end)) {
1310f2a19305Safresh1         if (start >= NUM_ANYOF_CODE_POINTS) {
1311f2a19305Safresh1             break;
1312f2a19305Safresh1         }
1313f2a19305Safresh1         put_range(sv, start, end, allow_literals);
1314f2a19305Safresh1     }
1315f2a19305Safresh1     invlist_iterfinish(invlist);
1316f2a19305Safresh1 
1317f2a19305Safresh1     return;
1318f2a19305Safresh1 }
1319f2a19305Safresh1 
1320f2a19305Safresh1 STATIC SV*
1321f2a19305Safresh1 S_put_charclass_bitmap_innards_common(pTHX_
1322f2a19305Safresh1         SV* invlist,            /* The bitmap */
1323f2a19305Safresh1         SV* posixes,            /* Under /l, things like [:word:], \S */
1324f2a19305Safresh1         SV* only_utf8,          /* Under /d, matches iff the target is UTF-8 */
1325f2a19305Safresh1         SV* not_utf8,           /* /d, matches iff the target isn't UTF-8 */
1326f2a19305Safresh1         SV* only_utf8_locale,   /* Under /l, matches if the locale is UTF-8 */
1327f2a19305Safresh1         const bool invert       /* Is the result to be inverted? */
1328f2a19305Safresh1 )
1329f2a19305Safresh1 {
1330f2a19305Safresh1     /* Create and return an SV containing a displayable version of the bitmap
1331f2a19305Safresh1      * and associated information determined by the input parameters.  If the
1332f2a19305Safresh1      * output would have been only the inversion indicator '^', NULL is instead
1333f2a19305Safresh1      * returned. */
1334f2a19305Safresh1 
1335f2a19305Safresh1     SV * output;
1336f2a19305Safresh1 
1337f2a19305Safresh1     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
1338f2a19305Safresh1 
1339f2a19305Safresh1     if (invert) {
1340f2a19305Safresh1         output = newSVpvs("^");
1341f2a19305Safresh1     }
1342f2a19305Safresh1     else {
1343f2a19305Safresh1         output = newSVpvs("");
1344f2a19305Safresh1     }
1345f2a19305Safresh1 
1346f2a19305Safresh1     /* First, the code points in the bitmap that are unconditionally there */
1347f2a19305Safresh1     put_charclass_bitmap_innards_invlist(output, invlist);
1348f2a19305Safresh1 
1349f2a19305Safresh1     /* Traditionally, these have been placed after the main code points */
1350f2a19305Safresh1     if (posixes) {
1351f2a19305Safresh1         sv_catsv(output, posixes);
1352f2a19305Safresh1     }
1353f2a19305Safresh1 
1354f2a19305Safresh1     if (only_utf8 && _invlist_len(only_utf8)) {
1355f2a19305Safresh1         Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
1356f2a19305Safresh1         put_charclass_bitmap_innards_invlist(output, only_utf8);
1357f2a19305Safresh1     }
1358f2a19305Safresh1 
1359f2a19305Safresh1     if (not_utf8 && _invlist_len(not_utf8)) {
1360f2a19305Safresh1         Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
1361f2a19305Safresh1         put_charclass_bitmap_innards_invlist(output, not_utf8);
1362f2a19305Safresh1     }
1363f2a19305Safresh1 
1364f2a19305Safresh1     if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
1365f2a19305Safresh1         Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
1366f2a19305Safresh1         put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
1367f2a19305Safresh1 
1368f2a19305Safresh1         /* This is the only list in this routine that can legally contain code
1369f2a19305Safresh1          * points outside the bitmap range.  The call just above to
1370f2a19305Safresh1          * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
1371f2a19305Safresh1          * output them here.  There's about a half-dozen possible, and none in
1372f2a19305Safresh1          * contiguous ranges longer than 2 */
1373f2a19305Safresh1         if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
1374f2a19305Safresh1             UV start, end;
1375f2a19305Safresh1             SV* above_bitmap = NULL;
1376f2a19305Safresh1 
1377f2a19305Safresh1             _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
1378f2a19305Safresh1 
1379f2a19305Safresh1             invlist_iterinit(above_bitmap);
1380f2a19305Safresh1             while (invlist_iternext(above_bitmap, &start, &end)) {
1381f2a19305Safresh1                 UV i;
1382f2a19305Safresh1 
1383f2a19305Safresh1                 for (i = start; i <= end; i++) {
1384f2a19305Safresh1                     put_code_point(output, i);
1385f2a19305Safresh1                 }
1386f2a19305Safresh1             }
1387f2a19305Safresh1             invlist_iterfinish(above_bitmap);
1388f2a19305Safresh1             SvREFCNT_dec_NN(above_bitmap);
1389f2a19305Safresh1         }
1390f2a19305Safresh1     }
1391f2a19305Safresh1 
1392f2a19305Safresh1     if (invert && SvCUR(output) == 1) {
1393f2a19305Safresh1         return NULL;
1394f2a19305Safresh1     }
1395f2a19305Safresh1 
1396f2a19305Safresh1     return output;
1397f2a19305Safresh1 }
1398f2a19305Safresh1 
1399f2a19305Safresh1 STATIC U8
1400f2a19305Safresh1 S_put_charclass_bitmap_innards(pTHX_ SV *sv,
1401f2a19305Safresh1                                      char *bitmap,
1402f2a19305Safresh1                                      SV *nonbitmap_invlist,
1403f2a19305Safresh1                                      SV *only_utf8_locale_invlist,
1404f2a19305Safresh1                                      const regnode * const node,
1405f2a19305Safresh1                                      const U8 flags,
1406f2a19305Safresh1                                      const bool force_as_is_display)
1407f2a19305Safresh1 {
1408f2a19305Safresh1     /* Appends to 'sv' a displayable version of the innards of the bracketed
1409f2a19305Safresh1      * character class defined by the other arguments:
1410f2a19305Safresh1      *  'bitmap' points to the bitmap, or NULL if to ignore that.
1411f2a19305Safresh1      *  'nonbitmap_invlist' is an inversion list of the code points that are in
1412f2a19305Safresh1      *      the bitmap range, but for some reason aren't in the bitmap; NULL if
1413f2a19305Safresh1      *      none.  The reasons for this could be that they require some
1414f2a19305Safresh1      *      condition such as the target string being or not being in UTF-8
1415f2a19305Safresh1      *      (under /d), or because they came from a user-defined property that
1416f2a19305Safresh1      *      was not resolved at the time of the regex compilation (under /u)
1417f2a19305Safresh1      *  'only_utf8_locale_invlist' is an inversion list of the code points that
1418f2a19305Safresh1      *      are valid only if the runtime locale is a UTF-8 one; NULL if none
1419f2a19305Safresh1      *  'node' is the regex pattern ANYOF node.  It is needed only when the
1420f2a19305Safresh1      *      above two parameters are not null, and is passed so that this
1421f2a19305Safresh1      *      routine can tease apart the various reasons for them.
1422f2a19305Safresh1      *  'flags' is the flags field of 'node'
1423f2a19305Safresh1      *  'force_as_is_display' is TRUE if this routine should definitely NOT try
1424f2a19305Safresh1      *      to invert things to see if that leads to a cleaner display.  If
1425f2a19305Safresh1      *      FALSE, this routine is free to use its judgment about doing this.
1426f2a19305Safresh1      *
1427f2a19305Safresh1      * It returns 0 if nothing was actually output.  (It may be that
1428f2a19305Safresh1      *              the bitmap, etc is empty.)
1429f2a19305Safresh1      *            1 if the output wasn't inverted (didn't begin with a '^')
1430f2a19305Safresh1      *            2 if the output was inverted (did begin with a '^')
1431f2a19305Safresh1      *
1432f2a19305Safresh1      * When called for outputting the bitmap of a non-ANYOF node, just pass the
1433f2a19305Safresh1      * bitmap, with the succeeding parameters set to NULL, and the final one to
1434f2a19305Safresh1      * FALSE.
1435f2a19305Safresh1      */
1436f2a19305Safresh1 
1437f2a19305Safresh1     /* In general, it tries to display the 'cleanest' representation of the
1438f2a19305Safresh1      * innards, choosing whether to display them inverted or not, regardless of
1439f2a19305Safresh1      * whether the class itself is to be inverted.  However,  there are some
1440f2a19305Safresh1      * cases where it can't try inverting, as what actually matches isn't known
1441f2a19305Safresh1      * until runtime, and hence the inversion isn't either. */
1442f2a19305Safresh1 
1443f2a19305Safresh1     bool inverting_allowed = ! force_as_is_display;
1444f2a19305Safresh1 
1445f2a19305Safresh1     int i;
1446f2a19305Safresh1     STRLEN orig_sv_cur = SvCUR(sv);
1447f2a19305Safresh1 
1448f2a19305Safresh1     SV* invlist;            /* Inversion list we accumulate of code points that
1449f2a19305Safresh1                                are unconditionally matched */
1450f2a19305Safresh1     SV* only_utf8 = NULL;   /* Under /d, list of matches iff the target is
1451f2a19305Safresh1                                UTF-8 */
1452f2a19305Safresh1     SV* not_utf8 =  NULL;   /* /d, list of matches iff the target isn't UTF-8
1453f2a19305Safresh1                              */
1454f2a19305Safresh1     SV* posixes = NULL;     /* Under /l, string of things like [:word:], \D */
1455f2a19305Safresh1     SV* only_utf8_locale = NULL;    /* Under /l, list of matches if the locale
1456f2a19305Safresh1                                        is UTF-8 */
1457f2a19305Safresh1 
1458f2a19305Safresh1     SV* as_is_display;      /* The output string when we take the inputs
1459f2a19305Safresh1                                literally */
1460f2a19305Safresh1     SV* inverted_display;   /* The output string when we invert the inputs */
1461f2a19305Safresh1 
1462f2a19305Safresh1     bool invert = cBOOL(flags & ANYOF_INVERT);  /* Is the input to be inverted
1463f2a19305Safresh1                                                    to match? */
1464f2a19305Safresh1     /* We are biased in favor of displaying things without them being inverted,
1465f2a19305Safresh1      * as that is generally easier to understand */
1466f2a19305Safresh1     const int bias = 5;
1467f2a19305Safresh1 
1468f2a19305Safresh1     PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
1469f2a19305Safresh1 
1470f2a19305Safresh1     /* Start off with whatever code points are passed in.  (We clone, so we
1471f2a19305Safresh1      * don't change the caller's list) */
1472f2a19305Safresh1     if (nonbitmap_invlist) {
1473f2a19305Safresh1         assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
1474f2a19305Safresh1         invlist = invlist_clone(nonbitmap_invlist, NULL);
1475f2a19305Safresh1     }
1476f2a19305Safresh1     else {  /* Worst case size is every other code point is matched */
1477f2a19305Safresh1         invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
1478f2a19305Safresh1     }
1479f2a19305Safresh1 
1480f2a19305Safresh1     if (flags) {
1481f2a19305Safresh1         if (OP(node) == ANYOFD) {
1482f2a19305Safresh1 
1483f2a19305Safresh1             /* This flag indicates that the code points below 0x100 in the
1484f2a19305Safresh1              * nonbitmap list are precisely the ones that match only when the
1485f2a19305Safresh1              * target is UTF-8 (they should all be non-ASCII). */
1486f2a19305Safresh1             if (flags & ANYOF_HAS_EXTRA_RUNTIME_MATCHES) {
1487f2a19305Safresh1                 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
1488f2a19305Safresh1                 _invlist_subtract(invlist, only_utf8, &invlist);
1489f2a19305Safresh1             }
1490f2a19305Safresh1 
1491f2a19305Safresh1             /* And this flag for matching all non-ASCII 0xFF and below */
1492f2a19305Safresh1             if (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared) {
1493f2a19305Safresh1                 not_utf8 = invlist_clone(PL_UpperLatin1, NULL);
1494f2a19305Safresh1             }
1495f2a19305Safresh1         }
1496f2a19305Safresh1         else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) {
1497f2a19305Safresh1 
1498f2a19305Safresh1             /* If either of these flags are set, what matches isn't
1499f2a19305Safresh1              * determinable except during execution, so don't know enough here
1500f2a19305Safresh1              * to invert */
1501f2a19305Safresh1             if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
1502f2a19305Safresh1                 inverting_allowed = FALSE;
1503f2a19305Safresh1             }
1504f2a19305Safresh1 
1505f2a19305Safresh1             /* What the posix classes match also varies at runtime, so these
1506f2a19305Safresh1              * will be output symbolically. */
1507f2a19305Safresh1             if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
1508f2a19305Safresh1                 int i;
1509f2a19305Safresh1 
1510f2a19305Safresh1                 posixes = newSVpvs("");
1511f2a19305Safresh1                 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
1512f2a19305Safresh1                     if (ANYOF_POSIXL_TEST(node, i)) {
1513f2a19305Safresh1                         sv_catpv(posixes, anyofs[i]);
1514f2a19305Safresh1                     }
1515f2a19305Safresh1                 }
1516f2a19305Safresh1             }
1517f2a19305Safresh1         }
1518f2a19305Safresh1     }
1519f2a19305Safresh1 
1520f2a19305Safresh1     /* Accumulate the bit map into the unconditional match list */
1521f2a19305Safresh1     if (bitmap) {
1522f2a19305Safresh1         for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1523f2a19305Safresh1             if (BITMAP_TEST(bitmap, i)) {
1524f2a19305Safresh1                 int start = i++;
1525f2a19305Safresh1                 for (;
1526f2a19305Safresh1                      i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i);
1527f2a19305Safresh1                      i++)
1528f2a19305Safresh1                 { /* empty */ }
1529f2a19305Safresh1                 invlist = _add_range_to_invlist(invlist, start, i-1);
1530f2a19305Safresh1             }
1531f2a19305Safresh1         }
1532f2a19305Safresh1     }
1533f2a19305Safresh1 
1534f2a19305Safresh1     /* Make sure that the conditional match lists don't have anything in them
1535f2a19305Safresh1      * that match unconditionally; otherwise the output is quite confusing.
1536f2a19305Safresh1      * This could happen if the code that populates these misses some
1537f2a19305Safresh1      * duplication. */
1538f2a19305Safresh1     if (only_utf8) {
1539f2a19305Safresh1         _invlist_subtract(only_utf8, invlist, &only_utf8);
1540f2a19305Safresh1     }
1541f2a19305Safresh1     if (not_utf8) {
1542f2a19305Safresh1         _invlist_subtract(not_utf8, invlist, &not_utf8);
1543f2a19305Safresh1     }
1544f2a19305Safresh1 
1545f2a19305Safresh1     if (only_utf8_locale_invlist) {
1546f2a19305Safresh1 
1547f2a19305Safresh1         /* Since this list is passed in, we have to make a copy before
1548f2a19305Safresh1          * modifying it */
1549f2a19305Safresh1         only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL);
1550f2a19305Safresh1 
1551f2a19305Safresh1         _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
1552f2a19305Safresh1 
1553f2a19305Safresh1         /* And, it can get really weird for us to try outputting an inverted
1554f2a19305Safresh1          * form of this list when it has things above the bitmap, so don't even
1555f2a19305Safresh1          * try */
1556f2a19305Safresh1         if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
1557f2a19305Safresh1             inverting_allowed = FALSE;
1558f2a19305Safresh1         }
1559f2a19305Safresh1     }
1560f2a19305Safresh1 
1561f2a19305Safresh1     /* Calculate what the output would be if we take the input as-is */
1562f2a19305Safresh1     as_is_display = put_charclass_bitmap_innards_common(invlist,
1563f2a19305Safresh1                                                     posixes,
1564f2a19305Safresh1                                                     only_utf8,
1565f2a19305Safresh1                                                     not_utf8,
1566f2a19305Safresh1                                                     only_utf8_locale,
1567f2a19305Safresh1                                                     invert);
1568f2a19305Safresh1 
1569f2a19305Safresh1     /* If have to take the output as-is, just do that */
1570f2a19305Safresh1     if (! inverting_allowed) {
1571f2a19305Safresh1         if (as_is_display) {
1572f2a19305Safresh1             sv_catsv(sv, as_is_display);
1573f2a19305Safresh1             SvREFCNT_dec_NN(as_is_display);
1574f2a19305Safresh1         }
1575f2a19305Safresh1     }
1576f2a19305Safresh1     else { /* But otherwise, create the output again on the inverted input, and
1577f2a19305Safresh1               use whichever version is shorter */
1578f2a19305Safresh1 
1579f2a19305Safresh1         int inverted_bias, as_is_bias;
1580f2a19305Safresh1 
1581f2a19305Safresh1         /* We will apply our bias to whichever of the results doesn't have
1582f2a19305Safresh1          * the '^' */
1583f2a19305Safresh1         bool trial_invert;
1584f2a19305Safresh1         if (invert) {
1585f2a19305Safresh1             trial_invert = FALSE;
1586f2a19305Safresh1             as_is_bias = bias;
1587f2a19305Safresh1             inverted_bias = 0;
1588f2a19305Safresh1         }
1589f2a19305Safresh1         else {
1590f2a19305Safresh1             trial_invert = TRUE;
1591f2a19305Safresh1             as_is_bias = 0;
1592f2a19305Safresh1             inverted_bias = bias;
1593f2a19305Safresh1         }
1594f2a19305Safresh1 
1595f2a19305Safresh1         /* Now invert each of the lists that contribute to the output,
1596f2a19305Safresh1          * excluding from the result things outside the possible range */
1597f2a19305Safresh1 
1598f2a19305Safresh1         /* For the unconditional inversion list, we have to add in all the
1599f2a19305Safresh1          * conditional code points, so that when inverted, they will be gone
1600f2a19305Safresh1          * from it */
1601f2a19305Safresh1         _invlist_union(only_utf8, invlist, &invlist);
1602f2a19305Safresh1         _invlist_union(not_utf8, invlist, &invlist);
1603f2a19305Safresh1         _invlist_union(only_utf8_locale, invlist, &invlist);
1604f2a19305Safresh1         _invlist_invert(invlist);
1605f2a19305Safresh1         _invlist_intersection(invlist, PL_InBitmap, &invlist);
1606f2a19305Safresh1 
1607f2a19305Safresh1         if (only_utf8) {
1608f2a19305Safresh1             _invlist_invert(only_utf8);
1609f2a19305Safresh1             _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
1610f2a19305Safresh1         }
1611f2a19305Safresh1         else if (not_utf8) {
1612f2a19305Safresh1 
1613f2a19305Safresh1             /* If a code point matches iff the target string is not in UTF-8,
1614f2a19305Safresh1              * then complementing the result has it not match iff not in UTF-8,
1615f2a19305Safresh1              * which is the same thing as matching iff it is UTF-8. */
1616f2a19305Safresh1             only_utf8 = not_utf8;
1617f2a19305Safresh1             not_utf8 = NULL;
1618f2a19305Safresh1         }
1619f2a19305Safresh1 
1620f2a19305Safresh1         if (only_utf8_locale) {
1621f2a19305Safresh1             _invlist_invert(only_utf8_locale);
1622f2a19305Safresh1             _invlist_intersection(only_utf8_locale,
1623f2a19305Safresh1                                   PL_InBitmap,
1624f2a19305Safresh1                                   &only_utf8_locale);
1625f2a19305Safresh1         }
1626f2a19305Safresh1 
1627f2a19305Safresh1         inverted_display = put_charclass_bitmap_innards_common(
1628f2a19305Safresh1                                             invlist,
1629f2a19305Safresh1                                             posixes,
1630f2a19305Safresh1                                             only_utf8,
1631f2a19305Safresh1                                             not_utf8,
1632f2a19305Safresh1                                             only_utf8_locale, trial_invert);
1633f2a19305Safresh1 
1634f2a19305Safresh1         /* Use the shortest representation, taking into account our bias
1635f2a19305Safresh1          * against showing it inverted */
1636f2a19305Safresh1         if (   inverted_display
1637f2a19305Safresh1             && (   ! as_is_display
1638f2a19305Safresh1                 || (  SvCUR(inverted_display) + inverted_bias
1639f2a19305Safresh1                     < SvCUR(as_is_display)    + as_is_bias)))
1640f2a19305Safresh1         {
1641f2a19305Safresh1             sv_catsv(sv, inverted_display);
1642f2a19305Safresh1             invert = ! invert;
1643f2a19305Safresh1         }
1644f2a19305Safresh1         else if (as_is_display) {
1645f2a19305Safresh1             sv_catsv(sv, as_is_display);
1646f2a19305Safresh1         }
1647f2a19305Safresh1 
1648f2a19305Safresh1         SvREFCNT_dec(as_is_display);
1649f2a19305Safresh1         SvREFCNT_dec(inverted_display);
1650f2a19305Safresh1     }
1651f2a19305Safresh1 
1652f2a19305Safresh1     SvREFCNT_dec_NN(invlist);
1653f2a19305Safresh1     SvREFCNT_dec(only_utf8);
1654f2a19305Safresh1     SvREFCNT_dec(not_utf8);
1655f2a19305Safresh1     SvREFCNT_dec(posixes);
1656f2a19305Safresh1     SvREFCNT_dec(only_utf8_locale);
1657f2a19305Safresh1 
1658f2a19305Safresh1     U8 did_output_something = (bool) (SvCUR(sv) > orig_sv_cur);
1659f2a19305Safresh1     if (did_output_something) {
1660f2a19305Safresh1         /* Distinguish between non and inverted cases */
1661f2a19305Safresh1         did_output_something += invert;
1662f2a19305Safresh1     }
1663f2a19305Safresh1 
1664f2a19305Safresh1     return did_output_something;
1665f2a19305Safresh1 }
1666f2a19305Safresh1 #endif /* DEBUGGING */
1667