xref: /openbsd-src/gnu/usr.bin/perl/cpan/Text-Balanced/lib/Text/Balanced.pm (revision e068048151d29f2562a32185e21a8ba885482260)
1eac174f2Safresh1# Copyright (C) 1997-2001 Damian Conway.  All rights reserved.
2eac174f2Safresh1# Copyright (C) 2009 Adam Kennedy.
3*e0680481Safresh1# Copyright (C) 2015, 2022 Steve Hay and other contributors.  All rights
4*e0680481Safresh1# reserved.
5eac174f2Safresh1
6eac174f2Safresh1# This module is free software; you can redistribute it and/or modify it under
7eac174f2Safresh1# the same terms as Perl itself, i.e. under the terms of either the GNU General
8eac174f2Safresh1# Public License or the Artistic License, as specified in the F<LICENCE> file.
9eac174f2Safresh1
10b39c5158Smillertpackage Text::Balanced;
11b39c5158Smillert
12b39c5158Smillert# EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS.
13b39c5158Smillert# FOR FULL DOCUMENTATION SEE Balanced.pod
14b39c5158Smillert
15eac174f2Safresh1use 5.008001;
16b39c5158Smillertuse strict;
17b39c5158Smillertuse Exporter ();
18b39c5158Smillert
19b39c5158Smillertuse vars qw { $VERSION @ISA %EXPORT_TAGS };
20b39c5158SmillertBEGIN {
21*e0680481Safresh1    $VERSION     = '2.06';
22b39c5158Smillert    @ISA         = 'Exporter';
23b39c5158Smillert    %EXPORT_TAGS = (
24b39c5158Smillert        ALL => [ qw{
25b39c5158Smillert            &extract_delimited
26b39c5158Smillert            &extract_bracketed
27b39c5158Smillert            &extract_quotelike
28b39c5158Smillert            &extract_codeblock
29b39c5158Smillert            &extract_variable
30b39c5158Smillert            &extract_tagged
31b39c5158Smillert            &extract_multiple
32b39c5158Smillert            &gen_delimited_pat
33b39c5158Smillert            &gen_extract_tagged
34b39c5158Smillert            &delimited_pat
35b39c5158Smillert        } ],
36b39c5158Smillert    );
37b39c5158Smillert}
38b39c5158Smillert
39b39c5158SmillertExporter::export_ok_tags('ALL');
40b39c5158Smillert
41*e0680481Safresh1our $RE_PREREGEX_PAT = qr#(
42*e0680481Safresh1    [!=]~
43*e0680481Safresh1    | split|grep|map
44*e0680481Safresh1    | not|and|or|xor
45*e0680481Safresh1)#x;
46*e0680481Safresh1our $RE_EXPR_PAT = qr#(
47*e0680481Safresh1    (?:\*\*|&&|\|\||<<|>>|//|[-+*x%^&|.])=?
48*e0680481Safresh1    | /(?:[^/])
49*e0680481Safresh1    | =(?!>)
50*e0680481Safresh1    | return
51*e0680481Safresh1    | [\(\[]
52*e0680481Safresh1)#x;
53*e0680481Safresh1our $RE_NUM = qr/\s*[+\-.0-9][+\-.0-9e]*/i; # numerical constant
54eac174f2Safresh1
55*e0680481Safresh1our %ref2slashvalid; # is quotelike /.../ pattern valid here for given textref?
56*e0680481Safresh1our %ref2qmarkvalid; # is quotelike ?...? pattern valid here for given textref?
57b39c5158Smillert
58b39c5158Smillert# HANDLE RETURN VALUES IN VARIOUS CONTEXTS
59b39c5158Smillert
60b39c5158Smillertsub _failmsg {
61b39c5158Smillert    my ($message, $pos) = @_;
62b39c5158Smillert    $@ = bless {
63b39c5158Smillert        error => $message,
64b39c5158Smillert        pos   => $pos,
65b39c5158Smillert    }, 'Text::Balanced::ErrorMsg';
66b39c5158Smillert}
67b39c5158Smillert
68b39c5158Smillertsub _fail {
69b39c5158Smillert    my ($wantarray, $textref, $message, $pos) = @_;
70b39c5158Smillert    _failmsg $message, $pos if $message;
71b39c5158Smillert    return (undef, $$textref, undef) if $wantarray;
72eac174f2Safresh1    return;
73b39c5158Smillert}
74b39c5158Smillert
75b39c5158Smillertsub _succeed {
76b39c5158Smillert    $@ = undef;
77b39c5158Smillert    my ($wantarray,$textref) = splice @_, 0, 2;
78b39c5158Smillert    my ($extrapos, $extralen) = @_ > 18
79b39c5158Smillert        ? splice(@_, -2, 2)
80b39c5158Smillert        : (0, 0);
81b39c5158Smillert    my ($startlen, $oppos) = @_[5,6];
82b39c5158Smillert    my $remainderpos = $_[2];
83b39c5158Smillert    if ( $wantarray ) {
84b39c5158Smillert        my @res;
85b39c5158Smillert        while (my ($from, $len) = splice @_, 0, 2) {
86b39c5158Smillert            push @res, substr($$textref, $from, $len);
87b39c5158Smillert        }
88b39c5158Smillert        if ( $extralen ) { # CORRECT FILLET
89b39c5158Smillert            my $extra = substr($res[0], $extrapos-$oppos, $extralen, "\n");
90b39c5158Smillert            $res[1] = "$extra$res[1]";
91b39c5158Smillert            eval { substr($$textref,$remainderpos,0) = $extra;
92b39c5158Smillert                   substr($$textref,$extrapos,$extralen,"\n")} ;
93b39c5158Smillert                    #REARRANGE HERE DOC AND FILLET IF POSSIBLE
94b39c5158Smillert            pos($$textref) = $remainderpos-$extralen+1; # RESET \G
95b39c5158Smillert        } else {
96b39c5158Smillert            pos($$textref) = $remainderpos;             # RESET \G
97b39c5158Smillert        }
98b39c5158Smillert        return @res;
99b39c5158Smillert    } else {
100b39c5158Smillert        my $match = substr($$textref,$_[0],$_[1]);
101b39c5158Smillert        substr($match,$extrapos-$_[0]-$startlen,$extralen,"") if $extralen;
102b39c5158Smillert        my $extra = $extralen
103b39c5158Smillert            ? substr($$textref, $extrapos, $extralen)."\n" : "";
104b39c5158Smillert        eval {substr($$textref,$_[4],$_[1]+$_[5])=$extra} ;     #CHOP OUT PREFIX & MATCH, IF POSSIBLE
105b39c5158Smillert        pos($$textref) = $_[4];                         # RESET \G
106b39c5158Smillert        return $match;
107b39c5158Smillert    }
108b39c5158Smillert}
109b39c5158Smillert
110b39c5158Smillert# BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING
111*e0680481Safresh1## no critic (Subroutines::ProhibitSubroutinePrototypes)
112b39c5158Smillert
113b39c5158Smillertsub gen_delimited_pat($;$)  # ($delimiters;$escapes)
114b39c5158Smillert{
115b39c5158Smillert    my ($dels, $escs) = @_;
116b39c5158Smillert    return "" unless $dels =~ /\S/;
117b39c5158Smillert    $escs = '\\' unless $escs;
118b39c5158Smillert    $escs .= substr($escs,-1) x (length($dels)-length($escs));
119b39c5158Smillert    my @pat = ();
120b39c5158Smillert    my $i;
121b39c5158Smillert    for ($i=0; $i<length $dels; $i++)
122b39c5158Smillert    {
123b39c5158Smillert        my $del = quotemeta substr($dels,$i,1);
124b39c5158Smillert        my $esc = quotemeta substr($escs,$i,1);
125b39c5158Smillert        if ($del eq $esc)
126b39c5158Smillert        {
127b39c5158Smillert            push @pat, "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del";
128b39c5158Smillert        }
129b39c5158Smillert        else
130b39c5158Smillert        {
131b39c5158Smillert            push @pat, "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del";
132b39c5158Smillert        }
133b39c5158Smillert    }
134b39c5158Smillert    my $pat = join '|', @pat;
135b39c5158Smillert    return "(?:$pat)";
136b39c5158Smillert}
137b39c5158Smillert
138b39c5158Smillert*delimited_pat = \&gen_delimited_pat;
139b39c5158Smillert
140b39c5158Smillert# THE EXTRACTION FUNCTIONS
141b39c5158Smillert
142b39c5158Smillertsub extract_delimited (;$$$$)
143b39c5158Smillert{
144b39c5158Smillert    my $textref = defined $_[0] ? \$_[0] : \$_;
145*e0680481Safresh1    $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0 if !pos($$textref); # reset
146b39c5158Smillert    my $wantarray = wantarray;
147b39c5158Smillert    my $del  = defined $_[1] ? $_[1] : qq{\'\"\`};
148b39c5158Smillert    my $pre  = defined $_[2] ? $_[2] : '\s*';
149b39c5158Smillert    my $esc  = defined $_[3] ? $_[3] : qq{\\};
150b39c5158Smillert    my $pat = gen_delimited_pat($del, $esc);
151b39c5158Smillert    my $startpos = pos $$textref || 0;
152b39c5158Smillert    return _fail($wantarray, $textref, "Not a delimited pattern", 0)
153b39c5158Smillert        unless $$textref =~ m/\G($pre)($pat)/gc;
154b39c5158Smillert    my $prelen = length($1);
155b39c5158Smillert    my $matchpos = $startpos+$prelen;
156b39c5158Smillert    my $endpos = pos $$textref;
157b39c5158Smillert    return _succeed $wantarray, $textref,
158b39c5158Smillert                    $matchpos, $endpos-$matchpos,               # MATCH
159b39c5158Smillert                    $endpos,   length($$textref)-$endpos,       # REMAINDER
160b39c5158Smillert                    $startpos, $prelen;                         # PREFIX
161b39c5158Smillert}
162b39c5158Smillert
163*e0680481Safresh1my %eb_delim_cache;
164*e0680481Safresh1sub _eb_delims {
165*e0680481Safresh1    my ($ldel_orig) = @_;
166*e0680481Safresh1    return @{ $eb_delim_cache{$ldel_orig} } if $eb_delim_cache{$ldel_orig};
167b39c5158Smillert    my $qdel = "";
168b39c5158Smillert    my $quotelike;
169*e0680481Safresh1    my $ldel = $ldel_orig;
170b39c5158Smillert    $ldel =~ s/'//g and $qdel .= q{'};
171b39c5158Smillert    $ldel =~ s/"//g and $qdel .= q{"};
172b39c5158Smillert    $ldel =~ s/`//g and $qdel .= q{`};
173b39c5158Smillert    $ldel =~ s/q//g and $quotelike = 1;
174b39c5158Smillert    $ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds;
175b39c5158Smillert    my $rdel = $ldel;
176*e0680481Safresh1    return @{ $eb_delim_cache{$ldel_orig} = [] } unless $rdel =~ tr/[({</])}>/;
177*e0680481Safresh1    my $posbug = pos;
178*e0680481Safresh1    $ldel = join('|', map { quotemeta $_ } split('', $ldel));
179*e0680481Safresh1    $rdel = join('|', map { quotemeta $_ } split('', $rdel));
180*e0680481Safresh1    pos = $posbug;
181*e0680481Safresh1    @{ $eb_delim_cache{$ldel_orig} = [
182*e0680481Safresh1        qr/\G($ldel)/, $qdel && qr/\G([$qdel])/, $quotelike, qr/\G($rdel)/
183*e0680481Safresh1    ] };
184*e0680481Safresh1}
185*e0680481Safresh1sub extract_bracketed (;$$$)
186*e0680481Safresh1{
187*e0680481Safresh1    my $textref = defined $_[0] ? \$_[0] : \$_;
188*e0680481Safresh1    $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0 if !pos($$textref); # reset
189*e0680481Safresh1    my $ldel = defined $_[1] ? $_[1] : '{([<';
190*e0680481Safresh1    my $pre  = defined $_[2] ? qr/\G$_[2]/ : qr/\G\s*/;
191*e0680481Safresh1    my $wantarray = wantarray;
192*e0680481Safresh1    my @ret = _eb_delims($ldel);
193*e0680481Safresh1    unless (@ret)
194b39c5158Smillert    {
195b39c5158Smillert        return _fail $wantarray, $textref,
196b39c5158Smillert                     "Did not find a suitable bracket in delimiter: \"$_[1]\"",
197b39c5158Smillert                     0;
198b39c5158Smillert    }
199b39c5158Smillert
200b39c5158Smillert    my $startpos = pos $$textref || 0;
201*e0680481Safresh1    my @match = _match_bracketed($textref, $pre, @ret);
202b39c5158Smillert
203b39c5158Smillert    return _fail ($wantarray, $textref) unless @match;
204b39c5158Smillert
205b39c5158Smillert    return _succeed ( $wantarray, $textref,
206b39c5158Smillert                      $match[2], $match[5]+2,           # MATCH
207b39c5158Smillert                      @match[8,9],                      # REMAINDER
208b39c5158Smillert                      @match[0,1],                      # PREFIX
209b39c5158Smillert                    );
210b39c5158Smillert}
211b39c5158Smillert
212*e0680481Safresh1sub _match_bracketed    # $textref, $pre, $ldel, $qdel, $quotelike, $rdel
213b39c5158Smillert{
214b39c5158Smillert    my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_;
215b39c5158Smillert    my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0);
216*e0680481Safresh1    unless ($$textref =~ m/$pre/gc)
217b39c5158Smillert    {
218b39c5158Smillert        _failmsg "Did not find prefix: /$pre/", $startpos;
219b39c5158Smillert        return;
220b39c5158Smillert    }
221b39c5158Smillert
222b39c5158Smillert    $ldelpos = pos $$textref;
223b39c5158Smillert
224*e0680481Safresh1    unless ($$textref =~ m/$ldel/gc)
225b39c5158Smillert    {
226b39c5158Smillert        _failmsg "Did not find opening bracket after prefix: \"$pre\"",
227b39c5158Smillert                 pos $$textref;
228b39c5158Smillert        pos $$textref = $startpos;
229b39c5158Smillert        return;
230b39c5158Smillert    }
231b39c5158Smillert
232b39c5158Smillert    my @nesting = ( $1 );
233b39c5158Smillert    my $textlen = length $$textref;
234b39c5158Smillert    while (pos $$textref < $textlen)
235b39c5158Smillert    {
236b39c5158Smillert        next if $$textref =~ m/\G\\./gcs;
237b39c5158Smillert
238*e0680481Safresh1        if ($$textref =~ m/$ldel/gc)
239b39c5158Smillert        {
240b39c5158Smillert            push @nesting, $1;
241b39c5158Smillert        }
242*e0680481Safresh1        elsif ($$textref =~ m/$rdel/gc)
243b39c5158Smillert        {
244b39c5158Smillert            my ($found, $brackettype) = ($1, $1);
245b39c5158Smillert            if ($#nesting < 0)
246b39c5158Smillert            {
247b39c5158Smillert                _failmsg "Unmatched closing bracket: \"$found\"",
248b39c5158Smillert                         pos $$textref;
249b39c5158Smillert                pos $$textref = $startpos;
250b39c5158Smillert                return;
251b39c5158Smillert            }
252b39c5158Smillert            my $expected = pop(@nesting);
253b39c5158Smillert            $expected =~ tr/({[</)}]>/;
254b39c5158Smillert            if ($expected ne $brackettype)
255b39c5158Smillert            {
256b39c5158Smillert                _failmsg qq{Mismatched closing bracket: expected "$expected" but found "$found"},
257b39c5158Smillert                         pos $$textref;
258b39c5158Smillert                pos $$textref = $startpos;
259b39c5158Smillert                return;
260b39c5158Smillert            }
261b39c5158Smillert            last if $#nesting < 0;
262b39c5158Smillert        }
263*e0680481Safresh1        elsif ($qdel && $$textref =~ m/$qdel/gc)
264b39c5158Smillert        {
265b39c5158Smillert            $$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gsc and next;
266b39c5158Smillert            _failmsg "Unmatched embedded quote ($1)",
267b39c5158Smillert                     pos $$textref;
268b39c5158Smillert            pos $$textref = $startpos;
269b39c5158Smillert            return;
270b39c5158Smillert        }
271*e0680481Safresh1        elsif ($quotelike && _match_quotelike($textref,qr/\G()/,$ref2slashvalid{$textref},$ref2qmarkvalid{$textref}))
272b39c5158Smillert        {
273*e0680481Safresh1            $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 1; # back-compat
274b39c5158Smillert            next;
275b39c5158Smillert        }
276b39c5158Smillert
277b39c5158Smillert        else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs }
278b39c5158Smillert    }
279b39c5158Smillert    if ($#nesting>=0)
280b39c5158Smillert    {
281b39c5158Smillert        _failmsg "Unmatched opening bracket(s): "
282b39c5158Smillert                     . join("..",@nesting)."..",
283b39c5158Smillert                 pos $$textref;
284b39c5158Smillert        pos $$textref = $startpos;
285b39c5158Smillert        return;
286b39c5158Smillert    }
287b39c5158Smillert
288b39c5158Smillert    $endpos = pos $$textref;
289b39c5158Smillert
290b39c5158Smillert    return (
291b39c5158Smillert        $startpos,  $ldelpos-$startpos,         # PREFIX
292b39c5158Smillert        $ldelpos,   1,                          # OPENING BRACKET
293b39c5158Smillert        $ldelpos+1, $endpos-$ldelpos-2,         # CONTENTS
294b39c5158Smillert        $endpos-1,  1,                          # CLOSING BRACKET
295b39c5158Smillert        $endpos,    length($$textref)-$endpos,  # REMAINDER
296b39c5158Smillert    );
297b39c5158Smillert}
298b39c5158Smillert
299b39c5158Smillertsub _revbracket($)
300b39c5158Smillert{
301b39c5158Smillert    my $brack = reverse $_[0];
302b39c5158Smillert    $brack =~ tr/[({</])}>/;
303b39c5158Smillert    return $brack;
304b39c5158Smillert}
305b39c5158Smillert
306b39c5158Smillertmy $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*};
307b39c5158Smillert
308*e0680481Safresh1my $et_default_ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>';
309b39c5158Smillertsub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options)
310b39c5158Smillert{
311b39c5158Smillert    my $textref = defined $_[0] ? \$_[0] : \$_;
312*e0680481Safresh1    $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0 if !pos($$textref); # reset
313b39c5158Smillert    my $ldel    = $_[1];
314b39c5158Smillert    my $rdel    = $_[2];
315*e0680481Safresh1    my $pre     = defined $_[3] ? qr/\G$_[3]/ : qr/\G\s*/;
316b39c5158Smillert    my %options = defined $_[4] ? %{$_[4]} : ();
317b39c5158Smillert    my $omode   = defined $options{fail} ? $options{fail} : '';
318b39c5158Smillert    my $bad     = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
319b39c5158Smillert                : defined($options{reject})        ? $options{reject}
320b39c5158Smillert                :                                    ''
321b39c5158Smillert                ;
322b39c5158Smillert    my $ignore  = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
323b39c5158Smillert                : defined($options{ignore})        ? $options{ignore}
324b39c5158Smillert                :                                    ''
325b39c5158Smillert                ;
326b39c5158Smillert
327*e0680481Safresh1    $ldel = $et_default_ldel if !defined $ldel;
328b39c5158Smillert    $@ = undef;
329b39c5158Smillert
330b39c5158Smillert    my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
331b39c5158Smillert
332b39c5158Smillert    return _fail(wantarray, $textref) unless @match;
333b39c5158Smillert    return _succeed wantarray, $textref,
334b39c5158Smillert            $match[2], $match[3]+$match[5]+$match[7],    # MATCH
335b39c5158Smillert            @match[8..9,0..1,2..7];                      # REM, PRE, BITS
336b39c5158Smillert}
337b39c5158Smillert
338b39c5158Smillertsub _match_tagged       # ($$$$$$$)
339b39c5158Smillert{
340b39c5158Smillert    my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_;
341b39c5158Smillert    my $rdelspec;
342b39c5158Smillert
343b39c5158Smillert    my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 );
344b39c5158Smillert
345*e0680481Safresh1    unless ($$textref =~ m/$pre/gc)
346b39c5158Smillert    {
347b39c5158Smillert        _failmsg "Did not find prefix: /$pre/", pos $$textref;
348b39c5158Smillert        goto failed;
349b39c5158Smillert    }
350b39c5158Smillert
351b39c5158Smillert    $opentagpos = pos($$textref);
352b39c5158Smillert
353b39c5158Smillert    unless ($$textref =~ m/\G$ldel/gc)
354b39c5158Smillert    {
355b39c5158Smillert        _failmsg "Did not find opening tag: /$ldel/", pos $$textref;
356b39c5158Smillert        goto failed;
357b39c5158Smillert    }
358b39c5158Smillert
359b39c5158Smillert    $textpos = pos($$textref);
360b39c5158Smillert
361b39c5158Smillert    if (!defined $rdel)
362b39c5158Smillert    {
363b39c5158Smillert        $rdelspec = substr($$textref, $-[0], $+[0] - $-[0]);
364b39c5158Smillert        unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". _revbracket($1) /oes)
365b39c5158Smillert        {
366b39c5158Smillert            _failmsg "Unable to construct closing tag to match: $rdel",
367b39c5158Smillert                     pos $$textref;
368b39c5158Smillert            goto failed;
369b39c5158Smillert        }
370b39c5158Smillert    }
371b39c5158Smillert    else
372b39c5158Smillert    {
373eac174f2Safresh1        ## no critic (BuiltinFunctions::ProhibitStringyEval)
374b39c5158Smillert        $rdelspec = eval "qq{$rdel}" || do {
375b39c5158Smillert            my $del;
376b39c5158Smillert            for (qw,~ ! ^ & * ) _ + - = } ] : " ; ' > . ? / | ',)
377b39c5158Smillert                { next if $rdel =~ /\Q$_/; $del = $_; last }
378b39c5158Smillert            unless ($del) {
379b39c5158Smillert                use Carp;
380b39c5158Smillert                croak "Can't interpolate right delimiter $rdel"
381b39c5158Smillert            }
382b39c5158Smillert            eval "qq$del$rdel$del";
383b39c5158Smillert        };
384b39c5158Smillert    }
385b39c5158Smillert
386b39c5158Smillert    while (pos($$textref) < length($$textref))
387b39c5158Smillert    {
388b39c5158Smillert        next if $$textref =~ m/\G\\./gc;
389b39c5158Smillert
390b39c5158Smillert        if ($$textref =~ m/\G(\n[ \t]*\n)/gc )
391b39c5158Smillert        {
392b39c5158Smillert            $parapos = pos($$textref) - length($1)
393b39c5158Smillert                unless defined $parapos;
394b39c5158Smillert        }
395b39c5158Smillert        elsif ($$textref =~ m/\G($rdelspec)/gc )
396b39c5158Smillert        {
397b39c5158Smillert            $closetagpos = pos($$textref)-length($1);
398b39c5158Smillert            goto matched;
399b39c5158Smillert        }
400b39c5158Smillert        elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc)
401b39c5158Smillert        {
402b39c5158Smillert            next;
403b39c5158Smillert        }
404b39c5158Smillert        elsif ($bad && $$textref =~ m/\G($bad)/gcs)
405b39c5158Smillert        {
406b39c5158Smillert            pos($$textref) -= length($1);       # CUT OFF WHATEVER CAUSED THE SHORTNESS
407b39c5158Smillert            goto short if ($omode eq 'PARA' || $omode eq 'MAX');
408b39c5158Smillert            _failmsg "Found invalid nested tag: $1", pos $$textref;
409b39c5158Smillert            goto failed;
410b39c5158Smillert        }
411b39c5158Smillert        elsif ($$textref =~ m/\G($ldel)/gc)
412b39c5158Smillert        {
413b39c5158Smillert            my $tag = $1;
414b39c5158Smillert            pos($$textref) -= length($tag);     # REWIND TO NESTED TAG
415b39c5158Smillert            unless (_match_tagged(@_))  # MATCH NESTED TAG
416b39c5158Smillert            {
417b39c5158Smillert                goto short if $omode eq 'PARA' || $omode eq 'MAX';
418b39c5158Smillert                _failmsg "Found unbalanced nested tag: $tag",
419b39c5158Smillert                         pos $$textref;
420b39c5158Smillert                goto failed;
421b39c5158Smillert            }
422b39c5158Smillert        }
423b39c5158Smillert        else { $$textref =~ m/./gcs }
424b39c5158Smillert    }
425b39c5158Smillert
426b39c5158Smillertshort:
427b39c5158Smillert    $closetagpos = pos($$textref);
428b39c5158Smillert    goto matched if $omode eq 'MAX';
429b39c5158Smillert    goto failed unless $omode eq 'PARA';
430b39c5158Smillert
431b39c5158Smillert    if (defined $parapos) { pos($$textref) = $parapos }
432b39c5158Smillert    else                  { $parapos = pos($$textref) }
433b39c5158Smillert
434b39c5158Smillert    return (
435b39c5158Smillert        $startpos,    $opentagpos-$startpos,            # PREFIX
436b39c5158Smillert        $opentagpos,  $textpos-$opentagpos,             # OPENING TAG
437b39c5158Smillert        $textpos,     $parapos-$textpos,                # TEXT
438b39c5158Smillert        $parapos,     0,                                # NO CLOSING TAG
439b39c5158Smillert        $parapos,     length($$textref)-$parapos,       # REMAINDER
440b39c5158Smillert    );
441b39c5158Smillert
442b39c5158Smillertmatched:
443b39c5158Smillert    $endpos = pos($$textref);
444b39c5158Smillert    return (
445b39c5158Smillert        $startpos,    $opentagpos-$startpos,            # PREFIX
446b39c5158Smillert        $opentagpos,  $textpos-$opentagpos,             # OPENING TAG
447b39c5158Smillert        $textpos,     $closetagpos-$textpos,            # TEXT
448b39c5158Smillert        $closetagpos, $endpos-$closetagpos,             # CLOSING TAG
449b39c5158Smillert        $endpos,      length($$textref)-$endpos,        # REMAINDER
450b39c5158Smillert    );
451b39c5158Smillert
452b39c5158Smillertfailed:
453b39c5158Smillert    _failmsg "Did not find closing tag", pos $$textref unless $@;
454b39c5158Smillert    pos($$textref) = $startpos;
455b39c5158Smillert    return;
456b39c5158Smillert}
457b39c5158Smillert
458b39c5158Smillertsub extract_variable (;$$)
459b39c5158Smillert{
460b39c5158Smillert    my $textref = defined $_[0] ? \$_[0] : \$_;
461b39c5158Smillert    return ("","","") unless defined $$textref;
462*e0680481Safresh1    $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0 if !pos($$textref); # reset
463*e0680481Safresh1    my $pre  = defined $_[1] ? qr/\G$_[1]/ : qr/\G\s*/;
464b39c5158Smillert
465b39c5158Smillert    my @match = _match_variable($textref,$pre);
466b39c5158Smillert
467b39c5158Smillert    return _fail wantarray, $textref unless @match;
468b39c5158Smillert
469b39c5158Smillert    return _succeed wantarray, $textref,
470b39c5158Smillert                    @match[2..3,4..5,0..1];        # MATCH, REMAINDER, PREFIX
471b39c5158Smillert}
472b39c5158Smillert
473*e0680481Safresh1sub _match_variable
474b39c5158Smillert{
475b39c5158Smillert#  $#
476b39c5158Smillert#  $^
477b39c5158Smillert#  $$
478b39c5158Smillert    my ($textref, $pre) = @_;
479b39c5158Smillert    my $startpos = pos($$textref) = pos($$textref)||0;
480*e0680481Safresh1    unless ($$textref =~ m/$pre/gc)
481b39c5158Smillert    {
482b39c5158Smillert        _failmsg "Did not find prefix: /$pre/", pos $$textref;
483b39c5158Smillert        return;
484b39c5158Smillert    }
485b39c5158Smillert    my $varpos = pos($$textref);
486b39c5158Smillert    unless ($$textref =~ m{\G\$\s*(?!::)(\d+|[][&`'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci)
487b39c5158Smillert    {
488b39c5158Smillert        unless ($$textref =~ m/\G((\$#?|[*\@\%]|\\&)+)/gc)
489b39c5158Smillert        {
490b39c5158Smillert            _failmsg "Did not find leading dereferencer", pos $$textref;
491b39c5158Smillert            pos $$textref = $startpos;
492b39c5158Smillert            return;
493b39c5158Smillert        }
494b39c5158Smillert        my $deref = $1;
495b39c5158Smillert
496b39c5158Smillert        unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci
497*e0680481Safresh1            or _match_codeblock($textref, qr/\G()/, '\{', qr/\G\s*(\})/, '\{', '\}', 0, 1)
498*e0680481Safresh1            or $deref eq '$#' or $deref eq '$$'
499*e0680481Safresh1            or pos($$textref) == length $$textref )
500b39c5158Smillert        {
501b39c5158Smillert            _failmsg "Bad identifier after dereferencer", pos $$textref;
502b39c5158Smillert            pos $$textref = $startpos;
503b39c5158Smillert            return;
504b39c5158Smillert        }
505b39c5158Smillert    }
506b39c5158Smillert
507b39c5158Smillert    while (1)
508b39c5158Smillert    {
509b39c5158Smillert        next if $$textref =~ m/\G\s*(?:->)?\s*[{]\w+[}]/gc;
510b39c5158Smillert        next if _match_codeblock($textref,
511*e0680481Safresh1                                 qr/\G\s*->\s*(?:[_a-zA-Z]\w+\s*)?/,
512*e0680481Safresh1                                 qr/[({[]/, qr/\G\s*([)}\]])/,
513*e0680481Safresh1                                 qr/[({[]/, qr/[)}\]]/, 0, 1);
514b39c5158Smillert        next if _match_codeblock($textref,
515*e0680481Safresh1                                 qr/\G\s*/, qr/[{[]/, qr/\G\s*([}\]])/,
516*e0680481Safresh1                                 qr/[{[]/, qr/[}\]]/, 0, 1);
517*e0680481Safresh1        next if _match_variable($textref,qr/\G\s*->\s*/);
518b39c5158Smillert        next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc;
519b39c5158Smillert        last;
520b39c5158Smillert    }
521*e0680481Safresh1    $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 0;
522b39c5158Smillert
523b39c5158Smillert    my $endpos = pos($$textref);
524b39c5158Smillert    return ($startpos, $varpos-$startpos,
525b39c5158Smillert            $varpos,   $endpos-$varpos,
526b39c5158Smillert            $endpos,   length($$textref)-$endpos
527b39c5158Smillert    );
528b39c5158Smillert}
529b39c5158Smillert
530*e0680481Safresh1my %ec_delim_cache;
531*e0680481Safresh1sub _ec_delims {
532*e0680481Safresh1    my ($ldel_inner, $ldel_outer) = @_;
533*e0680481Safresh1    return @{ $ec_delim_cache{$ldel_outer}{$ldel_inner} }
534*e0680481Safresh1        if $ec_delim_cache{$ldel_outer}{$ldel_inner};
535b39c5158Smillert    my $rdel_inner = $ldel_inner;
536b39c5158Smillert    my $rdel_outer = $ldel_outer;
537b39c5158Smillert    my $posbug = pos;
538b39c5158Smillert    for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds }
539b39c5158Smillert    for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds }
540b39c5158Smillert    for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer)
541b39c5158Smillert    {
542b39c5158Smillert        $_ = '('.join('|',map { quotemeta $_ } split('',$_)).')'
543b39c5158Smillert    }
544b39c5158Smillert    pos = $posbug;
545*e0680481Safresh1    @{ $ec_delim_cache{$ldel_outer}{$ldel_inner} = [
546*e0680481Safresh1        $ldel_outer, qr/\G\s*($rdel_outer)/, $ldel_inner, $rdel_inner
547*e0680481Safresh1    ] };
548*e0680481Safresh1}
549*e0680481Safresh1sub extract_codeblock (;$$$$$)
550*e0680481Safresh1{
551*e0680481Safresh1    my $textref = defined $_[0] ? \$_[0] : \$_;
552*e0680481Safresh1    $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0 if !pos($$textref); # reset
553*e0680481Safresh1    my $wantarray = wantarray;
554*e0680481Safresh1    my $ldel_inner = defined $_[1] ? $_[1] : '{';
555*e0680481Safresh1    my $pre = !defined $_[2] ? qr/\G\s*/ : qr/\G$_[2]/;
556*e0680481Safresh1    my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner;
557*e0680481Safresh1    my $rd         = $_[4];
558*e0680481Safresh1    my @delims = _ec_delims($ldel_inner, $ldel_outer);
559b39c5158Smillert
560*e0680481Safresh1    my @match = _match_codeblock($textref, $pre, @delims, $rd, 1);
561b39c5158Smillert    return _fail($wantarray, $textref) unless @match;
562b39c5158Smillert    return _succeed($wantarray, $textref,
563b39c5158Smillert                    @match[2..3,4..5,0..1]    # MATCH, REMAINDER, PREFIX
564b39c5158Smillert    );
565b39c5158Smillert}
566b39c5158Smillert
567*e0680481Safresh1sub _match_codeblock
568b39c5158Smillert{
569*e0680481Safresh1    my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd, $no_backcompat) = @_;
570*e0680481Safresh1    $rdel_outer = qr/\G\s*($rdel_outer)/ if !$no_backcompat; # Switch calls this func directly
571b39c5158Smillert    my $startpos = pos($$textref) = pos($$textref) || 0;
572*e0680481Safresh1    unless ($$textref =~ m/$pre/gc)
573b39c5158Smillert    {
574b39c5158Smillert        _failmsg qq{Did not match prefix /$pre/ at"} .
575b39c5158Smillert                     substr($$textref,pos($$textref),20) .
576b39c5158Smillert                     q{..."},
577b39c5158Smillert                 pos $$textref;
578b39c5158Smillert        return;
579b39c5158Smillert    }
580b39c5158Smillert    my $codepos = pos($$textref);
581b39c5158Smillert    unless ($$textref =~ m/\G($ldel_outer)/gc)  # OUTERMOST DELIMITER
582b39c5158Smillert    {
583b39c5158Smillert        _failmsg qq{Did not find expected opening bracket at "} .
584b39c5158Smillert                     substr($$textref,pos($$textref),20) .
585b39c5158Smillert                     q{..."},
586b39c5158Smillert                 pos $$textref;
587b39c5158Smillert        pos $$textref = $startpos;
588b39c5158Smillert        return;
589b39c5158Smillert    }
590b39c5158Smillert    my $closing = $1;
591b39c5158Smillert       $closing =~ tr/([<{/)]>}/;
592b39c5158Smillert    my $matched;
593*e0680481Safresh1    $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0
594*e0680481Safresh1      if !pos($$textref) or !defined $ref2slashvalid{$textref}; # default, or reset
595b39c5158Smillert    while (pos($$textref) < length($$textref))
596b39c5158Smillert    {
597b39c5158Smillert        if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc)
598b39c5158Smillert        {
599*e0680481Safresh1            $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 0;
600b39c5158Smillert            next;
601b39c5158Smillert        }
602b39c5158Smillert
603b39c5158Smillert        if ($$textref =~ m/\G\s*#.*/gc)
604b39c5158Smillert        {
605b39c5158Smillert            next;
606b39c5158Smillert        }
607b39c5158Smillert
608*e0680481Safresh1        if ($$textref =~ m/$rdel_outer/gc)
609b39c5158Smillert        {
610b39c5158Smillert            unless ($matched = ($closing && $1 eq $closing) )
611b39c5158Smillert            {
612b39c5158Smillert                next if $1 eq '>';      # MIGHT BE A "LESS THAN"
613b39c5158Smillert                _failmsg q{Mismatched closing bracket at "} .
614b39c5158Smillert                             substr($$textref,pos($$textref),20) .
615b39c5158Smillert                             qq{...". Expected '$closing'},
616b39c5158Smillert                         pos $$textref;
617b39c5158Smillert            }
618b39c5158Smillert            last;
619b39c5158Smillert        }
620b39c5158Smillert
621*e0680481Safresh1        if (_match_variable($textref,qr/\G\s*/) ||
622*e0680481Safresh1            _match_quotelike($textref,qr/\G\s*/,$ref2slashvalid{$textref},$ref2qmarkvalid{$textref}) )
623b39c5158Smillert        {
624*e0680481Safresh1            $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 0;
625b39c5158Smillert            next;
626b39c5158Smillert        }
627b39c5158Smillert
628*e0680481Safresh1        if ($$textref =~ m#\G\s*(?!$ldel_inner)(?:$RE_PREREGEX_PAT|$RE_EXPR_PAT)#gc)
629b39c5158Smillert        {
630*e0680481Safresh1            $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 1;
631b39c5158Smillert            next;
632b39c5158Smillert        }
633b39c5158Smillert
634*e0680481Safresh1        if ( _match_codeblock($textref, qr/\G\s*/, $ldel_inner, qr/\G\s*($rdel_inner)/, $ldel_inner, $rdel_inner, $rd, 1) )
635b39c5158Smillert        {
636*e0680481Safresh1            $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 1;
637b39c5158Smillert            next;
638b39c5158Smillert        }
639b39c5158Smillert
640b39c5158Smillert        if ($$textref =~ m/\G\s*$ldel_outer/gc)
641b39c5158Smillert        {
642b39c5158Smillert            _failmsg q{Improperly nested codeblock at "} .
643b39c5158Smillert                         substr($$textref,pos($$textref),20) .
644b39c5158Smillert                         q{..."},
645b39c5158Smillert                     pos $$textref;
646b39c5158Smillert            last;
647b39c5158Smillert        }
648b39c5158Smillert
649*e0680481Safresh1        $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 0;
650b39c5158Smillert        $$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc;
651b39c5158Smillert    }
652b39c5158Smillert    continue { $@ = undef }
653b39c5158Smillert
654b39c5158Smillert    unless ($matched)
655b39c5158Smillert    {
656b39c5158Smillert        _failmsg 'No match found for opening bracket', pos $$textref
657b39c5158Smillert                unless $@;
658b39c5158Smillert        return;
659b39c5158Smillert    }
660b39c5158Smillert
661*e0680481Safresh1    $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = undef;
662b39c5158Smillert    my $endpos = pos($$textref);
663b39c5158Smillert    return ( $startpos, $codepos-$startpos,
664b39c5158Smillert             $codepos, $endpos-$codepos,
665b39c5158Smillert             $endpos,  length($$textref)-$endpos,
666b39c5158Smillert    );
667b39c5158Smillert}
668b39c5158Smillert
669b39c5158Smillert
670b39c5158Smillertmy %mods   = (
671b39c5158Smillert    'none' => '[cgimsox]*',
672b39c5158Smillert    'm'    => '[cgimsox]*',
673b39c5158Smillert    's'    => '[cegimsox]*',
674b39c5158Smillert    'tr'   => '[cds]*',
675b39c5158Smillert    'y'    => '[cds]*',
676b39c5158Smillert    'qq'   => '',
677b39c5158Smillert    'qx'   => '',
678b39c5158Smillert    'qw'   => '',
679b39c5158Smillert    'qr'   => '[imsx]*',
680b39c5158Smillert    'q'    => '',
681b39c5158Smillert);
682b39c5158Smillert
683b39c5158Smillertsub extract_quotelike (;$$)
684b39c5158Smillert{
685b39c5158Smillert    my $textref = $_[0] ? \$_[0] : \$_;
686*e0680481Safresh1    $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0 if !pos($$textref); # reset
687b39c5158Smillert    my $wantarray = wantarray;
688*e0680481Safresh1    my $pre  = defined $_[1] ? qr/\G$_[1]/ : qr/\G\s*/;
689b39c5158Smillert
690*e0680481Safresh1    my @match = _match_quotelike($textref,$pre,$ref2slashvalid{$textref},$ref2qmarkvalid{$textref});
691b39c5158Smillert    return _fail($wantarray, $textref) unless @match;
692b39c5158Smillert    return _succeed($wantarray, $textref,
693b39c5158Smillert                    $match[2], $match[18]-$match[2],    # MATCH
694b39c5158Smillert                    @match[18,19],                      # REMAINDER
695b39c5158Smillert                    @match[0,1],                        # PREFIX
696b39c5158Smillert                    @match[2..17],                      # THE BITS
697b39c5158Smillert                    @match[20,21],                      # ANY FILLET?
698b39c5158Smillert    );
699b39c5158Smillert};
700b39c5158Smillert
701*e0680481Safresh1my %maybe_quote = map +($_=>1), qw(" ' `);
702*e0680481Safresh1sub _match_quotelike
703b39c5158Smillert{
704*e0680481Safresh1    my ($textref, $pre, $allow_slash_match, $allow_qmark_match) = @_;
705*e0680481Safresh1    $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0
706*e0680481Safresh1      if !pos($$textref) or !defined $ref2slashvalid{$textref}; # default, or reset
707b39c5158Smillert
708b39c5158Smillert    my ($textlen,$startpos,
709b39c5158Smillert        $preld1pos,$ld1pos,$str1pos,$rd1pos,
710b39c5158Smillert        $preld2pos,$ld2pos,$str2pos,$rd2pos,
711b39c5158Smillert        $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 );
712b39c5158Smillert
713*e0680481Safresh1    unless ($$textref =~ m/$pre/gc)
714b39c5158Smillert    {
715b39c5158Smillert        _failmsg qq{Did not find prefix /$pre/ at "} .
716b39c5158Smillert                     substr($$textref, pos($$textref), 20) .
717b39c5158Smillert                     q{..."},
718b39c5158Smillert                 pos $$textref;
719b39c5158Smillert        return;
720b39c5158Smillert    }
721*e0680481Safresh1    my $oppos = pos($$textref);
722b39c5158Smillert    my $initial = substr($$textref,$oppos,1);
723*e0680481Safresh1    if ($initial && $maybe_quote{$initial}
724*e0680481Safresh1                 || $allow_slash_match && $initial eq '/'
725*e0680481Safresh1                 || $allow_qmark_match && $initial eq '?')
726b39c5158Smillert    {
727*e0680481Safresh1        unless ($$textref =~ m/\G \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcsx)
728b39c5158Smillert        {
729b39c5158Smillert            _failmsg qq{Did not find closing delimiter to match '$initial' at "} .
730b39c5158Smillert                         substr($$textref, $oppos, 20) .
731b39c5158Smillert                         q{..."},
732b39c5158Smillert                     pos $$textref;
733b39c5158Smillert            pos $$textref = $startpos;
734b39c5158Smillert            return;
735b39c5158Smillert        }
736b39c5158Smillert        $modpos= pos($$textref);
737b39c5158Smillert        $rd1pos = $modpos-1;
738b39c5158Smillert
739b39c5158Smillert        if ($initial eq '/' || $initial eq '?')
740b39c5158Smillert        {
741b39c5158Smillert            $$textref =~ m/\G$mods{none}/gc
742b39c5158Smillert        }
743b39c5158Smillert
744b39c5158Smillert        my $endpos = pos($$textref);
745*e0680481Safresh1        $ref2qmarkvalid{$textref} = $ref2slashvalid{$textref} = 0;
746b39c5158Smillert        return (
747b39c5158Smillert            $startpos,  $oppos-$startpos,       # PREFIX
748b39c5158Smillert            $oppos,     0,                      # NO OPERATOR
749b39c5158Smillert            $oppos,     1,                      # LEFT DEL
750b39c5158Smillert            $oppos+1,   $rd1pos-$oppos-1,       # STR/PAT
751b39c5158Smillert            $rd1pos,    1,                      # RIGHT DEL
752b39c5158Smillert            $modpos,    0,                      # NO 2ND LDEL
753b39c5158Smillert            $modpos,    0,                      # NO 2ND STR
754b39c5158Smillert            $modpos,    0,                      # NO 2ND RDEL
755b39c5158Smillert            $modpos,    $endpos-$modpos,        # MODIFIERS
756b39c5158Smillert            $endpos,    $textlen-$endpos,       # REMAINDER
757b39c5158Smillert        );
758b39c5158Smillert    }
759b39c5158Smillert
760*e0680481Safresh1    unless ($$textref =~ m{\G(\b(?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<(?=[a-zA-Z]|\s*['"`;,]))}gc)
761b39c5158Smillert    {
762b39c5158Smillert        _failmsg q{No quotelike operator found after prefix at "} .
763b39c5158Smillert                     substr($$textref, pos($$textref), 20) .
764b39c5158Smillert                     q{..."},
765b39c5158Smillert                 pos $$textref;
766b39c5158Smillert        pos $$textref = $startpos;
767b39c5158Smillert        return;
768b39c5158Smillert    }
769b39c5158Smillert
770b39c5158Smillert    my $op = $1;
771b39c5158Smillert    $preld1pos = pos($$textref);
772b39c5158Smillert    if ($op eq '<<') {
773b39c5158Smillert        $ld1pos = pos($$textref);
774b39c5158Smillert        my $label;
775b39c5158Smillert        if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) {
776b39c5158Smillert            $label = $1;
777b39c5158Smillert        }
778b39c5158Smillert        elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\]*)*) '
779b39c5158Smillert                             | \G " ([^"\\]* (?:\\.[^"\\]*)*) "
780b39c5158Smillert                             | \G ` ([^`\\]* (?:\\.[^`\\]*)*) `
781b39c5158Smillert                             }gcsx) {
782b39c5158Smillert            $label = $+;
783b39c5158Smillert        }
784b39c5158Smillert        else {
785b39c5158Smillert            $label = "";
786b39c5158Smillert        }
787b39c5158Smillert        my $extrapos = pos($$textref);
788b39c5158Smillert        $$textref =~ m{.*\n}gc;
789b39c5158Smillert        $str1pos = pos($$textref)--;
790b39c5158Smillert        unless ($$textref =~ m{.*?\n(?=\Q$label\E\n)}gc) {
791b39c5158Smillert            _failmsg qq{Missing here doc terminator ('$label') after "} .
792b39c5158Smillert                         substr($$textref, $startpos, 20) .
793b39c5158Smillert                         q{..."},
794b39c5158Smillert                     pos $$textref;
795b39c5158Smillert            pos $$textref = $startpos;
796b39c5158Smillert            return;
797b39c5158Smillert        }
798b39c5158Smillert        $rd1pos = pos($$textref);
799b39c5158Smillert        $$textref =~ m{\Q$label\E\n}gc;
800b39c5158Smillert        $ld2pos = pos($$textref);
801*e0680481Safresh1        $ref2qmarkvalid{$textref} = $ref2slashvalid{$textref} = 0;
802b39c5158Smillert        return (
803b39c5158Smillert            $startpos,  $oppos-$startpos,       # PREFIX
804b39c5158Smillert            $oppos,     length($op),            # OPERATOR
805b39c5158Smillert            $ld1pos,    $extrapos-$ld1pos,      # LEFT DEL
806b39c5158Smillert            $str1pos,   $rd1pos-$str1pos,       # STR/PAT
807b39c5158Smillert            $rd1pos,    $ld2pos-$rd1pos,        # RIGHT DEL
808b39c5158Smillert            $ld2pos,    0,                      # NO 2ND LDEL
809b39c5158Smillert            $ld2pos,    0,                      # NO 2ND STR
810b39c5158Smillert            $ld2pos,    0,                      # NO 2ND RDEL
811b39c5158Smillert            $ld2pos,    0,                      # NO MODIFIERS
812b39c5158Smillert            $ld2pos,    $textlen-$ld2pos,       # REMAINDER
813b39c5158Smillert            $extrapos,  $str1pos-$extrapos,     # FILLETED BIT
814b39c5158Smillert        );
815b39c5158Smillert    }
816b39c5158Smillert
817b39c5158Smillert    $$textref =~ m/\G\s*/gc;
818b39c5158Smillert    $ld1pos = pos($$textref);
819b39c5158Smillert    $str1pos = $ld1pos+1;
820b39c5158Smillert
821*e0680481Safresh1    if ($$textref !~ m/\G(\S)/gc)   # SHOULD USE LOOKAHEAD
822b39c5158Smillert    {
823b39c5158Smillert        _failmsg "No block delimiter found after quotelike $op",
824b39c5158Smillert                 pos $$textref;
825b39c5158Smillert        pos $$textref = $startpos;
826b39c5158Smillert        return;
827b39c5158Smillert    }
828*e0680481Safresh1    elsif (substr($$textref, $ld1pos, 2) eq '=>')
829*e0680481Safresh1    {
830*e0680481Safresh1        _failmsg "quotelike $op was actually quoted by '=>'",
831*e0680481Safresh1                 pos $$textref;
832*e0680481Safresh1        pos $$textref = $startpos;
833*e0680481Safresh1        return;
834*e0680481Safresh1    }
835b39c5158Smillert    pos($$textref) = $ld1pos;   # HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN
836b39c5158Smillert    my ($ldel1, $rdel1) = ("\Q$1","\Q$1");
837b39c5158Smillert    if ($ldel1 =~ /[[(<{]/)
838b39c5158Smillert    {
839b39c5158Smillert        $rdel1 =~ tr/[({</])}>/;
840*e0680481Safresh1        defined(_match_bracketed($textref,qr/\G/,qr/\G($ldel1)/,"","",qr/\G($rdel1)/))
841b39c5158Smillert            || do { pos $$textref = $startpos; return };
842b39c5158Smillert        $ld2pos = pos($$textref);
843b39c5158Smillert        $rd1pos = $ld2pos-1;
844b39c5158Smillert    }
845b39c5158Smillert    else
846b39c5158Smillert    {
847b39c5158Smillert        $$textref =~ /\G$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gcs
848b39c5158Smillert            || do { pos $$textref = $startpos; return };
849b39c5158Smillert        $ld2pos = $rd1pos = pos($$textref)-1;
850b39c5158Smillert    }
851b39c5158Smillert
852b39c5158Smillert    my $second_arg = $op =~ /s|tr|y/ ? 1 : 0;
853b39c5158Smillert    if ($second_arg)
854b39c5158Smillert    {
855b39c5158Smillert        my ($ldel2, $rdel2);
856b39c5158Smillert        if ($ldel1 =~ /[[(<{]/)
857b39c5158Smillert        {
858b39c5158Smillert            unless ($$textref =~ /\G\s*(\S)/gc) # SHOULD USE LOOKAHEAD
859b39c5158Smillert            {
860b39c5158Smillert                _failmsg "Missing second block for quotelike $op",
861b39c5158Smillert                         pos $$textref;
862b39c5158Smillert                pos $$textref = $startpos;
863b39c5158Smillert                return;
864b39c5158Smillert            }
865b39c5158Smillert            $ldel2 = $rdel2 = "\Q$1";
866b39c5158Smillert            $rdel2 =~ tr/[({</])}>/;
867b39c5158Smillert        }
868b39c5158Smillert        else
869b39c5158Smillert        {
870b39c5158Smillert            $ldel2 = $rdel2 = $ldel1;
871b39c5158Smillert        }
872b39c5158Smillert        $str2pos = $ld2pos+1;
873b39c5158Smillert
874b39c5158Smillert        if ($ldel2 =~ /[[(<{]/)
875b39c5158Smillert        {
876b39c5158Smillert            pos($$textref)--;   # OVERCOME BROKEN LOOKAHEAD
877*e0680481Safresh1            defined(_match_bracketed($textref,qr/\G/,qr/\G($ldel2)/,"","",qr/\G($rdel2)/))
878b39c5158Smillert                || do { pos $$textref = $startpos; return };
879b39c5158Smillert        }
880b39c5158Smillert        else
881b39c5158Smillert        {
882b39c5158Smillert            $$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gcs
883b39c5158Smillert                || do { pos $$textref = $startpos; return };
884b39c5158Smillert        }
885b39c5158Smillert        $rd2pos = pos($$textref)-1;
886b39c5158Smillert    }
887b39c5158Smillert    else
888b39c5158Smillert    {
889b39c5158Smillert        $ld2pos = $str2pos = $rd2pos = $rd1pos;
890b39c5158Smillert    }
891b39c5158Smillert
892b39c5158Smillert    $modpos = pos $$textref;
893b39c5158Smillert
894b39c5158Smillert    $$textref =~ m/\G($mods{$op})/gc;
895b39c5158Smillert    my $endpos = pos $$textref;
896*e0680481Safresh1    $ref2qmarkvalid{$textref} = $ref2slashvalid{$textref} = undef;
897b39c5158Smillert
898b39c5158Smillert    return (
899b39c5158Smillert        $startpos,      $oppos-$startpos,       # PREFIX
900b39c5158Smillert        $oppos,         length($op),            # OPERATOR
901b39c5158Smillert        $ld1pos,        1,                      # LEFT DEL
902b39c5158Smillert        $str1pos,       $rd1pos-$str1pos,       # STR/PAT
903b39c5158Smillert        $rd1pos,        1,                      # RIGHT DEL
904b39c5158Smillert        $ld2pos,        $second_arg,            # 2ND LDEL (MAYBE)
905b39c5158Smillert        $str2pos,       $rd2pos-$str2pos,       # 2ND STR (MAYBE)
906b39c5158Smillert        $rd2pos,        $second_arg,            # 2ND RDEL (MAYBE)
907b39c5158Smillert        $modpos,        $endpos-$modpos,        # MODIFIERS
908b39c5158Smillert        $endpos,        $textlen-$endpos,       # REMAINDER
909b39c5158Smillert    );
910b39c5158Smillert}
911b39c5158Smillert
912b39c5158Smillertmy $def_func = [
913b39c5158Smillert    sub { extract_variable($_[0], '') },
914b39c5158Smillert    sub { extract_quotelike($_[0],'') },
915b39c5158Smillert    sub { extract_codeblock($_[0],'{}','') },
916b39c5158Smillert];
917*e0680481Safresh1my %ref_not_regex = map +($_=>1), qw(CODE Text::Balanced::Extractor);
918b39c5158Smillert
919*e0680481Safresh1sub _update_patvalid {
920*e0680481Safresh1    my ($textref, $text) = @_;
921*e0680481Safresh1    if ($ref2slashvalid{$textref} && $text =~ m/(?:$RE_NUM|[\)\]])\s*$/)
922*e0680481Safresh1    {
923*e0680481Safresh1        $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 0;
924*e0680481Safresh1    } elsif (!$ref2slashvalid{$textref} && $text =~ m/$RE_PREREGEX_PAT\s*$/)
925*e0680481Safresh1    {
926*e0680481Safresh1        $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 1;
927*e0680481Safresh1    } elsif (!$ref2slashvalid{$textref} && $text =~ m/$RE_EXPR_PAT\s*$/)
928*e0680481Safresh1    {
929*e0680481Safresh1        $ref2slashvalid{$textref} = 1;
930*e0680481Safresh1        $ref2qmarkvalid{$textref} = 0;
931*e0680481Safresh1    }
932*e0680481Safresh1}
933b39c5158Smillertsub extract_multiple (;$$$$)    # ($text, $functions_ref, $max_fields, $ignoreunknown)
934b39c5158Smillert{
935b39c5158Smillert    my $textref = defined($_[0]) ? \$_[0] : \$_;
936*e0680481Safresh1    $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0 if !pos($$textref); # reset
937b39c5158Smillert    my $posbug = pos;
938b39c5158Smillert    my ($lastpos, $firstpos);
939b39c5158Smillert    my @fields = ();
940b39c5158Smillert
941b39c5158Smillert    #for ($$textref)
942b39c5158Smillert    {
943b39c5158Smillert        my @func = defined $_[1] ? @{$_[1]} : @{$def_func};
944b39c5158Smillert        my $max  = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000;
945b39c5158Smillert        my $igunk = $_[3];
946b39c5158Smillert
947b39c5158Smillert        pos $$textref ||= 0;
948b39c5158Smillert
949b39c5158Smillert        unless (wantarray)
950b39c5158Smillert        {
951b39c5158Smillert            use Carp;
952b39c5158Smillert            carp "extract_multiple reset maximal count to 1 in scalar context"
953b39c5158Smillert                    if $^W && defined($_[2]) && $max > 1;
954b39c5158Smillert            $max = 1
955b39c5158Smillert        }
956b39c5158Smillert
957b39c5158Smillert        my @class;
958eac174f2Safresh1        foreach my $func ( @func )
959b39c5158Smillert        {
960b39c5158Smillert            push @class, undef;
961*e0680481Safresh1            ($class[-1], $func) = %$func if ref($func) eq 'HASH';
962*e0680481Safresh1            $func = qr/\G$func/ if !$ref_not_regex{ref $func};
963b39c5158Smillert        }
964b39c5158Smillert
965*e0680481Safresh1        my $unkpos;
966b39c5158Smillert        FIELD: while (pos($$textref) < length($$textref))
967b39c5158Smillert        {
968b39c5158Smillert            foreach my $i ( 0..$#func )
969b39c5158Smillert            {
970*e0680481Safresh1                my ($field, $pref);
971*e0680481Safresh1                my ($class, $func) = ($class[$i], $func[$i]);
972b39c5158Smillert                $lastpos = pos $$textref;
973b39c5158Smillert                if (ref($func) eq 'CODE')
974*e0680481Safresh1                    { ($field,undef,$pref) = $func->($$textref) }
975b39c5158Smillert                elsif (ref($func) eq 'Text::Balanced::Extractor')
976*e0680481Safresh1                    { $field = $func->extract($$textref) }
977*e0680481Safresh1                elsif( $$textref =~ m/$func[$i]/gc )
978*e0680481Safresh1                    { $field = defined($1)
979b39c5158Smillert                        ? $1
980b39c5158Smillert                        : substr($$textref, $-[0], $+[0] - $-[0])
981b39c5158Smillert                    }
982b39c5158Smillert                $pref ||= "";
983b39c5158Smillert                if (defined($field) && length($field))
984b39c5158Smillert                {
985b39c5158Smillert                    if (!$igunk) {
986b39c5158Smillert                        $unkpos = $lastpos
987b39c5158Smillert                            if length($pref) && !defined($unkpos);
988b39c5158Smillert                        if (defined $unkpos)
989b39c5158Smillert                        {
990b39c5158Smillert                            push @fields, substr($$textref, $unkpos, $lastpos-$unkpos).$pref;
991b39c5158Smillert                            $firstpos = $unkpos unless defined $firstpos;
992b39c5158Smillert                            undef $unkpos;
993b39c5158Smillert                            last FIELD if @fields == $max;
994b39c5158Smillert                        }
995b39c5158Smillert                    }
996*e0680481Safresh1                    push @fields, $class ? bless(\$field, $class) : $field;
997*e0680481Safresh1                    _update_patvalid($textref, $fields[-1]);
998b39c5158Smillert                    $firstpos = $lastpos unless defined $firstpos;
999b39c5158Smillert                    $lastpos = pos $$textref;
1000b39c5158Smillert                    last FIELD if @fields == $max;
1001b39c5158Smillert                    next FIELD;
1002b39c5158Smillert                }
1003b39c5158Smillert            }
1004b39c5158Smillert            if ($$textref =~ /\G(.)/gcs)
1005b39c5158Smillert            {
1006b39c5158Smillert                $unkpos = pos($$textref)-1
1007b39c5158Smillert                    unless $igunk || defined $unkpos;
1008*e0680481Safresh1                _update_patvalid($textref, substr $$textref, $unkpos, pos($$textref)-$unkpos);
1009b39c5158Smillert            }
1010b39c5158Smillert        }
1011b39c5158Smillert
1012b39c5158Smillert        if (defined $unkpos)
1013b39c5158Smillert        {
1014b39c5158Smillert            push @fields, substr($$textref, $unkpos);
1015b39c5158Smillert            $firstpos = $unkpos unless defined $firstpos;
1016b39c5158Smillert            $lastpos = length $$textref;
1017b39c5158Smillert        }
1018b39c5158Smillert        last;
1019b39c5158Smillert    }
1020b39c5158Smillert
1021b39c5158Smillert    pos $$textref = $lastpos;
1022b39c5158Smillert    return @fields if wantarray;
1023b39c5158Smillert
1024b39c5158Smillert    $firstpos ||= 0;
1025b39c5158Smillert    eval { substr($$textref,$firstpos,$lastpos-$firstpos)="";
1026b39c5158Smillert           pos $$textref = $firstpos };
1027b39c5158Smillert    return $fields[0];
1028b39c5158Smillert}
1029b39c5158Smillert
1030b39c5158Smillertsub gen_extract_tagged # ($opentag, $closetag, $pre, \%options)
1031b39c5158Smillert{
1032b39c5158Smillert    my $ldel    = $_[0];
1033b39c5158Smillert    my $rdel    = $_[1];
1034*e0680481Safresh1    my $pre     = defined $_[2] ? qr/\G$_[2]/ : qr/\G\s*/;
1035b39c5158Smillert    my %options = defined $_[3] ? %{$_[3]} : ();
1036b39c5158Smillert    my $omode   = defined $options{fail} ? $options{fail} : '';
1037b39c5158Smillert    my $bad     = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
1038b39c5158Smillert                : defined($options{reject})        ? $options{reject}
1039b39c5158Smillert                :                                    ''
1040b39c5158Smillert                ;
1041b39c5158Smillert    my $ignore  = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
1042b39c5158Smillert                : defined($options{ignore})        ? $options{ignore}
1043b39c5158Smillert                :                                    ''
1044b39c5158Smillert                ;
1045b39c5158Smillert
1046*e0680481Safresh1    $ldel = $et_default_ldel if !defined $ldel;
1047b39c5158Smillert
1048b39c5158Smillert    my $posbug = pos;
1049*e0680481Safresh1    for ($ldel, $bad, $ignore) { $_ = qr/$_/ if $_ }
1050b39c5158Smillert    pos = $posbug;
1051b39c5158Smillert
1052b39c5158Smillert    my $closure = sub
1053b39c5158Smillert    {
1054b39c5158Smillert        my $textref = defined $_[0] ? \$_[0] : \$_;
1055*e0680481Safresh1        my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
1056b39c5158Smillert
1057b39c5158Smillert        return _fail(wantarray, $textref) unless @match;
1058b39c5158Smillert        return _succeed wantarray, $textref,
1059b39c5158Smillert                        $match[2], $match[3]+$match[5]+$match[7],   # MATCH
1060b39c5158Smillert                        @match[8..9,0..1,2..7];                     # REM, PRE, BITS
1061b39c5158Smillert    };
1062b39c5158Smillert
1063b39c5158Smillert    bless $closure, 'Text::Balanced::Extractor';
1064b39c5158Smillert}
1065b39c5158Smillert
1066b39c5158Smillertpackage Text::Balanced::Extractor;
1067b39c5158Smillert
1068b39c5158Smillertsub extract($$) # ($self, $text)
1069b39c5158Smillert{
1070b39c5158Smillert    &{$_[0]}($_[1]);
1071b39c5158Smillert}
1072b39c5158Smillert
1073b39c5158Smillertpackage Text::Balanced::ErrorMsg;
1074b39c5158Smillert
1075*e0680481Safresh1use overload
1076*e0680481Safresh1    '""' => sub { "$_[0]->{error}, detected at offset $_[0]->{pos}" },
1077*e0680481Safresh1    fallback => 1;
1078b39c5158Smillert
1079b39c5158Smillert1;
1080b39c5158Smillert
1081b39c5158Smillert__END__
1082b39c5158Smillert
1083b39c5158Smillert=pod
1084b39c5158Smillert
1085b39c5158Smillert=head1 NAME
1086b39c5158Smillert
1087b39c5158SmillertText::Balanced - Extract delimited text sequences from strings.
1088b39c5158Smillert
1089b39c5158Smillert=head1 SYNOPSIS
1090b39c5158Smillert
1091b39c5158Smillert    use Text::Balanced qw (
1092b39c5158Smillert        extract_delimited
1093b39c5158Smillert        extract_bracketed
1094b39c5158Smillert        extract_quotelike
1095b39c5158Smillert        extract_codeblock
1096b39c5158Smillert        extract_variable
1097b39c5158Smillert        extract_tagged
1098b39c5158Smillert        extract_multiple
1099b39c5158Smillert        gen_delimited_pat
1100b39c5158Smillert        gen_extract_tagged
1101b39c5158Smillert    );
1102b39c5158Smillert
1103b39c5158Smillert    # Extract the initial substring of $text that is delimited by
1104b39c5158Smillert    # two (unescaped) instances of the first character in $delim.
1105b39c5158Smillert
1106b39c5158Smillert    ($extracted, $remainder) = extract_delimited($text,$delim);
1107b39c5158Smillert
1108b39c5158Smillert    # Extract the initial substring of $text that is bracketed
1109b39c5158Smillert    # with a delimiter(s) specified by $delim (where the string
1110b39c5158Smillert    # in $delim contains one or more of '(){}[]<>').
1111b39c5158Smillert
1112b39c5158Smillert    ($extracted, $remainder) = extract_bracketed($text,$delim);
1113b39c5158Smillert
1114b39c5158Smillert    # Extract the initial substring of $text that is bounded by
1115b39c5158Smillert    # an XML tag.
1116b39c5158Smillert
1117b39c5158Smillert    ($extracted, $remainder) = extract_tagged($text);
1118b39c5158Smillert
1119b39c5158Smillert    # Extract the initial substring of $text that is bounded by
1120b39c5158Smillert    # a C<BEGIN>...C<END> pair. Don't allow nested C<BEGIN> tags
1121b39c5158Smillert
1122b39c5158Smillert    ($extracted, $remainder) =
1123b39c5158Smillert        extract_tagged($text,"BEGIN","END",undef,{bad=>["BEGIN"]});
1124b39c5158Smillert
1125b39c5158Smillert    # Extract the initial substring of $text that represents a
1126b39c5158Smillert    # Perl "quote or quote-like operation"
1127b39c5158Smillert
1128b39c5158Smillert    ($extracted, $remainder) = extract_quotelike($text);
1129b39c5158Smillert
1130b39c5158Smillert    # Extract the initial substring of $text that represents a block
1131b39c5158Smillert    # of Perl code, bracketed by any of character(s) specified by $delim
1132b39c5158Smillert    # (where the string $delim contains one or more of '(){}[]<>').
1133b39c5158Smillert
1134b39c5158Smillert    ($extracted, $remainder) = extract_codeblock($text,$delim);
1135b39c5158Smillert
1136b39c5158Smillert    # Extract the initial substrings of $text that would be extracted by
1137b39c5158Smillert    # one or more sequential applications of the specified functions
1138b39c5158Smillert    # or regular expressions
1139b39c5158Smillert
1140b39c5158Smillert    @extracted = extract_multiple($text,
1141b39c5158Smillert                                  [ \&extract_bracketed,
1142b39c5158Smillert                                    \&extract_quotelike,
1143b39c5158Smillert                                    \&some_other_extractor_sub,
1144b39c5158Smillert                                    qr/[xyz]*/,
1145b39c5158Smillert                                    'literal',
1146b39c5158Smillert                                  ]);
1147b39c5158Smillert
1148b39c5158Smillert    # Create a string representing an optimized pattern (a la Friedl)
1149b39c5158Smillert    # that matches a substring delimited by any of the specified characters
1150b39c5158Smillert    # (in this case: any type of quote or a slash)
1151b39c5158Smillert
1152b39c5158Smillert    $patstring = gen_delimited_pat(q{'"`/});
1153b39c5158Smillert
1154b39c5158Smillert    # Generate a reference to an anonymous sub that is just like extract_tagged
1155eac174f2Safresh1    # but pre-compiled and optimized for a specific pair of tags, and
1156eac174f2Safresh1    # consequently much faster (i.e. 3 times faster). It uses qr// for better
1157eac174f2Safresh1    # performance on repeated calls.
1158b39c5158Smillert
1159b39c5158Smillert    $extract_head = gen_extract_tagged('<HEAD>','</HEAD>');
1160b39c5158Smillert    ($extracted, $remainder) = $extract_head->($text);
1161b39c5158Smillert
1162b39c5158Smillert=head1 DESCRIPTION
1163b39c5158Smillert
1164b39c5158SmillertThe various C<extract_...> subroutines may be used to
1165b39c5158Smillertextract a delimited substring, possibly after skipping a
1166b39c5158Smillertspecified prefix string. By default, that prefix is
1167b39c5158Smillertoptional whitespace (C</\s*/>), but you can change it to whatever
1168b39c5158Smillertyou wish (see below).
1169b39c5158Smillert
1170b39c5158SmillertThe substring to be extracted must appear at the
1171b39c5158Smillertcurrent C<pos> location of the string's variable
1172b39c5158Smillert(or at index zero, if no C<pos> position is defined).
1173b39c5158SmillertIn other words, the C<extract_...> subroutines I<don't>
1174b39c5158Smillertextract the first occurrence of a substring anywhere
1175b39c5158Smillertin a string (like an unanchored regex would). Rather,
1176b39c5158Smillertthey extract an occurrence of the substring appearing
1177b39c5158Smillertimmediately at the current matching position in the
1178b39c5158Smillertstring (like a C<\G>-anchored regex would).
1179b39c5158Smillert
1180eac174f2Safresh1=head2 General Behaviour in List Contexts
1181b39c5158Smillert
1182b39c5158SmillertIn a list context, all the subroutines return a list, the first three
1183b39c5158Smillertelements of which are always:
1184b39c5158Smillert
1185b39c5158Smillert=over 4
1186b39c5158Smillert
1187b39c5158Smillert=item [0]
1188b39c5158Smillert
1189b39c5158SmillertThe extracted string, including the specified delimiters.
1190b39c5158SmillertIf the extraction fails C<undef> is returned.
1191b39c5158Smillert
1192b39c5158Smillert=item [1]
1193b39c5158Smillert
1194b39c5158SmillertThe remainder of the input string (i.e. the characters after the
1195b39c5158Smillertextracted string). On failure, the entire string is returned.
1196b39c5158Smillert
1197b39c5158Smillert=item [2]
1198b39c5158Smillert
1199b39c5158SmillertThe skipped prefix (i.e. the characters before the extracted string).
1200b39c5158SmillertOn failure, C<undef> is returned.
1201b39c5158Smillert
1202b39c5158Smillert=back
1203b39c5158Smillert
1204b39c5158SmillertNote that in a list context, the contents of the original input text (the first
1205b39c5158Smillertargument) are not modified in any way.
1206b39c5158Smillert
1207b39c5158SmillertHowever, if the input text was passed in a variable, that variable's
1208b39c5158SmillertC<pos> value is updated to point at the first character after the
1209b39c5158Smillertextracted text. That means that in a list context the various
1210b39c5158Smillertsubroutines can be used much like regular expressions. For example:
1211b39c5158Smillert
1212b39c5158Smillert    while ( $next = (extract_quotelike($text))[0] )
1213b39c5158Smillert    {
1214b39c5158Smillert        # process next quote-like (in $next)
1215b39c5158Smillert    }
1216b39c5158Smillert
1217eac174f2Safresh1=head2 General Behaviour in Scalar and Void Contexts
1218b39c5158Smillert
1219b39c5158SmillertIn a scalar context, the extracted string is returned, having first been
1220b39c5158Smillertremoved from the input text. Thus, the following code also processes
1221b39c5158Smillerteach quote-like operation, but actually removes them from $text:
1222b39c5158Smillert
1223b39c5158Smillert    while ( $next = extract_quotelike($text) )
1224b39c5158Smillert    {
1225b39c5158Smillert        # process next quote-like (in $next)
1226b39c5158Smillert    }
1227b39c5158Smillert
1228b39c5158SmillertNote that if the input text is a read-only string (i.e. a literal),
1229b39c5158Smillertno attempt is made to remove the extracted text.
1230b39c5158Smillert
1231b39c5158SmillertIn a void context the behaviour of the extraction subroutines is
1232b39c5158Smillertexactly the same as in a scalar context, except (of course) that the
1233b39c5158Smillertextracted substring is not returned.
1234b39c5158Smillert
1235eac174f2Safresh1=head2 A Note About Prefixes
1236b39c5158Smillert
1237b39c5158SmillertPrefix patterns are matched without any trailing modifiers (C</gimsox> etc.)
1238b39c5158SmillertThis can bite you if you're expecting a prefix specification like
1239b39c5158Smillert'.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix
1240b39c5158Smillertpattern will only succeed if the <H1> tag is on the current line, since
1241b39c5158Smillert. normally doesn't match newlines.
1242b39c5158Smillert
1243b39c5158SmillertTo overcome this limitation, you need to turn on /s matching within
1244b39c5158Smillertthe prefix pattern, using the C<(?s)> directive: '(?s).*?(?=<H1>)'
1245b39c5158Smillert
1246eac174f2Safresh1=head2 Functions
1247eac174f2Safresh1
1248eac174f2Safresh1=over 4
1249eac174f2Safresh1
1250eac174f2Safresh1=item C<extract_delimited>
1251b39c5158Smillert
1252b39c5158SmillertThe C<extract_delimited> function formalizes the common idiom
1253b39c5158Smillertof extracting a single-character-delimited substring from the start of
1254b39c5158Smillerta string. For example, to extract a single-quote delimited string, the
1255b39c5158Smillertfollowing code is typically used:
1256b39c5158Smillert
1257b39c5158Smillert    ($remainder = $text) =~ s/\A('(\\.|[^'])*')//s;
1258b39c5158Smillert    $extracted = $1;
1259b39c5158Smillert
1260b39c5158Smillertbut with C<extract_delimited> it can be simplified to:
1261b39c5158Smillert
1262b39c5158Smillert    ($extracted,$remainder) = extract_delimited($text, "'");
1263b39c5158Smillert
1264b39c5158SmillertC<extract_delimited> takes up to four scalars (the input text, the
1265b39c5158Smillertdelimiters, a prefix pattern to be skipped, and any escape characters)
1266b39c5158Smillertand extracts the initial substring of the text that
1267b39c5158Smillertis appropriately delimited. If the delimiter string has multiple
1268b39c5158Smillertcharacters, the first one encountered in the text is taken to delimit
1269b39c5158Smillertthe substring.
1270b39c5158SmillertThe third argument specifies a prefix pattern that is to be skipped
1271b39c5158Smillert(but must be present!) before the substring is extracted.
1272b39c5158SmillertThe final argument specifies the escape character to be used for each
1273b39c5158Smillertdelimiter.
1274b39c5158Smillert
1275b39c5158SmillertAll arguments are optional. If the escape characters are not specified,
1276b39c5158Smillertevery delimiter is escaped with a backslash (C<\>).
1277b39c5158SmillertIf the prefix is not specified, the
1278b39c5158Smillertpattern C<'\s*'> - optional whitespace - is used. If the delimiter set
1279b39c5158Smillertis also not specified, the set C</["'`]/> is used. If the text to be processed
1280b39c5158Smillertis not specified either, C<$_> is used.
1281b39c5158Smillert
1282b39c5158SmillertIn list context, C<extract_delimited> returns a array of three
1283b39c5158Smillertelements, the extracted substring (I<including the surrounding
1284b39c5158Smillertdelimiters>), the remainder of the text, and the skipped prefix (if
1285b39c5158Smillertany). If a suitable delimited substring is not found, the first
1286b39c5158Smillertelement of the array is the empty string, the second is the complete
1287b39c5158Smillertoriginal text, and the prefix returned in the third element is an
1288b39c5158Smillertempty string.
1289b39c5158Smillert
1290b39c5158SmillertIn a scalar context, just the extracted substring is returned. In
1291b39c5158Smillerta void context, the extracted substring (and any prefix) are simply
1292b39c5158Smillertremoved from the beginning of the first argument.
1293b39c5158Smillert
1294b39c5158SmillertExamples:
1295b39c5158Smillert
1296b39c5158Smillert    # Remove a single-quoted substring from the very beginning of $text:
1297b39c5158Smillert
1298b39c5158Smillert        $substring = extract_delimited($text, "'", '');
1299b39c5158Smillert
1300b39c5158Smillert    # Remove a single-quoted Pascalish substring (i.e. one in which
1301b39c5158Smillert    # doubling the quote character escapes it) from the very
1302b39c5158Smillert    # beginning of $text:
1303b39c5158Smillert
1304b39c5158Smillert        $substring = extract_delimited($text, "'", '', "'");
1305b39c5158Smillert
1306b39c5158Smillert    # Extract a single- or double- quoted substring from the
1307b39c5158Smillert    # beginning of $text, optionally after some whitespace
1308b39c5158Smillert    # (note the list context to protect $text from modification):
1309b39c5158Smillert
1310b39c5158Smillert        ($substring) = extract_delimited $text, q{"'};
1311b39c5158Smillert
1312b39c5158Smillert    # Delete the substring delimited by the first '/' in $text:
1313b39c5158Smillert
1314b39c5158Smillert        $text = join '', (extract_delimited($text,'/','[^/]*')[2,1];
1315b39c5158Smillert
1316b39c5158SmillertNote that this last example is I<not> the same as deleting the first
1317b39c5158Smillertquote-like pattern. For instance, if C<$text> contained the string:
1318b39c5158Smillert
1319b39c5158Smillert    "if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }"
1320b39c5158Smillert
1321b39c5158Smillertthen after the deletion it would contain:
1322b39c5158Smillert
1323b39c5158Smillert    "if ('.$UNIXCMD/s) { $cmd = $1; }"
1324b39c5158Smillert
1325b39c5158Smillertnot:
1326b39c5158Smillert
1327b39c5158Smillert    "if ('./cmd' =~ ms) { $cmd = $1; }"
1328b39c5158Smillert
1329b39c5158SmillertSee L<"extract_quotelike"> for a (partial) solution to this problem.
1330b39c5158Smillert
1331eac174f2Safresh1=item C<extract_bracketed>
1332b39c5158Smillert
1333b39c5158SmillertLike C<"extract_delimited">, the C<extract_bracketed> function takes
1334b39c5158Smillertup to three optional scalar arguments: a string to extract from, a delimiter
1335b39c5158Smillertspecifier, and a prefix pattern. As before, a missing prefix defaults to
1336b39c5158Smillertoptional whitespace and a missing text defaults to C<$_>. However, a missing
1337b39c5158Smillertdelimiter specifier defaults to C<'{}()[]E<lt>E<gt>'> (see below).
1338b39c5158Smillert
1339b39c5158SmillertC<extract_bracketed> extracts a balanced-bracket-delimited
1340b39c5158Smillertsubstring (using any one (or more) of the user-specified delimiter
1341b39c5158Smillertbrackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also
1342b39c5158Smillertrespect quoted unbalanced brackets (see below).
1343b39c5158Smillert
1344b39c5158SmillertA "delimiter bracket" is a bracket in list of delimiters passed as
1345b39c5158SmillertC<extract_bracketed>'s second argument. Delimiter brackets are
1346b39c5158Smillertspecified by giving either the left or right (or both!) versions
1347b39c5158Smillertof the required bracket(s). Note that the order in which
1348b39c5158Smillerttwo or more delimiter brackets are specified is not significant.
1349b39c5158Smillert
1350b39c5158SmillertA "balanced-bracket-delimited substring" is a substring bounded by
1351b39c5158Smillertmatched brackets, such that any other (left or right) delimiter
1352b39c5158Smillertbracket I<within> the substring is also matched by an opposite
1353b39c5158Smillert(right or left) delimiter bracket I<at the same level of nesting>. Any
1354b39c5158Smillerttype of bracket not in the delimiter list is treated as an ordinary
1355b39c5158Smillertcharacter.
1356b39c5158Smillert
1357b39c5158SmillertIn other words, each type of bracket specified as a delimiter must be
1358b39c5158Smillertbalanced and correctly nested within the substring, and any other kind of
1359b39c5158Smillert("non-delimiter") bracket in the substring is ignored.
1360b39c5158Smillert
1361b39c5158SmillertFor example, given the string:
1362b39c5158Smillert
1363b39c5158Smillert    $text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }";
1364b39c5158Smillert
1365b39c5158Smillertthen a call to C<extract_bracketed> in a list context:
1366b39c5158Smillert
1367b39c5158Smillert    @result = extract_bracketed( $text, '{}' );
1368b39c5158Smillert
1369b39c5158Smillertwould return:
1370b39c5158Smillert
1371b39c5158Smillert    ( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" )
1372b39c5158Smillert
1373b39c5158Smillertsince both sets of C<'{..}'> brackets are properly nested and evenly balanced.
1374b39c5158Smillert(In a scalar context just the first element of the array would be returned. In
1375b39c5158Smillerta void context, C<$text> would be replaced by an empty string.)
1376b39c5158Smillert
1377b39c5158SmillertLikewise the call in:
1378b39c5158Smillert
1379b39c5158Smillert    @result = extract_bracketed( $text, '{[' );
1380b39c5158Smillert
1381b39c5158Smillertwould return the same result, since all sets of both types of specified
1382b39c5158Smillertdelimiter brackets are correctly nested and balanced.
1383b39c5158Smillert
1384b39c5158SmillertHowever, the call in:
1385b39c5158Smillert
1386b39c5158Smillert    @result = extract_bracketed( $text, '{([<' );
1387b39c5158Smillert
1388b39c5158Smillertwould fail, returning:
1389b39c5158Smillert
1390b39c5158Smillert    ( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }"  );
1391b39c5158Smillert
1392b39c5158Smillertbecause the embedded pairs of C<'(..)'>s and C<'[..]'>s are "cross-nested" and
1393b39c5158Smillertthe embedded C<'E<gt>'> is unbalanced. (In a scalar context, this call would
1394b39c5158Smillertreturn an empty string. In a void context, C<$text> would be unchanged.)
1395b39c5158Smillert
1396b39c5158SmillertNote that the embedded single-quotes in the string don't help in this
1397b39c5158Smillertcase, since they have not been specified as acceptable delimiters and are
1398b39c5158Smillerttherefore treated as non-delimiter characters (and ignored).
1399b39c5158Smillert
1400b39c5158SmillertHowever, if a particular species of quote character is included in the
1401b39c5158Smillertdelimiter specification, then that type of quote will be correctly handled.
1402b39c5158Smillertfor example, if C<$text> is:
1403b39c5158Smillert
1404b39c5158Smillert    $text = '<A HREF=">>>>">link</A>';
1405b39c5158Smillert
1406b39c5158Smillertthen
1407b39c5158Smillert
1408b39c5158Smillert    @result = extract_bracketed( $text, '<">' );
1409b39c5158Smillert
1410b39c5158Smillertreturns:
1411b39c5158Smillert
1412b39c5158Smillert    ( '<A HREF=">>>>">', 'link</A>', "" )
1413b39c5158Smillert
1414b39c5158Smillertas expected. Without the specification of C<"> as an embedded quoter:
1415b39c5158Smillert
1416b39c5158Smillert    @result = extract_bracketed( $text, '<>' );
1417b39c5158Smillert
1418b39c5158Smillertthe result would be:
1419b39c5158Smillert
1420b39c5158Smillert    ( '<A HREF=">', '>>>">link</A>', "" )
1421b39c5158Smillert
1422b39c5158SmillertIn addition to the quote delimiters C<'>, C<">, and C<`>, full Perl quote-like
1423b39c5158Smillertquoting (i.e. q{string}, qq{string}, etc) can be specified by including the
1424b39c5158Smillertletter 'q' as a delimiter. Hence:
1425b39c5158Smillert
1426b39c5158Smillert    @result = extract_bracketed( $text, '<q>' );
1427b39c5158Smillert
1428b39c5158Smillertwould correctly match something like this:
1429b39c5158Smillert
1430b39c5158Smillert    $text = '<leftop: conj /and/ conj>';
1431b39c5158Smillert
1432b39c5158SmillertSee also: C<"extract_quotelike"> and C<"extract_codeblock">.
1433b39c5158Smillert
1434eac174f2Safresh1=item C<extract_variable>
1435b39c5158Smillert
1436b39c5158SmillertC<extract_variable> extracts any valid Perl variable or
1437b39c5158Smillertvariable-involved expression, including scalars, arrays, hashes, array
1438b39c5158Smillertaccesses, hash look-ups, method calls through objects, subroutine calls
1439b39c5158Smillertthrough subroutine references, etc.
1440b39c5158Smillert
1441b39c5158SmillertThe subroutine takes up to two optional arguments:
1442b39c5158Smillert
1443b39c5158Smillert=over 4
1444b39c5158Smillert
1445b39c5158Smillert=item 1.
1446b39c5158Smillert
1447b39c5158SmillertA string to be processed (C<$_> if the string is omitted or C<undef>)
1448b39c5158Smillert
1449b39c5158Smillert=item 2.
1450b39c5158Smillert
1451b39c5158SmillertA string specifying a pattern to be matched as a prefix (which is to be
1452b39c5158Smillertskipped). If omitted, optional whitespace is skipped.
1453b39c5158Smillert
1454b39c5158Smillert=back
1455b39c5158Smillert
1456b39c5158SmillertOn success in a list context, an array of 3 elements is returned. The
1457b39c5158Smillertelements are:
1458b39c5158Smillert
1459b39c5158Smillert=over 4
1460b39c5158Smillert
1461b39c5158Smillert=item [0]
1462b39c5158Smillert
1463b39c5158Smillertthe extracted variable, or variablish expression
1464b39c5158Smillert
1465b39c5158Smillert=item [1]
1466b39c5158Smillert
1467b39c5158Smillertthe remainder of the input text,
1468b39c5158Smillert
1469b39c5158Smillert=item [2]
1470b39c5158Smillert
1471b39c5158Smillertthe prefix substring (if any),
1472b39c5158Smillert
1473b39c5158Smillert=back
1474b39c5158Smillert
1475b39c5158SmillertOn failure, all of these values (except the remaining text) are C<undef>.
1476b39c5158Smillert
1477b39c5158SmillertIn a scalar context, C<extract_variable> returns just the complete
1478b39c5158Smillertsubstring that matched a variablish expression. C<undef> is returned on
1479b39c5158Smillertfailure. In addition, the original input text has the returned substring
1480b39c5158Smillert(and any prefix) removed from it.
1481b39c5158Smillert
1482b39c5158SmillertIn a void context, the input text just has the matched substring (and
1483b39c5158Smillertany specified prefix) removed.
1484b39c5158Smillert
1485eac174f2Safresh1=item C<extract_tagged>
1486b39c5158Smillert
1487b39c5158SmillertC<extract_tagged> extracts and segments text between (balanced)
1488b39c5158Smillertspecified tags.
1489b39c5158Smillert
1490b39c5158SmillertThe subroutine takes up to five optional arguments:
1491b39c5158Smillert
1492b39c5158Smillert=over 4
1493b39c5158Smillert
1494b39c5158Smillert=item 1.
1495b39c5158Smillert
1496b39c5158SmillertA string to be processed (C<$_> if the string is omitted or C<undef>)
1497b39c5158Smillert
1498b39c5158Smillert=item 2.
1499b39c5158Smillert
1500*e0680481Safresh1A string specifying a pattern (i.e. regex) to be matched as the opening tag.
1501b39c5158SmillertIf the pattern string is omitted (or C<undef>) then a pattern
1502b39c5158Smillertthat matches any standard XML tag is used.
1503b39c5158Smillert
1504b39c5158Smillert=item 3.
1505b39c5158Smillert
1506b39c5158SmillertA string specifying a pattern to be matched at the closing tag.
1507b39c5158SmillertIf the pattern string is omitted (or C<undef>) then the closing
1508b39c5158Smillerttag is constructed by inserting a C</> after any leading bracket
1509b39c5158Smillertcharacters in the actual opening tag that was matched (I<not> the pattern
1510b39c5158Smillertthat matched the tag). For example, if the opening tag pattern
1511b39c5158Smillertis specified as C<'{{\w+}}'> and actually matched the opening tag
1512b39c5158SmillertC<"{{DATA}}">, then the constructed closing tag would be C<"{{/DATA}}">.
1513b39c5158Smillert
1514b39c5158Smillert=item 4.
1515b39c5158Smillert
1516b39c5158SmillertA string specifying a pattern to be matched as a prefix (which is to be
1517b39c5158Smillertskipped). If omitted, optional whitespace is skipped.
1518b39c5158Smillert
1519b39c5158Smillert=item 5.
1520b39c5158Smillert
1521b39c5158SmillertA hash reference containing various parsing options (see below)
1522b39c5158Smillert
1523b39c5158Smillert=back
1524b39c5158Smillert
1525b39c5158SmillertThe various options that can be specified are:
1526b39c5158Smillert
1527b39c5158Smillert=over 4
1528b39c5158Smillert
1529b39c5158Smillert=item C<reject =E<gt> $listref>
1530b39c5158Smillert
1531b39c5158SmillertThe list reference contains one or more strings specifying patterns
1532b39c5158Smillertthat must I<not> appear within the tagged text.
1533b39c5158Smillert
1534b39c5158SmillertFor example, to extract
1535b39c5158Smillertan HTML link (which should not contain nested links) use:
1536b39c5158Smillert
1537b39c5158Smillert        extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} );
1538b39c5158Smillert
1539b39c5158Smillert=item C<ignore =E<gt> $listref>
1540b39c5158Smillert
1541b39c5158SmillertThe list reference contains one or more strings specifying patterns
1542eac174f2Safresh1that are I<not> to be treated as nested tags within the tagged text
1543b39c5158Smillert(even if they would match the start tag pattern).
1544b39c5158Smillert
1545b39c5158SmillertFor example, to extract an arbitrary XML tag, but ignore "empty" elements:
1546b39c5158Smillert
1547b39c5158Smillert        extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} );
1548b39c5158Smillert
1549b39c5158Smillert(also see L<"gen_delimited_pat"> below).
1550b39c5158Smillert
1551b39c5158Smillert=item C<fail =E<gt> $str>
1552b39c5158Smillert
1553b39c5158SmillertThe C<fail> option indicates the action to be taken if a matching end
1554b39c5158Smillerttag is not encountered (i.e. before the end of the string or some
1555b39c5158SmillertC<reject> pattern matches). By default, a failure to match a closing
1556b39c5158Smillerttag causes C<extract_tagged> to immediately fail.
1557b39c5158Smillert
1558b39c5158SmillertHowever, if the string value associated with <reject> is "MAX", then
1559b39c5158SmillertC<extract_tagged> returns the complete text up to the point of failure.
1560b39c5158SmillertIf the string is "PARA", C<extract_tagged> returns only the first paragraph
1561b39c5158Smillertafter the tag (up to the first line that is either empty or contains
1562b39c5158Smillertonly whitespace characters).
15639dc91179Safresh1If the string is "", the default behaviour (i.e. failure) is reinstated.
1564b39c5158Smillert
1565b39c5158SmillertFor example, suppose the start tag "/para" introduces a paragraph, which then
1566b39c5158Smillertcontinues until the next "/endpara" tag or until another "/para" tag is
1567b39c5158Smillertencountered:
1568b39c5158Smillert
1569b39c5158Smillert        $text = "/para line 1\n\nline 3\n/para line 4";
1570b39c5158Smillert
1571b39c5158Smillert        extract_tagged($text, '/para', '/endpara', undef,
1572b39c5158Smillert                                {reject => '/para', fail => MAX );
1573b39c5158Smillert
1574b39c5158Smillert        # EXTRACTED: "/para line 1\n\nline 3\n"
1575b39c5158Smillert
1576b39c5158SmillertSuppose instead, that if no matching "/endpara" tag is found, the "/para"
1577b39c5158Smillerttag refers only to the immediately following paragraph:
1578b39c5158Smillert
1579b39c5158Smillert        $text = "/para line 1\n\nline 3\n/para line 4";
1580b39c5158Smillert
1581b39c5158Smillert        extract_tagged($text, '/para', '/endpara', undef,
1582b39c5158Smillert                        {reject => '/para', fail => MAX );
1583b39c5158Smillert
1584b39c5158Smillert        # EXTRACTED: "/para line 1\n"
1585b39c5158Smillert
1586b39c5158SmillertNote that the specified C<fail> behaviour applies to nested tags as well.
1587b39c5158Smillert
1588b39c5158Smillert=back
1589b39c5158Smillert
1590b39c5158SmillertOn success in a list context, an array of 6 elements is returned. The elements are:
1591b39c5158Smillert
1592b39c5158Smillert=over 4
1593b39c5158Smillert
1594b39c5158Smillert=item [0]
1595b39c5158Smillert
1596b39c5158Smillertthe extracted tagged substring (including the outermost tags),
1597b39c5158Smillert
1598b39c5158Smillert=item [1]
1599b39c5158Smillert
1600b39c5158Smillertthe remainder of the input text,
1601b39c5158Smillert
1602b39c5158Smillert=item [2]
1603b39c5158Smillert
1604b39c5158Smillertthe prefix substring (if any),
1605b39c5158Smillert
1606b39c5158Smillert=item [3]
1607b39c5158Smillert
1608b39c5158Smillertthe opening tag
1609b39c5158Smillert
1610b39c5158Smillert=item [4]
1611b39c5158Smillert
1612b39c5158Smillertthe text between the opening and closing tags
1613b39c5158Smillert
1614b39c5158Smillert=item [5]
1615b39c5158Smillert
1616b39c5158Smillertthe closing tag (or "" if no closing tag was found)
1617b39c5158Smillert
1618b39c5158Smillert=back
1619b39c5158Smillert
1620b39c5158SmillertOn failure, all of these values (except the remaining text) are C<undef>.
1621b39c5158Smillert
1622b39c5158SmillertIn a scalar context, C<extract_tagged> returns just the complete
1623b39c5158Smillertsubstring that matched a tagged text (including the start and end
1624b39c5158Smillerttags). C<undef> is returned on failure. In addition, the original input
1625b39c5158Smillerttext has the returned substring (and any prefix) removed from it.
1626b39c5158Smillert
1627b39c5158SmillertIn a void context, the input text just has the matched substring (and
1628b39c5158Smillertany specified prefix) removed.
1629b39c5158Smillert
1630eac174f2Safresh1=item C<gen_extract_tagged>
1631b39c5158Smillert
1632b39c5158SmillertC<gen_extract_tagged> generates a new anonymous subroutine which
1633b39c5158Smillertextracts text between (balanced) specified tags. In other words,
1634b39c5158Smillertit generates a function identical in function to C<extract_tagged>.
1635b39c5158Smillert
1636b39c5158SmillertThe difference between C<extract_tagged> and the anonymous
1637b39c5158Smillertsubroutines generated by
1638b39c5158SmillertC<gen_extract_tagged>, is that those generated subroutines:
1639b39c5158Smillert
1640b39c5158Smillert=over 4
1641b39c5158Smillert
1642b39c5158Smillert=item *
1643b39c5158Smillert
1644b39c5158Smillertdo not have to reparse tag specification or parsing options every time
1645b39c5158Smillertthey are called (whereas C<extract_tagged> has to effectively rebuild
1646b39c5158Smillertits tag parser on every call);
1647b39c5158Smillert
1648b39c5158Smillert=item *
1649b39c5158Smillert
1650b39c5158Smillertmake use of the new qr// construct to pre-compile the regexes they use
1651b39c5158Smillert(whereas C<extract_tagged> uses standard string variable interpolation
1652b39c5158Smillertto create tag-matching patterns).
1653b39c5158Smillert
1654b39c5158Smillert=back
1655b39c5158Smillert
1656b39c5158SmillertThe subroutine takes up to four optional arguments (the same set as
1657b39c5158SmillertC<extract_tagged> except for the string to be processed). It returns
1658b39c5158Smillerta reference to a subroutine which in turn takes a single argument (the text to
1659b39c5158Smillertbe extracted from).
1660b39c5158Smillert
1661b39c5158SmillertIn other words, the implementation of C<extract_tagged> is exactly
1662b39c5158Smillertequivalent to:
1663b39c5158Smillert
1664b39c5158Smillert        sub extract_tagged
1665b39c5158Smillert        {
1666b39c5158Smillert                my $text = shift;
1667b39c5158Smillert                $extractor = gen_extract_tagged(@_);
1668b39c5158Smillert                return $extractor->($text);
1669b39c5158Smillert        }
1670b39c5158Smillert
1671eac174f2Safresh1(although C<extract_tagged> is not currently implemented that way).
1672b39c5158Smillert
1673b39c5158SmillertUsing C<gen_extract_tagged> to create extraction functions for specific tags
1674b39c5158Smillertis a good idea if those functions are going to be called more than once, since
1675b39c5158Smillerttheir performance is typically twice as good as the more general-purpose
1676b39c5158SmillertC<extract_tagged>.
1677b39c5158Smillert
1678eac174f2Safresh1=item C<extract_quotelike>
1679b39c5158Smillert
1680b39c5158SmillertC<extract_quotelike> attempts to recognize, extract, and segment any
1681b39c5158Smillertone of the various Perl quotes and quotelike operators (see
1682b39c5158SmillertL<perlop(3)>) Nested backslashed delimiters, embedded balanced bracket
1683b39c5158Smillertdelimiters (for the quotelike operators), and trailing modifiers are
1684b39c5158Smillertall caught. For example, in:
1685b39c5158Smillert
1686b39c5158Smillert        extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #'
1687b39c5158Smillert
1688b39c5158Smillert        extract_quotelike '  "You said, \"Use sed\"."  '
1689b39c5158Smillert
1690b39c5158Smillert        extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; '
1691b39c5158Smillert
1692b39c5158Smillert        extract_quotelike ' tr/\\\/\\\\/\\\//ds; '
1693b39c5158Smillert
1694b39c5158Smillertthe full Perl quotelike operations are all extracted correctly.
1695b39c5158Smillert
1696b39c5158SmillertNote too that, when using the /x modifier on a regex, any comment
1697b39c5158Smillertcontaining the current pattern delimiter will cause the regex to be
1698b39c5158Smillertimmediately terminated. In other words:
1699b39c5158Smillert
1700b39c5158Smillert        'm /
1701b39c5158Smillert                (?i)            # CASE INSENSITIVE
1702b39c5158Smillert                [a-z_]          # LEADING ALPHABETIC/UNDERSCORE
1703b39c5158Smillert                [a-z0-9]*       # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS
1704b39c5158Smillert           /x'
1705b39c5158Smillert
1706b39c5158Smillertwill be extracted as if it were:
1707b39c5158Smillert
1708b39c5158Smillert        'm /
1709b39c5158Smillert                (?i)            # CASE INSENSITIVE
1710b39c5158Smillert                [a-z_]          # LEADING ALPHABETIC/'
1711b39c5158Smillert
1712b39c5158SmillertThis behaviour is identical to that of the actual compiler.
1713b39c5158Smillert
1714b39c5158SmillertC<extract_quotelike> takes two arguments: the text to be processed and
1715b39c5158Smillerta prefix to be matched at the very beginning of the text. If no prefix
1716b39c5158Smillertis specified, optional whitespace is the default. If no text is given,
1717b39c5158SmillertC<$_> is used.
1718b39c5158Smillert
1719b39c5158SmillertIn a list context, an array of 11 elements is returned. The elements are:
1720b39c5158Smillert
1721b39c5158Smillert=over 4
1722b39c5158Smillert
1723b39c5158Smillert=item [0]
1724b39c5158Smillert
1725b39c5158Smillertthe extracted quotelike substring (including trailing modifiers),
1726b39c5158Smillert
1727b39c5158Smillert=item [1]
1728b39c5158Smillert
1729b39c5158Smillertthe remainder of the input text,
1730b39c5158Smillert
1731b39c5158Smillert=item [2]
1732b39c5158Smillert
1733b39c5158Smillertthe prefix substring (if any),
1734b39c5158Smillert
1735b39c5158Smillert=item [3]
1736b39c5158Smillert
1737b39c5158Smillertthe name of the quotelike operator (if any),
1738b39c5158Smillert
1739b39c5158Smillert=item [4]
1740b39c5158Smillert
1741b39c5158Smillertthe left delimiter of the first block of the operation,
1742b39c5158Smillert
1743b39c5158Smillert=item [5]
1744b39c5158Smillert
1745b39c5158Smillertthe text of the first block of the operation
1746b39c5158Smillert(that is, the contents of
1747b39c5158Smillerta quote, the regex of a match or substitution or the target list of a
1748b39c5158Smillerttranslation),
1749b39c5158Smillert
1750b39c5158Smillert=item [6]
1751b39c5158Smillert
1752b39c5158Smillertthe right delimiter of the first block of the operation,
1753b39c5158Smillert
1754b39c5158Smillert=item [7]
1755b39c5158Smillert
1756b39c5158Smillertthe left delimiter of the second block of the operation
1757b39c5158Smillert(that is, if it is a C<s>, C<tr>, or C<y>),
1758b39c5158Smillert
1759b39c5158Smillert=item [8]
1760b39c5158Smillert
1761b39c5158Smillertthe text of the second block of the operation
1762b39c5158Smillert(that is, the replacement of a substitution or the translation list
1763b39c5158Smillertof a translation),
1764b39c5158Smillert
1765b39c5158Smillert=item [9]
1766b39c5158Smillert
1767b39c5158Smillertthe right delimiter of the second block of the operation (if any),
1768b39c5158Smillert
1769b39c5158Smillert=item [10]
1770b39c5158Smillert
1771b39c5158Smillertthe trailing modifiers on the operation (if any).
1772b39c5158Smillert
1773b39c5158Smillert=back
1774b39c5158Smillert
1775b39c5158SmillertFor each of the fields marked "(if any)" the default value on success is
1776b39c5158Smillertan empty string.
1777b39c5158SmillertOn failure, all of these values (except the remaining text) are C<undef>.
1778b39c5158Smillert
1779b39c5158SmillertIn a scalar context, C<extract_quotelike> returns just the complete substring
1780b39c5158Smillertthat matched a quotelike operation (or C<undef> on failure). In a scalar or
1781b39c5158Smillertvoid context, the input text has the same substring (and any specified
1782b39c5158Smillertprefix) removed.
1783b39c5158Smillert
1784b39c5158SmillertExamples:
1785b39c5158Smillert
1786b39c5158Smillert        # Remove the first quotelike literal that appears in text
1787b39c5158Smillert
1788b39c5158Smillert                $quotelike = extract_quotelike($text,'.*?');
1789b39c5158Smillert
1790b39c5158Smillert        # Replace one or more leading whitespace-separated quotelike
1791b39c5158Smillert        # literals in $_ with "<QLL>"
1792b39c5158Smillert
1793b39c5158Smillert                do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@;
1794b39c5158Smillert
1795b39c5158Smillert
1796b39c5158Smillert        # Isolate the search pattern in a quotelike operation from $text
1797b39c5158Smillert
1798b39c5158Smillert                ($op,$pat) = (extract_quotelike $text)[3,5];
1799b39c5158Smillert                if ($op =~ /[ms]/)
1800b39c5158Smillert                {
1801b39c5158Smillert                        print "search pattern: $pat\n";
1802b39c5158Smillert                }
1803b39c5158Smillert                else
1804b39c5158Smillert                {
1805b39c5158Smillert                        print "$op is not a pattern matching operation\n";
1806b39c5158Smillert                }
1807b39c5158Smillert
1808eac174f2Safresh1=item C<extract_quotelike>
1809b39c5158Smillert
1810b39c5158SmillertC<extract_quotelike> can successfully extract "here documents" from an input
1811b39c5158Smillertstring, but with an important caveat in list contexts.
1812b39c5158Smillert
1813b39c5158SmillertUnlike other types of quote-like literals, a here document is rarely
1814b39c5158Smillerta contiguous substring. For example, a typical piece of code using
1815b39c5158Smillerthere document might look like this:
1816b39c5158Smillert
1817b39c5158Smillert        <<'EOMSG' || die;
1818b39c5158Smillert        This is the message.
1819b39c5158Smillert        EOMSG
1820b39c5158Smillert        exit;
1821b39c5158Smillert
1822b39c5158SmillertGiven this as an input string in a scalar context, C<extract_quotelike>
1823b39c5158Smillertwould correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG",
1824b39c5158Smillertleaving the string " || die;\nexit;" in the original variable. In other words,
1825b39c5158Smillertthe two separate pieces of the here document are successfully extracted and
1826b39c5158Smillertconcatenated.
1827b39c5158Smillert
1828b39c5158SmillertIn a list context, C<extract_quotelike> would return the list
1829b39c5158Smillert
1830b39c5158Smillert=over 4
1831b39c5158Smillert
1832b39c5158Smillert=item [0]
1833b39c5158Smillert
1834b39c5158Smillert"<<'EOMSG'\nThis is the message.\nEOMSG\n" (i.e. the full extracted here document,
1835b39c5158Smillertincluding fore and aft delimiters),
1836b39c5158Smillert
1837b39c5158Smillert=item [1]
1838b39c5158Smillert
1839b39c5158Smillert" || die;\nexit;" (i.e. the remainder of the input text, concatenated),
1840b39c5158Smillert
1841b39c5158Smillert=item [2]
1842b39c5158Smillert
1843b39c5158Smillert"" (i.e. the prefix substring -- trivial in this case),
1844b39c5158Smillert
1845b39c5158Smillert=item [3]
1846b39c5158Smillert
1847b39c5158Smillert"<<" (i.e. the "name" of the quotelike operator)
1848b39c5158Smillert
1849b39c5158Smillert=item [4]
1850b39c5158Smillert
1851b39c5158Smillert"'EOMSG'" (i.e. the left delimiter of the here document, including any quotes),
1852b39c5158Smillert
1853b39c5158Smillert=item [5]
1854b39c5158Smillert
1855b39c5158Smillert"This is the message.\n" (i.e. the text of the here document),
1856b39c5158Smillert
1857b39c5158Smillert=item [6]
1858b39c5158Smillert
1859b39c5158Smillert"EOMSG" (i.e. the right delimiter of the here document),
1860b39c5158Smillert
1861b39c5158Smillert=item [7..10]
1862b39c5158Smillert
1863b39c5158Smillert"" (a here document has no second left delimiter, second text, second right
1864b39c5158Smillertdelimiter, or trailing modifiers).
1865b39c5158Smillert
1866b39c5158Smillert=back
1867b39c5158Smillert
1868b39c5158SmillertHowever, the matching position of the input variable would be set to
1869b39c5158Smillert"exit;" (i.e. I<after> the closing delimiter of the here document),
1870b39c5158Smillertwhich would cause the earlier " || die;\nexit;" to be skipped in any
1871b39c5158Smillertsequence of code fragment extractions.
1872b39c5158Smillert
1873b39c5158SmillertTo avoid this problem, when it encounters a here document whilst
1874b39c5158Smillertextracting from a modifiable string, C<extract_quotelike> silently
1875b39c5158Smillertrearranges the string to an equivalent piece of Perl:
1876b39c5158Smillert
1877b39c5158Smillert        <<'EOMSG'
1878b39c5158Smillert        This is the message.
1879b39c5158Smillert        EOMSG
1880b39c5158Smillert        || die;
1881b39c5158Smillert        exit;
1882b39c5158Smillert
1883b39c5158Smillertin which the here document I<is> contiguous. It still leaves the
1884b39c5158Smillertmatching position after the here document, but now the rest of the line
1885b39c5158Smillerton which the here document starts is not skipped.
1886b39c5158Smillert
1887b39c5158SmillertTo prevent <extract_quotelike> from mucking about with the input in this way
1888b39c5158Smillert(this is the only case where a list-context C<extract_quotelike> does so),
1889b39c5158Smillertyou can pass the input variable as an interpolated literal:
1890b39c5158Smillert
1891b39c5158Smillert        $quotelike = extract_quotelike("$var");
1892b39c5158Smillert
1893eac174f2Safresh1=item C<extract_codeblock>
1894b39c5158Smillert
1895b39c5158SmillertC<extract_codeblock> attempts to recognize and extract a balanced
1896b39c5158Smillertbracket delimited substring that may contain unbalanced brackets
1897b39c5158Smillertinside Perl quotes or quotelike operations. That is, C<extract_codeblock>
1898b39c5158Smillertis like a combination of C<"extract_bracketed"> and
1899b39c5158SmillertC<"extract_quotelike">.
1900b39c5158Smillert
1901b39c5158SmillertC<extract_codeblock> takes the same initial three parameters as C<extract_bracketed>:
1902b39c5158Smillerta text to process, a set of delimiter brackets to look for, and a prefix to
1903b39c5158Smillertmatch first. It also takes an optional fourth parameter, which allows the
1904*e0680481Safresh1outermost delimiter brackets to be specified separately (see below),
1905*e0680481Safresh1and a fifth parameter used only by L<Parse::RecDescent>.
1906b39c5158Smillert
1907b39c5158SmillertOmitting the first argument (input text) means process C<$_> instead.
1908b39c5158SmillertOmitting the second argument (delimiter brackets) indicates that only C<'{'> is to be used.
1909b39c5158SmillertOmitting the third argument (prefix argument) implies optional whitespace at the start.
1910b39c5158SmillertOmitting the fourth argument (outermost delimiter brackets) indicates that the
1911b39c5158Smillertvalue of the second argument is to be used for the outermost delimiters.
1912b39c5158Smillert
1913b39c5158SmillertOnce the prefix and the outermost opening delimiter bracket have been
1914b39c5158Smillertrecognized, code blocks are extracted by stepping through the input text and
1915b39c5158Smillerttrying the following alternatives in sequence:
1916b39c5158Smillert
1917b39c5158Smillert=over 4
1918b39c5158Smillert
1919b39c5158Smillert=item 1.
1920b39c5158Smillert
1921b39c5158SmillertTry and match a closing delimiter bracket. If the bracket was the same
1922b39c5158Smillertspecies as the last opening bracket, return the substring to that
1923b39c5158Smillertpoint. If the bracket was mismatched, return an error.
1924b39c5158Smillert
1925b39c5158Smillert=item 2.
1926b39c5158Smillert
1927b39c5158SmillertTry to match a quote or quotelike operator. If found, call
1928b39c5158SmillertC<extract_quotelike> to eat it. If C<extract_quotelike> fails, return
1929b39c5158Smillertthe error it returned. Otherwise go back to step 1.
1930b39c5158Smillert
1931b39c5158Smillert=item 3.
1932b39c5158Smillert
1933b39c5158SmillertTry to match an opening delimiter bracket. If found, call
1934b39c5158SmillertC<extract_codeblock> recursively to eat the embedded block. If the
1935b39c5158Smillertrecursive call fails, return an error. Otherwise, go back to step 1.
1936b39c5158Smillert
1937b39c5158Smillert=item 4.
1938b39c5158Smillert
1939b39c5158SmillertUnconditionally match a bareword or any other single character, and
1940b39c5158Smillertthen go back to step 1.
1941b39c5158Smillert
1942b39c5158Smillert=back
1943b39c5158Smillert
1944b39c5158SmillertExamples:
1945b39c5158Smillert
1946b39c5158Smillert        # Find a while loop in the text
1947b39c5158Smillert
1948b39c5158Smillert                if ($text =~ s/.*?while\s*\{/{/)
1949b39c5158Smillert                {
1950b39c5158Smillert                        $loop = "while " . extract_codeblock($text);
1951b39c5158Smillert                }
1952b39c5158Smillert
1953b39c5158Smillert        # Remove the first round-bracketed list (which may include
1954b39c5158Smillert        # round- or curly-bracketed code blocks or quotelike operators)
1955b39c5158Smillert
1956b39c5158Smillert                extract_codeblock $text, "(){}", '[^(]*';
1957b39c5158Smillert
1958b39c5158Smillert
1959b39c5158SmillertThe ability to specify a different outermost delimiter bracket is useful
1960b39c5158Smillertin some circumstances. For example, in the Parse::RecDescent module,
1961b39c5158Smillertparser actions which are to be performed only on a successful parse
1962b39c5158Smillertare specified using a C<E<lt>defer:...E<gt>> directive. For example:
1963b39c5158Smillert
1964b39c5158Smillert        sentence: subject verb object
1965b39c5158Smillert                        <defer: {$::theVerb = $item{verb}} >
1966b39c5158Smillert
1967b39c5158SmillertParse::RecDescent uses C<extract_codeblock($text, '{}E<lt>E<gt>')> to extract the code
1968b39c5158Smillertwithin the C<E<lt>defer:...E<gt>> directive, but there's a problem.
1969b39c5158Smillert
1970b39c5158SmillertA deferred action like this:
1971b39c5158Smillert
1972b39c5158Smillert                        <defer: {if ($count>10) {$count--}} >
1973b39c5158Smillert
1974b39c5158Smillertwill be incorrectly parsed as:
1975b39c5158Smillert
1976b39c5158Smillert                        <defer: {if ($count>
1977b39c5158Smillert
1978b39c5158Smillertbecause the "less than" operator is interpreted as a closing delimiter.
1979b39c5158Smillert
1980b39c5158SmillertBut, by extracting the directive using
1981b39c5158SmillertS<C<extract_codeblock($text, '{}', undef, 'E<lt>E<gt>')>>
1982b39c5158Smillertthe '>' character is only treated as a delimited at the outermost
1983b39c5158Smillertlevel of the code block, so the directive is parsed correctly.
1984b39c5158Smillert
1985eac174f2Safresh1=item C<extract_multiple>
1986b39c5158Smillert
1987b39c5158SmillertThe C<extract_multiple> subroutine takes a string to be processed and a
1988b39c5158Smillertlist of extractors (subroutines or regular expressions) to apply to that string.
1989b39c5158Smillert
1990b39c5158SmillertIn an array context C<extract_multiple> returns an array of substrings
1991b39c5158Smillertof the original string, as extracted by the specified extractors.
1992b39c5158SmillertIn a scalar context, C<extract_multiple> returns the first
1993b39c5158Smillertsubstring successfully extracted from the original string. In both
1994b39c5158Smillertscalar and void contexts the original string has the first successfully
1995b39c5158Smillertextracted substring removed from it. In all contexts
1996b39c5158SmillertC<extract_multiple> starts at the current C<pos> of the string, and
1997b39c5158Smillertsets that C<pos> appropriately after it matches.
1998b39c5158Smillert
1999eac174f2Safresh1Hence, the aim of a call to C<extract_multiple> in a list context
2000b39c5158Smillertis to split the processed string into as many non-overlapping fields as
2001b39c5158Smillertpossible, by repeatedly applying each of the specified extractors
2002b39c5158Smillertto the remainder of the string. Thus C<extract_multiple> is
2003b39c5158Smillerta generalized form of Perl's C<split> subroutine.
2004b39c5158Smillert
2005b39c5158SmillertThe subroutine takes up to four optional arguments:
2006b39c5158Smillert
2007b39c5158Smillert=over 4
2008b39c5158Smillert
2009b39c5158Smillert=item 1.
2010b39c5158Smillert
2011b39c5158SmillertA string to be processed (C<$_> if the string is omitted or C<undef>)
2012b39c5158Smillert
2013b39c5158Smillert=item 2.
2014b39c5158Smillert
2015b39c5158SmillertA reference to a list of subroutine references and/or qr// objects and/or
2016b39c5158Smillertliteral strings and/or hash references, specifying the extractors
2017b39c5158Smillertto be used to split the string. If this argument is omitted (or
2018b39c5158SmillertC<undef>) the list:
2019b39c5158Smillert
2020b39c5158Smillert        [
2021b39c5158Smillert                sub { extract_variable($_[0], '') },
2022b39c5158Smillert                sub { extract_quotelike($_[0],'') },
2023b39c5158Smillert                sub { extract_codeblock($_[0],'{}','') },
2024b39c5158Smillert        ]
2025b39c5158Smillert
2026b39c5158Smillertis used.
2027b39c5158Smillert
2028b39c5158Smillert=item 3.
2029b39c5158Smillert
2030b39c5158SmillertAn number specifying the maximum number of fields to return. If this
2031b39c5158Smillertargument is omitted (or C<undef>), split continues as long as possible.
2032b39c5158Smillert
2033b39c5158SmillertIf the third argument is I<N>, then extraction continues until I<N> fields
2034b39c5158Smillerthave been successfully extracted, or until the string has been completely
2035b39c5158Smillertprocessed.
2036b39c5158Smillert
2037b39c5158SmillertNote that in scalar and void contexts the value of this argument is
2038b39c5158Smillertautomatically reset to 1 (under C<-w>, a warning is issued if the argument
2039b39c5158Smillerthas to be reset).
2040b39c5158Smillert
2041b39c5158Smillert=item 4.
2042b39c5158Smillert
2043b39c5158SmillertA value indicating whether unmatched substrings (see below) within the
2044b39c5158Smillerttext should be skipped or returned as fields. If the value is true,
2045b39c5158Smillertsuch substrings are skipped. Otherwise, they are returned.
2046b39c5158Smillert
2047b39c5158Smillert=back
2048b39c5158Smillert
2049b39c5158SmillertThe extraction process works by applying each extractor in
2050b39c5158Smillertsequence to the text string.
2051b39c5158Smillert
2052b39c5158SmillertIf the extractor is a subroutine it is called in a list context and is
2053b39c5158Smillertexpected to return a list of a single element, namely the extracted
2054b39c5158Smillerttext. It may optionally also return two further arguments: a string
2055b39c5158Smillertrepresenting the text left after extraction (like $' for a pattern
2056b39c5158Smillertmatch), and a string representing any prefix skipped before the
2057b39c5158Smillertextraction (like $` in a pattern match). Note that this is designed
2058b39c5158Smillertto facilitate the use of other Text::Balanced subroutines with
2059b39c5158SmillertC<extract_multiple>. Note too that the value returned by an extractor
2060b39c5158Smillertsubroutine need not bear any relationship to the corresponding substring
2061b39c5158Smillertof the original text (see examples below).
2062b39c5158Smillert
2063b39c5158SmillertIf the extractor is a precompiled regular expression or a string,
2064b39c5158Smillertit is matched against the text in a scalar context with a leading
2065b39c5158Smillert'\G' and the gc modifiers enabled. The extracted value is either
2066b39c5158Smillert$1 if that variable is defined after the match, or else the
2067b39c5158Smillertcomplete match (i.e. $&).
2068b39c5158Smillert
2069b39c5158SmillertIf the extractor is a hash reference, it must contain exactly one element.
2070b39c5158SmillertThe value of that element is one of the
2071b39c5158Smillertabove extractor types (subroutine reference, regular expression, or string).
2072b39c5158SmillertThe key of that element is the name of a class into which the successful
2073b39c5158Smillertreturn value of the extractor will be blessed.
2074b39c5158Smillert
2075b39c5158SmillertIf an extractor returns a defined value, that value is immediately
2076b39c5158Smillerttreated as the next extracted field and pushed onto the list of fields.
2077b39c5158SmillertIf the extractor was specified in a hash reference, the field is also
2078b39c5158Smillertblessed into the appropriate class,
2079b39c5158Smillert
2080b39c5158SmillertIf the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is
2081b39c5158Smillertassumed to have failed to extract.
2082b39c5158SmillertIf none of the extractor subroutines succeeds, then one
2083b39c5158Smillertcharacter is extracted from the start of the text and the extraction
2084b39c5158Smillertsubroutines reapplied. Characters which are thus removed are accumulated and
2085b39c5158Smillerteventually become the next field (unless the fourth argument is true, in which
2086b39c5158Smillertcase they are discarded).
2087b39c5158Smillert
2088b39c5158SmillertFor example, the following extracts substrings that are valid Perl variables:
2089b39c5158Smillert
2090b39c5158Smillert        @fields = extract_multiple($text,
2091b39c5158Smillert                                   [ sub { extract_variable($_[0]) } ],
2092b39c5158Smillert                                   undef, 1);
2093b39c5158Smillert
2094b39c5158SmillertThis example separates a text into fields which are quote delimited,
2095b39c5158Smillertcurly bracketed, and anything else. The delimited and bracketed
2096b39c5158Smillertparts are also blessed to identify them (the "anything else" is unblessed):
2097b39c5158Smillert
2098b39c5158Smillert        @fields = extract_multiple($text,
2099b39c5158Smillert                   [
2100b39c5158Smillert                        { Delim => sub { extract_delimited($_[0],q{'"}) } },
2101b39c5158Smillert                        { Brack => sub { extract_bracketed($_[0],'{}') } },
2102b39c5158Smillert                   ]);
2103b39c5158Smillert
2104b39c5158SmillertThis call extracts the next single substring that is a valid Perl quotelike
2105b39c5158Smillertoperator (and removes it from $text):
2106b39c5158Smillert
2107b39c5158Smillert        $quotelike = extract_multiple($text,
2108b39c5158Smillert                                      [
2109b39c5158Smillert                                        sub { extract_quotelike($_[0]) },
2110b39c5158Smillert                                      ], undef, 1);
2111b39c5158Smillert
2112b39c5158SmillertFinally, here is yet another way to do comma-separated value parsing:
2113b39c5158Smillert
2114*e0680481Safresh1        $csv_text = "a,'x b',c";
2115b39c5158Smillert        @fields = extract_multiple($csv_text,
2116b39c5158Smillert                                  [
2117b39c5158Smillert                                        sub { extract_delimited($_[0],q{'"}) },
2118*e0680481Safresh1                                        qr/([^,]+)/,
2119b39c5158Smillert                                  ],
2120b39c5158Smillert                                  undef,1);
2121*e0680481Safresh1        # @fields is now ('a', "'x b'", 'c')
2122b39c5158Smillert
2123b39c5158SmillertThe list in the second argument means:
2124b39c5158SmillertI<"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...">.
2125b39c5158SmillertThe undef third argument means:
2126b39c5158SmillertI<"...as many times as possible...">,
2127b39c5158Smillertand the true value in the fourth argument means
2128b39c5158SmillertI<"...discarding anything else that appears (i.e. the commas)">.
2129b39c5158Smillert
2130b39c5158SmillertIf you wanted the commas preserved as separate fields (i.e. like split
2131b39c5158Smillertdoes if your split pattern has capturing parentheses), you would
2132b39c5158Smillertjust make the last parameter undefined (or remove it).
2133b39c5158Smillert
2134eac174f2Safresh1=item C<gen_delimited_pat>
2135b39c5158Smillert
2136b39c5158SmillertThe C<gen_delimited_pat> subroutine takes a single (string) argument and
2137*e0680481Safresh1builds a Friedl-style optimized regex that matches a string delimited
2138b39c5158Smillertby any one of the characters in the single argument. For example:
2139b39c5158Smillert
2140b39c5158Smillert        gen_delimited_pat(q{'"})
2141b39c5158Smillert
2142b39c5158Smillertreturns the regex:
2143b39c5158Smillert
2144b39c5158Smillert        (?:\"(?:\\\"|(?!\").)*\"|\'(?:\\\'|(?!\').)*\')
2145b39c5158Smillert
2146b39c5158SmillertNote that the specified delimiters are automatically quotemeta'd.
2147b39c5158Smillert
2148b39c5158SmillertA typical use of C<gen_delimited_pat> would be to build special purpose tags
2149b39c5158Smillertfor C<extract_tagged>. For example, to properly ignore "empty" XML elements
2150b39c5158Smillert(which might contain quoted strings):
2151b39c5158Smillert
2152b39c5158Smillert        my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>';
2153b39c5158Smillert
2154b39c5158Smillert        extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} );
2155b39c5158Smillert
2156b39c5158SmillertC<gen_delimited_pat> may also be called with an optional second argument,
2157b39c5158Smillertwhich specifies the "escape" character(s) to be used for each delimiter.
2158b39c5158SmillertFor example to match a Pascal-style string (where ' is the delimiter
2159b39c5158Smillertand '' is a literal ' within the string):
2160b39c5158Smillert
2161b39c5158Smillert        gen_delimited_pat(q{'},q{'});
2162b39c5158Smillert
2163b39c5158SmillertDifferent escape characters can be specified for different delimiters.
2164b39c5158SmillertFor example, to specify that '/' is the escape for single quotes
2165b39c5158Smillertand '%' is the escape for double quotes:
2166b39c5158Smillert
2167b39c5158Smillert        gen_delimited_pat(q{'"},q{/%});
2168b39c5158Smillert
2169b39c5158SmillertIf more delimiters than escape chars are specified, the last escape char
2170b39c5158Smillertis used for the remaining delimiters.
2171b39c5158SmillertIf no escape char is specified for a given specified delimiter, '\' is used.
2172b39c5158Smillert
2173eac174f2Safresh1=item C<delimited_pat>
2174b39c5158Smillert
2175b39c5158SmillertNote that C<gen_delimited_pat> was previously called C<delimited_pat>.
2176b39c5158SmillertThat name may still be used, but is now deprecated.
2177b39c5158Smillert
2178eac174f2Safresh1=back
2179b39c5158Smillert
2180b39c5158Smillert=head1 DIAGNOSTICS
2181b39c5158Smillert
2182b39c5158SmillertIn a list context, all the functions return C<(undef,$original_text)>
2183b39c5158Smillerton failure. In a scalar context, failure is indicated by returning C<undef>
2184b39c5158Smillert(in this case the input text is not modified in any way).
2185b39c5158Smillert
2186b39c5158SmillertIn addition, on failure in I<any> context, the C<$@> variable is set.
2187b39c5158SmillertAccessing C<$@-E<gt>{error}> returns one of the error diagnostics listed
2188b39c5158Smillertbelow.
2189b39c5158SmillertAccessing C<$@-E<gt>{pos}> returns the offset into the original string at
2190b39c5158Smillertwhich the error was detected (although not necessarily where it occurred!)
2191b39c5158SmillertPrinting C<$@> directly produces the error message, with the offset appended.
2192b39c5158SmillertOn success, the C<$@> variable is guaranteed to be C<undef>.
2193b39c5158Smillert
2194b39c5158SmillertThe available diagnostics are:
2195b39c5158Smillert
2196b39c5158Smillert=over 4
2197b39c5158Smillert
2198b39c5158Smillert=item  C<Did not find a suitable bracket: "%s">
2199b39c5158Smillert
2200b39c5158SmillertThe delimiter provided to C<extract_bracketed> was not one of
2201b39c5158SmillertC<'()[]E<lt>E<gt>{}'>.
2202b39c5158Smillert
2203b39c5158Smillert=item  C<Did not find prefix: /%s/>
2204b39c5158Smillert
2205b39c5158SmillertA non-optional prefix was specified but wasn't found at the start of the text.
2206b39c5158Smillert
2207b39c5158Smillert=item  C<Did not find opening bracket after prefix: "%s">
2208b39c5158Smillert
2209b39c5158SmillertC<extract_bracketed> or C<extract_codeblock> was expecting a
2210b39c5158Smillertparticular kind of bracket at the start of the text, and didn't find it.
2211b39c5158Smillert
2212b39c5158Smillert=item  C<No quotelike operator found after prefix: "%s">
2213b39c5158Smillert
2214b39c5158SmillertC<extract_quotelike> didn't find one of the quotelike operators C<q>,
2215b39c5158SmillertC<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> at the start of the substring
2216b39c5158Smillertit was extracting.
2217b39c5158Smillert
2218b39c5158Smillert=item  C<Unmatched closing bracket: "%c">
2219b39c5158Smillert
2220b39c5158SmillertC<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> encountered
2221b39c5158Smillerta closing bracket where none was expected.
2222b39c5158Smillert
2223b39c5158Smillert=item  C<Unmatched opening bracket(s): "%s">
2224b39c5158Smillert
2225b39c5158SmillertC<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> ran
2226b39c5158Smillertout of characters in the text before closing one or more levels of nested
2227b39c5158Smillertbrackets.
2228b39c5158Smillert
2229b39c5158Smillert=item C<Unmatched embedded quote (%s)>
2230b39c5158Smillert
2231b39c5158SmillertC<extract_bracketed> attempted to match an embedded quoted substring, but
2232b39c5158Smillertfailed to find a closing quote to match it.
2233b39c5158Smillert
2234b39c5158Smillert=item C<Did not find closing delimiter to match '%s'>
2235b39c5158Smillert
2236b39c5158SmillertC<extract_quotelike> was unable to find a closing delimiter to match the
2237b39c5158Smillertone that opened the quote-like operation.
2238b39c5158Smillert
2239b39c5158Smillert=item  C<Mismatched closing bracket: expected "%c" but found "%s">
2240b39c5158Smillert
2241b39c5158SmillertC<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> found
2242b39c5158Smillerta valid bracket delimiter, but it was the wrong species. This usually
2243b39c5158Smillertindicates a nesting error, but may indicate incorrect quoting or escaping.
2244b39c5158Smillert
2245b39c5158Smillert=item  C<No block delimiter found after quotelike "%s">
2246b39c5158Smillert
2247b39c5158SmillertC<extract_quotelike> or C<extract_codeblock> found one of the
2248b39c5158Smillertquotelike operators C<q>, C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y>
2249b39c5158Smillertwithout a suitable block after it.
2250b39c5158Smillert
2251b39c5158Smillert=item C<Did not find leading dereferencer>
2252b39c5158Smillert
2253b39c5158SmillertC<extract_variable> was expecting one of '$', '@', or '%' at the start of
2254b39c5158Smillerta variable, but didn't find any of them.
2255b39c5158Smillert
2256b39c5158Smillert=item C<Bad identifier after dereferencer>
2257b39c5158Smillert
2258b39c5158SmillertC<extract_variable> found a '$', '@', or '%' indicating a variable, but that
2259b39c5158Smillertcharacter was not followed by a legal Perl identifier.
2260b39c5158Smillert
2261b39c5158Smillert=item C<Did not find expected opening bracket at %s>
2262b39c5158Smillert
2263b39c5158SmillertC<extract_codeblock> failed to find any of the outermost opening brackets
2264b39c5158Smillertthat were specified.
2265b39c5158Smillert
2266b39c5158Smillert=item C<Improperly nested codeblock at %s>
2267b39c5158Smillert
2268b39c5158SmillertA nested code block was found that started with a delimiter that was specified
2269b39c5158Smillertas being only to be used as an outermost bracket.
2270b39c5158Smillert
2271b39c5158Smillert=item  C<Missing second block for quotelike "%s">
2272b39c5158Smillert
2273b39c5158SmillertC<extract_codeblock> or C<extract_quotelike> found one of the
2274b39c5158Smillertquotelike operators C<s>, C<tr> or C<y> followed by only one block.
2275b39c5158Smillert
2276b39c5158Smillert=item C<No match found for opening bracket>
2277b39c5158Smillert
2278b39c5158SmillertC<extract_codeblock> failed to find a closing bracket to match the outermost
2279b39c5158Smillertopening bracket.
2280b39c5158Smillert
2281b39c5158Smillert=item C<Did not find opening tag: /%s/>
2282b39c5158Smillert
2283b39c5158SmillertC<extract_tagged> did not find a suitable opening tag (after any specified
2284b39c5158Smillertprefix was removed).
2285b39c5158Smillert
2286b39c5158Smillert=item C<Unable to construct closing tag to match: /%s/>
2287b39c5158Smillert
2288b39c5158SmillertC<extract_tagged> matched the specified opening tag and tried to
2289b39c5158Smillertmodify the matched text to produce a matching closing tag (because
2290b39c5158Smillertnone was specified). It failed to generate the closing tag, almost
2291b39c5158Smillertcertainly because the opening tag did not start with a
2292b39c5158Smillertbracket of some kind.
2293b39c5158Smillert
2294b39c5158Smillert=item C<Found invalid nested tag: %s>
2295b39c5158Smillert
2296b39c5158SmillertC<extract_tagged> found a nested tag that appeared in the "reject" list
2297b39c5158Smillert(and the failure mode was not "MAX" or "PARA").
2298b39c5158Smillert
2299b39c5158Smillert=item C<Found unbalanced nested tag: %s>
2300b39c5158Smillert
2301b39c5158SmillertC<extract_tagged> found a nested opening tag that was not matched by a
2302b39c5158Smillertcorresponding nested closing tag (and the failure mode was not "MAX" or "PARA").
2303b39c5158Smillert
2304b39c5158Smillert=item C<Did not find closing tag>
2305b39c5158Smillert
2306b39c5158SmillertC<extract_tagged> reached the end of the text without finding a closing tag
2307b39c5158Smillertto match the original opening tag (and the failure mode was not
2308b39c5158Smillert"MAX" or "PARA").
2309b39c5158Smillert
2310b39c5158Smillert=back
2311b39c5158Smillert
2312eac174f2Safresh1=head1 EXPORTS
2313eac174f2Safresh1
2314eac174f2Safresh1The following symbols are, or can be, exported by this module:
2315eac174f2Safresh1
2316eac174f2Safresh1=over 4
2317eac174f2Safresh1
2318eac174f2Safresh1=item Default Exports
2319eac174f2Safresh1
2320eac174f2Safresh1I<None>.
2321eac174f2Safresh1
2322eac174f2Safresh1=item Optional Exports
2323eac174f2Safresh1
2324eac174f2Safresh1C<extract_delimited>,
2325eac174f2Safresh1C<extract_bracketed>,
2326eac174f2Safresh1C<extract_quotelike>,
2327eac174f2Safresh1C<extract_codeblock>,
2328eac174f2Safresh1C<extract_variable>,
2329eac174f2Safresh1C<extract_tagged>,
2330eac174f2Safresh1C<extract_multiple>,
2331eac174f2Safresh1C<gen_delimited_pat>,
2332eac174f2Safresh1C<gen_extract_tagged>,
2333eac174f2Safresh1C<delimited_pat>.
2334eac174f2Safresh1
2335eac174f2Safresh1=item Export Tags
2336eac174f2Safresh1
2337eac174f2Safresh1=over 4
2338eac174f2Safresh1
2339eac174f2Safresh1=item C<:ALL>
2340eac174f2Safresh1
2341eac174f2Safresh1C<extract_delimited>,
2342eac174f2Safresh1C<extract_bracketed>,
2343eac174f2Safresh1C<extract_quotelike>,
2344eac174f2Safresh1C<extract_codeblock>,
2345eac174f2Safresh1C<extract_variable>,
2346eac174f2Safresh1C<extract_tagged>,
2347eac174f2Safresh1C<extract_multiple>,
2348eac174f2Safresh1C<gen_delimited_pat>,
2349eac174f2Safresh1C<gen_extract_tagged>,
2350eac174f2Safresh1C<delimited_pat>.
2351eac174f2Safresh1
2352eac174f2Safresh1=back
2353eac174f2Safresh1
2354eac174f2Safresh1=back
2355eac174f2Safresh1
2356eac174f2Safresh1=head1 KNOWN BUGS
2357eac174f2Safresh1
2358eac174f2Safresh1See L<https://rt.cpan.org/Dist/Display.html?Status=Active&Queue=Text-Balanced>.
2359eac174f2Safresh1
2360eac174f2Safresh1=head1 FEEDBACK
2361eac174f2Safresh1
2362eac174f2Safresh1Patches, bug reports, suggestions or any other feedback is welcome.
2363eac174f2Safresh1
2364eac174f2Safresh1Patches can be sent as GitHub pull requests at
2365eac174f2Safresh1L<https://github.com/steve-m-hay/Text-Balanced/pulls>.
2366eac174f2Safresh1
2367eac174f2Safresh1Bug reports and suggestions can be made on the CPAN Request Tracker at
2368eac174f2Safresh1L<https://rt.cpan.org/Public/Bug/Report.html?Queue=Text-Balanced>.
2369eac174f2Safresh1
2370eac174f2Safresh1Currently active requests on the CPAN Request Tracker can be viewed at
2371eac174f2Safresh1L<https://rt.cpan.org/Public/Dist/Display.html?Status=Active;Queue=Text-Balanced>.
2372eac174f2Safresh1
2373eac174f2Safresh1Please test this distribution.  See CPAN Testers Reports at
2374eac174f2Safresh1L<https://www.cpantesters.org/> for details of how to get involved.
2375eac174f2Safresh1
2376eac174f2Safresh1Previous test results on CPAN Testers Reports can be viewed at
2377eac174f2Safresh1L<https://www.cpantesters.org/distro/T/Text-Balanced.html>.
2378eac174f2Safresh1
2379eac174f2Safresh1Please rate this distribution on CPAN Ratings at
2380eac174f2Safresh1L<https://cpanratings.perl.org/rate/?distribution=Text-Balanced>.
2381eac174f2Safresh1
2382eac174f2Safresh1=head1 AVAILABILITY
2383eac174f2Safresh1
2384eac174f2Safresh1The latest version of this module is available from CPAN (see
2385eac174f2Safresh1L<perlmodlib/"CPAN"> for details) at
2386eac174f2Safresh1
2387eac174f2Safresh1L<https://metacpan.org/release/Text-Balanced> or
2388eac174f2Safresh1
2389eac174f2Safresh1L<https://www.cpan.org/authors/id/S/SH/SHAY/> or
2390eac174f2Safresh1
2391eac174f2Safresh1L<https://www.cpan.org/modules/by-module/Text/>.
2392eac174f2Safresh1
2393eac174f2Safresh1The latest source code is available from GitHub at
2394eac174f2Safresh1L<https://github.com/steve-m-hay/Text-Balanced>.
2395eac174f2Safresh1
2396eac174f2Safresh1=head1 INSTALLATION
2397eac174f2Safresh1
2398eac174f2Safresh1See the F<INSTALL> file.
2399eac174f2Safresh1
2400b39c5158Smillert=head1 AUTHOR
2401b39c5158Smillert
2402eac174f2Safresh1Damian Conway E<lt>L<damian@conway.org|mailto:damian@conway.org>E<gt>.
2403b39c5158Smillert
2404eac174f2Safresh1Steve Hay E<lt>L<shay@cpan.org|mailto:shay@cpan.org>E<gt> is now maintaining
2405eac174f2Safresh1Text::Balanced as of version 2.03.
2406b39c5158Smillert
2407b39c5158Smillert=head1 COPYRIGHT
2408b39c5158Smillert
2409eac174f2Safresh1Copyright (C) 1997-2001 Damian Conway.  All rights reserved.
2410b39c5158Smillert
2411eac174f2Safresh1Copyright (C) 2009 Adam Kennedy.
2412b39c5158Smillert
2413*e0680481Safresh1Copyright (C) 2015, 2020, 2022 Steve Hay and other contributors.  All rights
2414*e0680481Safresh1reserved.
2415eac174f2Safresh1
2416eac174f2Safresh1=head1 LICENCE
2417eac174f2Safresh1
2418eac174f2Safresh1This module is free software; you can redistribute it and/or modify it under the
2419eac174f2Safresh1same terms as Perl itself, i.e. under the terms of either the GNU General Public
2420eac174f2Safresh1License or the Artistic License, as specified in the F<LICENCE> file.
2421eac174f2Safresh1
2422eac174f2Safresh1=head1 VERSION
2423eac174f2Safresh1
2424*e0680481Safresh1Version 2.06
2425eac174f2Safresh1
2426eac174f2Safresh1=head1 DATE
2427eac174f2Safresh1
2428*e0680481Safresh105 Jun 2022
2429eac174f2Safresh1
2430eac174f2Safresh1=head1 HISTORY
2431eac174f2Safresh1
2432eac174f2Safresh1See the F<Changes> file.
2433b39c5158Smillert
2434b39c5158Smillert=cut
2435