xref: /onnv-gate/usr/src/cmd/perl/5.8.4/distrib/lib/Text/Balanced.pm (revision 0:68f95e015346)
1*0Sstevel@tonic-gate# EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS.
2*0Sstevel@tonic-gate# FOR FULL DOCUMENTATION SEE Balanced.pod
3*0Sstevel@tonic-gate
4*0Sstevel@tonic-gateuse 5.005;
5*0Sstevel@tonic-gateuse strict;
6*0Sstevel@tonic-gate
7*0Sstevel@tonic-gatepackage Text::Balanced;
8*0Sstevel@tonic-gate
9*0Sstevel@tonic-gateuse Exporter;
10*0Sstevel@tonic-gateuse SelfLoader;
11*0Sstevel@tonic-gateuse vars qw { $VERSION @ISA %EXPORT_TAGS };
12*0Sstevel@tonic-gate
13*0Sstevel@tonic-gate$VERSION = '1.95';
14*0Sstevel@tonic-gate@ISA		= qw ( Exporter );
15*0Sstevel@tonic-gate
16*0Sstevel@tonic-gate%EXPORT_TAGS	= ( ALL => [ qw(
17*0Sstevel@tonic-gate				&extract_delimited
18*0Sstevel@tonic-gate				&extract_bracketed
19*0Sstevel@tonic-gate				&extract_quotelike
20*0Sstevel@tonic-gate				&extract_codeblock
21*0Sstevel@tonic-gate				&extract_variable
22*0Sstevel@tonic-gate				&extract_tagged
23*0Sstevel@tonic-gate				&extract_multiple
24*0Sstevel@tonic-gate
25*0Sstevel@tonic-gate				&gen_delimited_pat
26*0Sstevel@tonic-gate				&gen_extract_tagged
27*0Sstevel@tonic-gate
28*0Sstevel@tonic-gate				&delimited_pat
29*0Sstevel@tonic-gate			       ) ] );
30*0Sstevel@tonic-gate
31*0Sstevel@tonic-gateExporter::export_ok_tags('ALL');
32*0Sstevel@tonic-gate
33*0Sstevel@tonic-gate# PROTOTYPES
34*0Sstevel@tonic-gate
35*0Sstevel@tonic-gatesub _match_bracketed($$$$$$);
36*0Sstevel@tonic-gatesub _match_variable($$);
37*0Sstevel@tonic-gatesub _match_codeblock($$$$$$$);
38*0Sstevel@tonic-gatesub _match_quotelike($$$$);
39*0Sstevel@tonic-gate
40*0Sstevel@tonic-gate# HANDLE RETURN VALUES IN VARIOUS CONTEXTS
41*0Sstevel@tonic-gate
42*0Sstevel@tonic-gatesub _failmsg {
43*0Sstevel@tonic-gate	my ($message, $pos) = @_;
44*0Sstevel@tonic-gate	$@ = bless { error=>$message, pos=>$pos }, "Text::Balanced::ErrorMsg";
45*0Sstevel@tonic-gate}
46*0Sstevel@tonic-gate
47*0Sstevel@tonic-gatesub _fail
48*0Sstevel@tonic-gate{
49*0Sstevel@tonic-gate	my ($wantarray, $textref, $message, $pos) = @_;
50*0Sstevel@tonic-gate	_failmsg $message, $pos if $message;
51*0Sstevel@tonic-gate	return ("",$$textref,"") if $wantarray;
52*0Sstevel@tonic-gate	return undef;
53*0Sstevel@tonic-gate}
54*0Sstevel@tonic-gate
55*0Sstevel@tonic-gatesub _succeed
56*0Sstevel@tonic-gate{
57*0Sstevel@tonic-gate	$@ = undef;
58*0Sstevel@tonic-gate	my ($wantarray,$textref) = splice @_, 0, 2;
59*0Sstevel@tonic-gate	my ($extrapos, $extralen) = @_>18 ? splice(@_, -2, 2) : (0,0);
60*0Sstevel@tonic-gate	my ($startlen) = $_[5];
61*0Sstevel@tonic-gate	my $remainderpos = $_[2];
62*0Sstevel@tonic-gate	if ($wantarray)
63*0Sstevel@tonic-gate	{
64*0Sstevel@tonic-gate		my @res;
65*0Sstevel@tonic-gate		while (my ($from, $len) = splice @_, 0, 2)
66*0Sstevel@tonic-gate		{
67*0Sstevel@tonic-gate			push @res, substr($$textref,$from,$len);
68*0Sstevel@tonic-gate		}
69*0Sstevel@tonic-gate		if ($extralen) {	# CORRECT FILLET
70*0Sstevel@tonic-gate			my $extra = substr($res[0], $extrapos-$startlen, $extralen, "\n");
71*0Sstevel@tonic-gate			$res[1] = "$extra$res[1]";
72*0Sstevel@tonic-gate			eval { substr($$textref,$remainderpos,0) = $extra;
73*0Sstevel@tonic-gate			       substr($$textref,$extrapos,$extralen,"\n")} ;
74*0Sstevel@tonic-gate				#REARRANGE HERE DOC AND FILLET IF POSSIBLE
75*0Sstevel@tonic-gate			pos($$textref) = $remainderpos-$extralen+1; # RESET \G
76*0Sstevel@tonic-gate		}
77*0Sstevel@tonic-gate		else {
78*0Sstevel@tonic-gate			pos($$textref) = $remainderpos;		    # RESET \G
79*0Sstevel@tonic-gate		}
80*0Sstevel@tonic-gate		return @res;
81*0Sstevel@tonic-gate	}
82*0Sstevel@tonic-gate	else
83*0Sstevel@tonic-gate	{
84*0Sstevel@tonic-gate		my $match = substr($$textref,$_[0],$_[1]);
85*0Sstevel@tonic-gate		substr($match,$extrapos-$_[0]-$startlen,$extralen,"") if $extralen;
86*0Sstevel@tonic-gate		my $extra = $extralen
87*0Sstevel@tonic-gate			? substr($$textref, $extrapos, $extralen)."\n" : "";
88*0Sstevel@tonic-gate		eval {substr($$textref,$_[4],$_[1]+$_[5])=$extra} ;	#CHOP OUT PREFIX & MATCH, IF POSSIBLE
89*0Sstevel@tonic-gate		pos($$textref) = $_[4];				# RESET \G
90*0Sstevel@tonic-gate		return $match;
91*0Sstevel@tonic-gate	}
92*0Sstevel@tonic-gate}
93*0Sstevel@tonic-gate
94*0Sstevel@tonic-gate# BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING
95*0Sstevel@tonic-gate
96*0Sstevel@tonic-gatesub gen_delimited_pat($;$)  # ($delimiters;$escapes)
97*0Sstevel@tonic-gate{
98*0Sstevel@tonic-gate	my ($dels, $escs) = @_;
99*0Sstevel@tonic-gate	return "" unless $dels =~ /\S/;
100*0Sstevel@tonic-gate	$escs = '\\' unless $escs;
101*0Sstevel@tonic-gate	$escs .= substr($escs,-1) x (length($dels)-length($escs));
102*0Sstevel@tonic-gate	my @pat = ();
103*0Sstevel@tonic-gate	my $i;
104*0Sstevel@tonic-gate	for ($i=0; $i<length $dels; $i++)
105*0Sstevel@tonic-gate	{
106*0Sstevel@tonic-gate		my $del = quotemeta substr($dels,$i,1);
107*0Sstevel@tonic-gate		my $esc = quotemeta substr($escs,$i,1);
108*0Sstevel@tonic-gate		if ($del eq $esc)
109*0Sstevel@tonic-gate		{
110*0Sstevel@tonic-gate			push @pat, "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del";
111*0Sstevel@tonic-gate		}
112*0Sstevel@tonic-gate		else
113*0Sstevel@tonic-gate		{
114*0Sstevel@tonic-gate			push @pat, "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del";
115*0Sstevel@tonic-gate		}
116*0Sstevel@tonic-gate	}
117*0Sstevel@tonic-gate	my $pat = join '|', @pat;
118*0Sstevel@tonic-gate	return "(?:$pat)";
119*0Sstevel@tonic-gate}
120*0Sstevel@tonic-gate
121*0Sstevel@tonic-gate*delimited_pat = \&gen_delimited_pat;
122*0Sstevel@tonic-gate
123*0Sstevel@tonic-gate
124*0Sstevel@tonic-gate# THE EXTRACTION FUNCTIONS
125*0Sstevel@tonic-gate
126*0Sstevel@tonic-gatesub extract_delimited (;$$$$)
127*0Sstevel@tonic-gate{
128*0Sstevel@tonic-gate	my $textref = defined $_[0] ? \$_[0] : \$_;
129*0Sstevel@tonic-gate	my $wantarray = wantarray;
130*0Sstevel@tonic-gate	my $del  = defined $_[1] ? $_[1] : qq{\'\"\`};
131*0Sstevel@tonic-gate	my $pre  = defined $_[2] ? $_[2] : '\s*';
132*0Sstevel@tonic-gate	my $esc  = defined $_[3] ? $_[3] : qq{\\};
133*0Sstevel@tonic-gate	my $pat = gen_delimited_pat($del, $esc);
134*0Sstevel@tonic-gate	my $startpos = pos $$textref || 0;
135*0Sstevel@tonic-gate	return _fail($wantarray, $textref, "Not a delimited pattern", 0)
136*0Sstevel@tonic-gate		unless $$textref =~ m/\G($pre)($pat)/gc;
137*0Sstevel@tonic-gate	my $prelen = length($1);
138*0Sstevel@tonic-gate	my $matchpos = $startpos+$prelen;
139*0Sstevel@tonic-gate	my $endpos = pos $$textref;
140*0Sstevel@tonic-gate	return _succeed $wantarray, $textref,
141*0Sstevel@tonic-gate			$matchpos, $endpos-$matchpos,		# MATCH
142*0Sstevel@tonic-gate			$endpos,   length($$textref)-$endpos,	# REMAINDER
143*0Sstevel@tonic-gate			$startpos, $prelen;			# PREFIX
144*0Sstevel@tonic-gate}
145*0Sstevel@tonic-gate
146*0Sstevel@tonic-gatesub extract_bracketed (;$$$)
147*0Sstevel@tonic-gate{
148*0Sstevel@tonic-gate	my $textref = defined $_[0] ? \$_[0] : \$_;
149*0Sstevel@tonic-gate	my $ldel = defined $_[1] ? $_[1] : '{([<';
150*0Sstevel@tonic-gate	my $pre  = defined $_[2] ? $_[2] : '\s*';
151*0Sstevel@tonic-gate	my $wantarray = wantarray;
152*0Sstevel@tonic-gate	my $qdel = "";
153*0Sstevel@tonic-gate	my $quotelike;
154*0Sstevel@tonic-gate	$ldel =~ s/'//g and $qdel .= q{'};
155*0Sstevel@tonic-gate	$ldel =~ s/"//g and $qdel .= q{"};
156*0Sstevel@tonic-gate	$ldel =~ s/`//g and $qdel .= q{`};
157*0Sstevel@tonic-gate	$ldel =~ s/q//g and $quotelike = 1;
158*0Sstevel@tonic-gate	$ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds;
159*0Sstevel@tonic-gate	my $rdel = $ldel;
160*0Sstevel@tonic-gate	unless ($rdel =~ tr/[({</])}>/)
161*0Sstevel@tonic-gate        {
162*0Sstevel@tonic-gate		return _fail $wantarray, $textref,
163*0Sstevel@tonic-gate			     "Did not find a suitable bracket in delimiter: \"$_[1]\"",
164*0Sstevel@tonic-gate			     0;
165*0Sstevel@tonic-gate	}
166*0Sstevel@tonic-gate	my $posbug = pos;
167*0Sstevel@tonic-gate	$ldel = join('|', map { quotemeta $_ } split('', $ldel));
168*0Sstevel@tonic-gate	$rdel = join('|', map { quotemeta $_ } split('', $rdel));
169*0Sstevel@tonic-gate	pos = $posbug;
170*0Sstevel@tonic-gate
171*0Sstevel@tonic-gate	my $startpos = pos $$textref || 0;
172*0Sstevel@tonic-gate	my @match = _match_bracketed($textref,$pre, $ldel, $qdel, $quotelike, $rdel);
173*0Sstevel@tonic-gate
174*0Sstevel@tonic-gate	return _fail ($wantarray, $textref) unless @match;
175*0Sstevel@tonic-gate
176*0Sstevel@tonic-gate	return _succeed ( $wantarray, $textref,
177*0Sstevel@tonic-gate			  $match[2], $match[5]+2,	# MATCH
178*0Sstevel@tonic-gate			  @match[8,9],			# REMAINDER
179*0Sstevel@tonic-gate			  @match[0,1],			# PREFIX
180*0Sstevel@tonic-gate			);
181*0Sstevel@tonic-gate}
182*0Sstevel@tonic-gate
183*0Sstevel@tonic-gatesub _match_bracketed($$$$$$)	# $textref, $pre, $ldel, $qdel, $quotelike, $rdel
184*0Sstevel@tonic-gate{
185*0Sstevel@tonic-gate	my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_;
186*0Sstevel@tonic-gate	my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0);
187*0Sstevel@tonic-gate	unless ($$textref =~ m/\G$pre/gc)
188*0Sstevel@tonic-gate	{
189*0Sstevel@tonic-gate		_failmsg "Did not find prefix: /$pre/", $startpos;
190*0Sstevel@tonic-gate		return;
191*0Sstevel@tonic-gate	}
192*0Sstevel@tonic-gate
193*0Sstevel@tonic-gate	$ldelpos = pos $$textref;
194*0Sstevel@tonic-gate
195*0Sstevel@tonic-gate	unless ($$textref =~ m/\G($ldel)/gc)
196*0Sstevel@tonic-gate	{
197*0Sstevel@tonic-gate		_failmsg "Did not find opening bracket after prefix: \"$pre\"",
198*0Sstevel@tonic-gate		         pos $$textref;
199*0Sstevel@tonic-gate		pos $$textref = $startpos;
200*0Sstevel@tonic-gate		return;
201*0Sstevel@tonic-gate	}
202*0Sstevel@tonic-gate
203*0Sstevel@tonic-gate	my @nesting = ( $1 );
204*0Sstevel@tonic-gate	my $textlen = length $$textref;
205*0Sstevel@tonic-gate	while (pos $$textref < $textlen)
206*0Sstevel@tonic-gate	{
207*0Sstevel@tonic-gate		next if $$textref =~ m/\G\\./gcs;
208*0Sstevel@tonic-gate
209*0Sstevel@tonic-gate		if ($$textref =~ m/\G($ldel)/gc)
210*0Sstevel@tonic-gate		{
211*0Sstevel@tonic-gate			push @nesting, $1;
212*0Sstevel@tonic-gate		}
213*0Sstevel@tonic-gate		elsif ($$textref =~ m/\G($rdel)/gc)
214*0Sstevel@tonic-gate		{
215*0Sstevel@tonic-gate			my ($found, $brackettype) = ($1, $1);
216*0Sstevel@tonic-gate			if ($#nesting < 0)
217*0Sstevel@tonic-gate			{
218*0Sstevel@tonic-gate				_failmsg "Unmatched closing bracket: \"$found\"",
219*0Sstevel@tonic-gate					 pos $$textref;
220*0Sstevel@tonic-gate				pos $$textref = $startpos;
221*0Sstevel@tonic-gate			        return;
222*0Sstevel@tonic-gate			}
223*0Sstevel@tonic-gate			my $expected = pop(@nesting);
224*0Sstevel@tonic-gate			$expected =~ tr/({[</)}]>/;
225*0Sstevel@tonic-gate			if ($expected ne $brackettype)
226*0Sstevel@tonic-gate			{
227*0Sstevel@tonic-gate				_failmsg qq{Mismatched closing bracket: expected "$expected" but found "$found"},
228*0Sstevel@tonic-gate					 pos $$textref;
229*0Sstevel@tonic-gate				pos $$textref = $startpos;
230*0Sstevel@tonic-gate			        return;
231*0Sstevel@tonic-gate			}
232*0Sstevel@tonic-gate			last if $#nesting < 0;
233*0Sstevel@tonic-gate		}
234*0Sstevel@tonic-gate		elsif ($qdel && $$textref =~ m/\G([$qdel])/gc)
235*0Sstevel@tonic-gate		{
236*0Sstevel@tonic-gate			$$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gsc and next;
237*0Sstevel@tonic-gate			_failmsg "Unmatched embedded quote ($1)",
238*0Sstevel@tonic-gate				 pos $$textref;
239*0Sstevel@tonic-gate			pos $$textref = $startpos;
240*0Sstevel@tonic-gate			return;
241*0Sstevel@tonic-gate		}
242*0Sstevel@tonic-gate		elsif ($quotelike && _match_quotelike($textref,"",1,0))
243*0Sstevel@tonic-gate		{
244*0Sstevel@tonic-gate			next;
245*0Sstevel@tonic-gate		}
246*0Sstevel@tonic-gate
247*0Sstevel@tonic-gate		else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs }
248*0Sstevel@tonic-gate	}
249*0Sstevel@tonic-gate	if ($#nesting>=0)
250*0Sstevel@tonic-gate	{
251*0Sstevel@tonic-gate		_failmsg "Unmatched opening bracket(s): "
252*0Sstevel@tonic-gate				. join("..",@nesting)."..",
253*0Sstevel@tonic-gate		         pos $$textref;
254*0Sstevel@tonic-gate		pos $$textref = $startpos;
255*0Sstevel@tonic-gate		return;
256*0Sstevel@tonic-gate	}
257*0Sstevel@tonic-gate
258*0Sstevel@tonic-gate	$endpos = pos $$textref;
259*0Sstevel@tonic-gate
260*0Sstevel@tonic-gate	return (
261*0Sstevel@tonic-gate		$startpos,  $ldelpos-$startpos,		# PREFIX
262*0Sstevel@tonic-gate		$ldelpos,   1,				# OPENING BRACKET
263*0Sstevel@tonic-gate		$ldelpos+1, $endpos-$ldelpos-2,		# CONTENTS
264*0Sstevel@tonic-gate		$endpos-1,  1,				# CLOSING BRACKET
265*0Sstevel@tonic-gate		$endpos,    length($$textref)-$endpos,	# REMAINDER
266*0Sstevel@tonic-gate	       );
267*0Sstevel@tonic-gate}
268*0Sstevel@tonic-gate
269*0Sstevel@tonic-gatesub revbracket($)
270*0Sstevel@tonic-gate{
271*0Sstevel@tonic-gate	my $brack = reverse $_[0];
272*0Sstevel@tonic-gate	$brack =~ tr/[({</])}>/;
273*0Sstevel@tonic-gate	return $brack;
274*0Sstevel@tonic-gate}
275*0Sstevel@tonic-gate
276*0Sstevel@tonic-gatemy $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*};
277*0Sstevel@tonic-gate
278*0Sstevel@tonic-gatesub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options)
279*0Sstevel@tonic-gate{
280*0Sstevel@tonic-gate	my $textref = defined $_[0] ? \$_[0] : \$_;
281*0Sstevel@tonic-gate	my $ldel    = $_[1];
282*0Sstevel@tonic-gate	my $rdel    = $_[2];
283*0Sstevel@tonic-gate	my $pre     = defined $_[3] ? $_[3] : '\s*';
284*0Sstevel@tonic-gate	my %options = defined $_[4] ? %{$_[4]} : ();
285*0Sstevel@tonic-gate	my $omode   = defined $options{fail} ? $options{fail} : '';
286*0Sstevel@tonic-gate	my $bad     = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
287*0Sstevel@tonic-gate		    : defined($options{reject})	       ? $options{reject}
288*0Sstevel@tonic-gate		    :					 ''
289*0Sstevel@tonic-gate		    ;
290*0Sstevel@tonic-gate	my $ignore  = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
291*0Sstevel@tonic-gate		    : defined($options{ignore})	       ? $options{ignore}
292*0Sstevel@tonic-gate		    :					 ''
293*0Sstevel@tonic-gate		    ;
294*0Sstevel@tonic-gate
295*0Sstevel@tonic-gate	if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
296*0Sstevel@tonic-gate	$@ = undef;
297*0Sstevel@tonic-gate
298*0Sstevel@tonic-gate	my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
299*0Sstevel@tonic-gate
300*0Sstevel@tonic-gate	return _fail(wantarray, $textref) unless @match;
301*0Sstevel@tonic-gate	return _succeed wantarray, $textref,
302*0Sstevel@tonic-gate			$match[2], $match[3]+$match[5]+$match[7],	# MATCH
303*0Sstevel@tonic-gate			@match[8..9,0..1,2..7];				# REM, PRE, BITS
304*0Sstevel@tonic-gate}
305*0Sstevel@tonic-gate
306*0Sstevel@tonic-gatesub _match_tagged	# ($$$$$$$)
307*0Sstevel@tonic-gate{
308*0Sstevel@tonic-gate	my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_;
309*0Sstevel@tonic-gate	my $rdelspec;
310*0Sstevel@tonic-gate
311*0Sstevel@tonic-gate	my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 );
312*0Sstevel@tonic-gate
313*0Sstevel@tonic-gate	unless ($$textref =~ m/\G($pre)/gc)
314*0Sstevel@tonic-gate	{
315*0Sstevel@tonic-gate		_failmsg "Did not find prefix: /$pre/", pos $$textref;
316*0Sstevel@tonic-gate		goto failed;
317*0Sstevel@tonic-gate	}
318*0Sstevel@tonic-gate
319*0Sstevel@tonic-gate	$opentagpos = pos($$textref);
320*0Sstevel@tonic-gate
321*0Sstevel@tonic-gate	unless ($$textref =~ m/\G$ldel/gc)
322*0Sstevel@tonic-gate	{
323*0Sstevel@tonic-gate		_failmsg "Did not find opening tag: /$ldel/", pos $$textref;
324*0Sstevel@tonic-gate		goto failed;
325*0Sstevel@tonic-gate	}
326*0Sstevel@tonic-gate
327*0Sstevel@tonic-gate	$textpos = pos($$textref);
328*0Sstevel@tonic-gate
329*0Sstevel@tonic-gate	if (!defined $rdel)
330*0Sstevel@tonic-gate	{
331*0Sstevel@tonic-gate		$rdelspec = $&;
332*0Sstevel@tonic-gate		unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". revbracket($1) /oes)
333*0Sstevel@tonic-gate		{
334*0Sstevel@tonic-gate			_failmsg "Unable to construct closing tag to match: $rdel",
335*0Sstevel@tonic-gate				 pos $$textref;
336*0Sstevel@tonic-gate			goto failed;
337*0Sstevel@tonic-gate		}
338*0Sstevel@tonic-gate	}
339*0Sstevel@tonic-gate	else
340*0Sstevel@tonic-gate	{
341*0Sstevel@tonic-gate		$rdelspec = eval "qq{$rdel}" || do {
342*0Sstevel@tonic-gate			my $del;
343*0Sstevel@tonic-gate			for (qw,~ ! ^ & * ) _ + - = } ] : " ; ' > . ? / | ',)
344*0Sstevel@tonic-gate				{ next if $rdel =~ /\Q$_/; $del = $_; last }
345*0Sstevel@tonic-gate			unless ($del) {
346*0Sstevel@tonic-gate				use Carp;
347*0Sstevel@tonic-gate				croak "Can't interpolate right delimiter $rdel"
348*0Sstevel@tonic-gate			}
349*0Sstevel@tonic-gate			eval "qq$del$rdel$del";
350*0Sstevel@tonic-gate		};
351*0Sstevel@tonic-gate	}
352*0Sstevel@tonic-gate
353*0Sstevel@tonic-gate	while (pos($$textref) < length($$textref))
354*0Sstevel@tonic-gate	{
355*0Sstevel@tonic-gate		next if $$textref =~ m/\G\\./gc;
356*0Sstevel@tonic-gate
357*0Sstevel@tonic-gate		if ($$textref =~ m/\G(\n[ \t]*\n)/gc )
358*0Sstevel@tonic-gate		{
359*0Sstevel@tonic-gate			$parapos = pos($$textref) - length($1)
360*0Sstevel@tonic-gate				unless defined $parapos;
361*0Sstevel@tonic-gate		}
362*0Sstevel@tonic-gate		elsif ($$textref =~ m/\G($rdelspec)/gc )
363*0Sstevel@tonic-gate		{
364*0Sstevel@tonic-gate			$closetagpos = pos($$textref)-length($1);
365*0Sstevel@tonic-gate			goto matched;
366*0Sstevel@tonic-gate		}
367*0Sstevel@tonic-gate		elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc)
368*0Sstevel@tonic-gate		{
369*0Sstevel@tonic-gate			next;
370*0Sstevel@tonic-gate		}
371*0Sstevel@tonic-gate		elsif ($bad && $$textref =~ m/\G($bad)/gcs)
372*0Sstevel@tonic-gate		{
373*0Sstevel@tonic-gate			pos($$textref) -= length($1);	# CUT OFF WHATEVER CAUSED THE SHORTNESS
374*0Sstevel@tonic-gate			goto short if ($omode eq 'PARA' || $omode eq 'MAX');
375*0Sstevel@tonic-gate			_failmsg "Found invalid nested tag: $1", pos $$textref;
376*0Sstevel@tonic-gate			goto failed;
377*0Sstevel@tonic-gate		}
378*0Sstevel@tonic-gate		elsif ($$textref =~ m/\G($ldel)/gc)
379*0Sstevel@tonic-gate		{
380*0Sstevel@tonic-gate			my $tag = $1;
381*0Sstevel@tonic-gate			pos($$textref) -= length($tag);	# REWIND TO NESTED TAG
382*0Sstevel@tonic-gate			unless (_match_tagged(@_))	# MATCH NESTED TAG
383*0Sstevel@tonic-gate			{
384*0Sstevel@tonic-gate				goto short if $omode eq 'PARA' || $omode eq 'MAX';
385*0Sstevel@tonic-gate				_failmsg "Found unbalanced nested tag: $tag",
386*0Sstevel@tonic-gate					 pos $$textref;
387*0Sstevel@tonic-gate				goto failed;
388*0Sstevel@tonic-gate			}
389*0Sstevel@tonic-gate		}
390*0Sstevel@tonic-gate		else { $$textref =~ m/./gcs }
391*0Sstevel@tonic-gate	}
392*0Sstevel@tonic-gate
393*0Sstevel@tonic-gateshort:
394*0Sstevel@tonic-gate	$closetagpos = pos($$textref);
395*0Sstevel@tonic-gate	goto matched if $omode eq 'MAX';
396*0Sstevel@tonic-gate	goto failed unless $omode eq 'PARA';
397*0Sstevel@tonic-gate
398*0Sstevel@tonic-gate	if (defined $parapos) { pos($$textref) = $parapos }
399*0Sstevel@tonic-gate	else		      { $parapos = pos($$textref) }
400*0Sstevel@tonic-gate
401*0Sstevel@tonic-gate	return (
402*0Sstevel@tonic-gate		$startpos,    $opentagpos-$startpos,		# PREFIX
403*0Sstevel@tonic-gate		$opentagpos,  $textpos-$opentagpos,		# OPENING TAG
404*0Sstevel@tonic-gate		$textpos,     $parapos-$textpos,		# TEXT
405*0Sstevel@tonic-gate		$parapos,     0,				# NO CLOSING TAG
406*0Sstevel@tonic-gate		$parapos,     length($$textref)-$parapos,	# REMAINDER
407*0Sstevel@tonic-gate	       );
408*0Sstevel@tonic-gate
409*0Sstevel@tonic-gatematched:
410*0Sstevel@tonic-gate	$endpos = pos($$textref);
411*0Sstevel@tonic-gate	return (
412*0Sstevel@tonic-gate		$startpos,    $opentagpos-$startpos,		# PREFIX
413*0Sstevel@tonic-gate		$opentagpos,  $textpos-$opentagpos,		# OPENING TAG
414*0Sstevel@tonic-gate		$textpos,     $closetagpos-$textpos,		# TEXT
415*0Sstevel@tonic-gate		$closetagpos, $endpos-$closetagpos,		# CLOSING TAG
416*0Sstevel@tonic-gate		$endpos,      length($$textref)-$endpos,	# REMAINDER
417*0Sstevel@tonic-gate	       );
418*0Sstevel@tonic-gate
419*0Sstevel@tonic-gatefailed:
420*0Sstevel@tonic-gate	_failmsg "Did not find closing tag", pos $$textref unless $@;
421*0Sstevel@tonic-gate	pos($$textref) = $startpos;
422*0Sstevel@tonic-gate	return;
423*0Sstevel@tonic-gate}
424*0Sstevel@tonic-gate
425*0Sstevel@tonic-gatesub extract_variable (;$$)
426*0Sstevel@tonic-gate{
427*0Sstevel@tonic-gate	my $textref = defined $_[0] ? \$_[0] : \$_;
428*0Sstevel@tonic-gate	return ("","","") unless defined $$textref;
429*0Sstevel@tonic-gate	my $pre  = defined $_[1] ? $_[1] : '\s*';
430*0Sstevel@tonic-gate
431*0Sstevel@tonic-gate	my @match = _match_variable($textref,$pre);
432*0Sstevel@tonic-gate
433*0Sstevel@tonic-gate	return _fail wantarray, $textref unless @match;
434*0Sstevel@tonic-gate
435*0Sstevel@tonic-gate	return _succeed wantarray, $textref,
436*0Sstevel@tonic-gate			@match[2..3,4..5,0..1];		# MATCH, REMAINDER, PREFIX
437*0Sstevel@tonic-gate}
438*0Sstevel@tonic-gate
439*0Sstevel@tonic-gatesub _match_variable($$)
440*0Sstevel@tonic-gate{
441*0Sstevel@tonic-gate#  $#
442*0Sstevel@tonic-gate#  $^
443*0Sstevel@tonic-gate#  $$
444*0Sstevel@tonic-gate	my ($textref, $pre) = @_;
445*0Sstevel@tonic-gate	my $startpos = pos($$textref) = pos($$textref)||0;
446*0Sstevel@tonic-gate	unless ($$textref =~ m/\G($pre)/gc)
447*0Sstevel@tonic-gate	{
448*0Sstevel@tonic-gate		_failmsg "Did not find prefix: /$pre/", pos $$textref;
449*0Sstevel@tonic-gate		return;
450*0Sstevel@tonic-gate	}
451*0Sstevel@tonic-gate	my $varpos = pos($$textref);
452*0Sstevel@tonic-gate        unless ($$textref =~ m{\G\$\s*(?!::)(\d+|[][&`'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci)
453*0Sstevel@tonic-gate	{
454*0Sstevel@tonic-gate	    unless ($$textref =~ m/\G((\$#?|[*\@\%]|\\&)+)/gc)
455*0Sstevel@tonic-gate	    {
456*0Sstevel@tonic-gate		_failmsg "Did not find leading dereferencer", pos $$textref;
457*0Sstevel@tonic-gate		pos $$textref = $startpos;
458*0Sstevel@tonic-gate		return;
459*0Sstevel@tonic-gate	    }
460*0Sstevel@tonic-gate	    my $deref = $1;
461*0Sstevel@tonic-gate
462*0Sstevel@tonic-gate	    unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci
463*0Sstevel@tonic-gate	    	or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0)
464*0Sstevel@tonic-gate		or $deref eq '$#' or $deref eq '$$' )
465*0Sstevel@tonic-gate	    {
466*0Sstevel@tonic-gate		_failmsg "Bad identifier after dereferencer", pos $$textref;
467*0Sstevel@tonic-gate		pos $$textref = $startpos;
468*0Sstevel@tonic-gate		return;
469*0Sstevel@tonic-gate	    }
470*0Sstevel@tonic-gate	}
471*0Sstevel@tonic-gate
472*0Sstevel@tonic-gate	while (1)
473*0Sstevel@tonic-gate	{
474*0Sstevel@tonic-gate		next if $$textref =~ m/\G\s*(?:->)?\s*[{]\w+[}]/gc;
475*0Sstevel@tonic-gate		next if _match_codeblock($textref,
476*0Sstevel@tonic-gate					 qr/\s*->\s*(?:[_a-zA-Z]\w+\s*)?/,
477*0Sstevel@tonic-gate					 qr/[({[]/, qr/[)}\]]/,
478*0Sstevel@tonic-gate					 qr/[({[]/, qr/[)}\]]/, 0);
479*0Sstevel@tonic-gate		next if _match_codeblock($textref,
480*0Sstevel@tonic-gate					 qr/\s*/, qr/[{[]/, qr/[}\]]/,
481*0Sstevel@tonic-gate					 qr/[{[]/, qr/[}\]]/, 0);
482*0Sstevel@tonic-gate		next if _match_variable($textref,'\s*->\s*');
483*0Sstevel@tonic-gate		next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc;
484*0Sstevel@tonic-gate		last;
485*0Sstevel@tonic-gate	}
486*0Sstevel@tonic-gate
487*0Sstevel@tonic-gate	my $endpos = pos($$textref);
488*0Sstevel@tonic-gate	return ($startpos, $varpos-$startpos,
489*0Sstevel@tonic-gate		$varpos,   $endpos-$varpos,
490*0Sstevel@tonic-gate		$endpos,   length($$textref)-$endpos
491*0Sstevel@tonic-gate		);
492*0Sstevel@tonic-gate}
493*0Sstevel@tonic-gate
494*0Sstevel@tonic-gatesub extract_codeblock (;$$$$$)
495*0Sstevel@tonic-gate{
496*0Sstevel@tonic-gate	my $textref = defined $_[0] ? \$_[0] : \$_;
497*0Sstevel@tonic-gate	my $wantarray = wantarray;
498*0Sstevel@tonic-gate	my $ldel_inner = defined $_[1] ? $_[1] : '{';
499*0Sstevel@tonic-gate	my $pre        = defined $_[2] ? $_[2] : '\s*';
500*0Sstevel@tonic-gate	my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner;
501*0Sstevel@tonic-gate	my $rd         = $_[4];
502*0Sstevel@tonic-gate	my $rdel_inner = $ldel_inner;
503*0Sstevel@tonic-gate	my $rdel_outer = $ldel_outer;
504*0Sstevel@tonic-gate	my $posbug = pos;
505*0Sstevel@tonic-gate	for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds }
506*0Sstevel@tonic-gate	for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds }
507*0Sstevel@tonic-gate	for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer)
508*0Sstevel@tonic-gate	{
509*0Sstevel@tonic-gate		$_ = '('.join('|',map { quotemeta $_ } split('',$_)).')'
510*0Sstevel@tonic-gate	}
511*0Sstevel@tonic-gate	pos = $posbug;
512*0Sstevel@tonic-gate
513*0Sstevel@tonic-gate	my @match = _match_codeblock($textref, $pre,
514*0Sstevel@tonic-gate				     $ldel_outer, $rdel_outer,
515*0Sstevel@tonic-gate				     $ldel_inner, $rdel_inner,
516*0Sstevel@tonic-gate				     $rd);
517*0Sstevel@tonic-gate	return _fail($wantarray, $textref) unless @match;
518*0Sstevel@tonic-gate	return _succeed($wantarray, $textref,
519*0Sstevel@tonic-gate			@match[2..3,4..5,0..1]	# MATCH, REMAINDER, PREFIX
520*0Sstevel@tonic-gate		       );
521*0Sstevel@tonic-gate
522*0Sstevel@tonic-gate}
523*0Sstevel@tonic-gate
524*0Sstevel@tonic-gatesub _match_codeblock($$$$$$$)
525*0Sstevel@tonic-gate{
526*0Sstevel@tonic-gate	my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd) = @_;
527*0Sstevel@tonic-gate	my $startpos = pos($$textref) = pos($$textref) || 0;
528*0Sstevel@tonic-gate	unless ($$textref =~ m/\G($pre)/gc)
529*0Sstevel@tonic-gate	{
530*0Sstevel@tonic-gate		_failmsg qq{Did not match prefix /$pre/ at"} .
531*0Sstevel@tonic-gate			    substr($$textref,pos($$textref),20) .
532*0Sstevel@tonic-gate			    q{..."},
533*0Sstevel@tonic-gate		         pos $$textref;
534*0Sstevel@tonic-gate		return;
535*0Sstevel@tonic-gate	}
536*0Sstevel@tonic-gate	my $codepos = pos($$textref);
537*0Sstevel@tonic-gate	unless ($$textref =~ m/\G($ldel_outer)/gc)	# OUTERMOST DELIMITER
538*0Sstevel@tonic-gate	{
539*0Sstevel@tonic-gate		_failmsg qq{Did not find expected opening bracket at "} .
540*0Sstevel@tonic-gate			     substr($$textref,pos($$textref),20) .
541*0Sstevel@tonic-gate			     q{..."},
542*0Sstevel@tonic-gate		         pos $$textref;
543*0Sstevel@tonic-gate		pos $$textref = $startpos;
544*0Sstevel@tonic-gate		return;
545*0Sstevel@tonic-gate	}
546*0Sstevel@tonic-gate	my $closing = $1;
547*0Sstevel@tonic-gate	   $closing =~ tr/([<{/)]>}/;
548*0Sstevel@tonic-gate	my $matched;
549*0Sstevel@tonic-gate	my $patvalid = 1;
550*0Sstevel@tonic-gate	while (pos($$textref) < length($$textref))
551*0Sstevel@tonic-gate	{
552*0Sstevel@tonic-gate		$matched = '';
553*0Sstevel@tonic-gate		if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc)
554*0Sstevel@tonic-gate		{
555*0Sstevel@tonic-gate			$patvalid = 0;
556*0Sstevel@tonic-gate			next;
557*0Sstevel@tonic-gate		}
558*0Sstevel@tonic-gate
559*0Sstevel@tonic-gate		if ($$textref =~ m/\G\s*#.*/gc)
560*0Sstevel@tonic-gate		{
561*0Sstevel@tonic-gate			next;
562*0Sstevel@tonic-gate		}
563*0Sstevel@tonic-gate
564*0Sstevel@tonic-gate		if ($$textref =~ m/\G\s*($rdel_outer)/gc)
565*0Sstevel@tonic-gate		{
566*0Sstevel@tonic-gate			unless ($matched = ($closing && $1 eq $closing) )
567*0Sstevel@tonic-gate			{
568*0Sstevel@tonic-gate				next if $1 eq '>';	# MIGHT BE A "LESS THAN"
569*0Sstevel@tonic-gate				_failmsg q{Mismatched closing bracket at "} .
570*0Sstevel@tonic-gate					     substr($$textref,pos($$textref),20) .
571*0Sstevel@tonic-gate					     qq{...". Expected '$closing'},
572*0Sstevel@tonic-gate					 pos $$textref;
573*0Sstevel@tonic-gate			}
574*0Sstevel@tonic-gate			last;
575*0Sstevel@tonic-gate		}
576*0Sstevel@tonic-gate
577*0Sstevel@tonic-gate		if (_match_variable($textref,'\s*') ||
578*0Sstevel@tonic-gate		    _match_quotelike($textref,'\s*',$patvalid,$patvalid) )
579*0Sstevel@tonic-gate		{
580*0Sstevel@tonic-gate			$patvalid = 0;
581*0Sstevel@tonic-gate			next;
582*0Sstevel@tonic-gate		}
583*0Sstevel@tonic-gate
584*0Sstevel@tonic-gate
585*0Sstevel@tonic-gate		# NEED TO COVER MANY MORE CASES HERE!!!
586*0Sstevel@tonic-gate		if ($$textref =~ m#\G\s*(?!$ldel_inner)
587*0Sstevel@tonic-gate					( [-+*x/%^&|.]=?
588*0Sstevel@tonic-gate					| [!=]~
589*0Sstevel@tonic-gate					| =(?!>)
590*0Sstevel@tonic-gate					| (\*\*|&&|\|\||<<|>>)=?
591*0Sstevel@tonic-gate					| split|grep|map|return
592*0Sstevel@tonic-gate					| [([]
593*0Sstevel@tonic-gate					)#gcx)
594*0Sstevel@tonic-gate		{
595*0Sstevel@tonic-gate			$patvalid = 1;
596*0Sstevel@tonic-gate			next;
597*0Sstevel@tonic-gate		}
598*0Sstevel@tonic-gate
599*0Sstevel@tonic-gate		if ( _match_codeblock($textref, '\s*', $ldel_inner, $rdel_inner, $ldel_inner, $rdel_inner, $rd) )
600*0Sstevel@tonic-gate		{
601*0Sstevel@tonic-gate			$patvalid = 1;
602*0Sstevel@tonic-gate			next;
603*0Sstevel@tonic-gate		}
604*0Sstevel@tonic-gate
605*0Sstevel@tonic-gate		if ($$textref =~ m/\G\s*$ldel_outer/gc)
606*0Sstevel@tonic-gate		{
607*0Sstevel@tonic-gate			_failmsg q{Improperly nested codeblock at "} .
608*0Sstevel@tonic-gate				     substr($$textref,pos($$textref),20) .
609*0Sstevel@tonic-gate				     q{..."},
610*0Sstevel@tonic-gate				 pos $$textref;
611*0Sstevel@tonic-gate			last;
612*0Sstevel@tonic-gate		}
613*0Sstevel@tonic-gate
614*0Sstevel@tonic-gate		$patvalid = 0;
615*0Sstevel@tonic-gate		$$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc;
616*0Sstevel@tonic-gate	}
617*0Sstevel@tonic-gate	continue { $@ = undef }
618*0Sstevel@tonic-gate
619*0Sstevel@tonic-gate	unless ($matched)
620*0Sstevel@tonic-gate	{
621*0Sstevel@tonic-gate		_failmsg 'No match found for opening bracket', pos $$textref
622*0Sstevel@tonic-gate			unless $@;
623*0Sstevel@tonic-gate		return;
624*0Sstevel@tonic-gate	}
625*0Sstevel@tonic-gate
626*0Sstevel@tonic-gate	my $endpos = pos($$textref);
627*0Sstevel@tonic-gate	return ( $startpos, $codepos-$startpos,
628*0Sstevel@tonic-gate		 $codepos, $endpos-$codepos,
629*0Sstevel@tonic-gate		 $endpos,  length($$textref)-$endpos,
630*0Sstevel@tonic-gate	       );
631*0Sstevel@tonic-gate}
632*0Sstevel@tonic-gate
633*0Sstevel@tonic-gate
634*0Sstevel@tonic-gatemy %mods   = (
635*0Sstevel@tonic-gate		'none'	=> '[cgimsox]*',
636*0Sstevel@tonic-gate		'm'	=> '[cgimsox]*',
637*0Sstevel@tonic-gate		's'	=> '[cegimsox]*',
638*0Sstevel@tonic-gate		'tr'	=> '[cds]*',
639*0Sstevel@tonic-gate		'y'	=> '[cds]*',
640*0Sstevel@tonic-gate		'qq'	=> '',
641*0Sstevel@tonic-gate		'qx'	=> '',
642*0Sstevel@tonic-gate		'qw'	=> '',
643*0Sstevel@tonic-gate		'qr'	=> '[imsx]*',
644*0Sstevel@tonic-gate		'q'	=> '',
645*0Sstevel@tonic-gate	     );
646*0Sstevel@tonic-gate
647*0Sstevel@tonic-gatesub extract_quotelike (;$$)
648*0Sstevel@tonic-gate{
649*0Sstevel@tonic-gate	my $textref = $_[0] ? \$_[0] : \$_;
650*0Sstevel@tonic-gate	my $wantarray = wantarray;
651*0Sstevel@tonic-gate	my $pre  = defined $_[1] ? $_[1] : '\s*';
652*0Sstevel@tonic-gate
653*0Sstevel@tonic-gate	my @match = _match_quotelike($textref,$pre,1,0);
654*0Sstevel@tonic-gate	return _fail($wantarray, $textref) unless @match;
655*0Sstevel@tonic-gate	return _succeed($wantarray, $textref,
656*0Sstevel@tonic-gate			$match[2], $match[18]-$match[2],	# MATCH
657*0Sstevel@tonic-gate			@match[18,19],				# REMAINDER
658*0Sstevel@tonic-gate			@match[0,1],				# PREFIX
659*0Sstevel@tonic-gate			@match[2..17],				# THE BITS
660*0Sstevel@tonic-gate			@match[20,21],				# ANY FILLET?
661*0Sstevel@tonic-gate		       );
662*0Sstevel@tonic-gate};
663*0Sstevel@tonic-gate
664*0Sstevel@tonic-gatesub _match_quotelike($$$$)	# ($textref, $prepat, $allow_raw_match)
665*0Sstevel@tonic-gate{
666*0Sstevel@tonic-gate	my ($textref, $pre, $rawmatch, $qmark) = @_;
667*0Sstevel@tonic-gate
668*0Sstevel@tonic-gate	my ($textlen,$startpos,
669*0Sstevel@tonic-gate	    $oppos,
670*0Sstevel@tonic-gate	    $preld1pos,$ld1pos,$str1pos,$rd1pos,
671*0Sstevel@tonic-gate	    $preld2pos,$ld2pos,$str2pos,$rd2pos,
672*0Sstevel@tonic-gate	    $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 );
673*0Sstevel@tonic-gate
674*0Sstevel@tonic-gate	unless ($$textref =~ m/\G($pre)/gc)
675*0Sstevel@tonic-gate	{
676*0Sstevel@tonic-gate		_failmsg qq{Did not find prefix /$pre/ at "} .
677*0Sstevel@tonic-gate			     substr($$textref, pos($$textref), 20) .
678*0Sstevel@tonic-gate			     q{..."},
679*0Sstevel@tonic-gate		         pos $$textref;
680*0Sstevel@tonic-gate		return;
681*0Sstevel@tonic-gate	}
682*0Sstevel@tonic-gate	$oppos = pos($$textref);
683*0Sstevel@tonic-gate
684*0Sstevel@tonic-gate	my $initial = substr($$textref,$oppos,1);
685*0Sstevel@tonic-gate
686*0Sstevel@tonic-gate	if ($initial && $initial =~ m|^[\"\'\`]|
687*0Sstevel@tonic-gate		     || $rawmatch && $initial =~ m|^/|
688*0Sstevel@tonic-gate		     || $qmark && $initial =~ m|^\?|)
689*0Sstevel@tonic-gate	{
690*0Sstevel@tonic-gate		unless ($$textref =~ m/ \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcsx)
691*0Sstevel@tonic-gate		{
692*0Sstevel@tonic-gate			_failmsg qq{Did not find closing delimiter to match '$initial' at "} .
693*0Sstevel@tonic-gate				     substr($$textref, $oppos, 20) .
694*0Sstevel@tonic-gate				     q{..."},
695*0Sstevel@tonic-gate				 pos $$textref;
696*0Sstevel@tonic-gate			pos $$textref = $startpos;
697*0Sstevel@tonic-gate			return;
698*0Sstevel@tonic-gate		}
699*0Sstevel@tonic-gate		$modpos= pos($$textref);
700*0Sstevel@tonic-gate		$rd1pos = $modpos-1;
701*0Sstevel@tonic-gate
702*0Sstevel@tonic-gate		if ($initial eq '/' || $initial eq '?')
703*0Sstevel@tonic-gate		{
704*0Sstevel@tonic-gate			$$textref =~ m/\G$mods{none}/gc
705*0Sstevel@tonic-gate		}
706*0Sstevel@tonic-gate
707*0Sstevel@tonic-gate		my $endpos = pos($$textref);
708*0Sstevel@tonic-gate		return (
709*0Sstevel@tonic-gate			$startpos,	$oppos-$startpos,	# PREFIX
710*0Sstevel@tonic-gate			$oppos,		0,			# NO OPERATOR
711*0Sstevel@tonic-gate			$oppos,		1,			# LEFT DEL
712*0Sstevel@tonic-gate			$oppos+1,	$rd1pos-$oppos-1,	# STR/PAT
713*0Sstevel@tonic-gate			$rd1pos,	1,			# RIGHT DEL
714*0Sstevel@tonic-gate			$modpos,	0,			# NO 2ND LDEL
715*0Sstevel@tonic-gate			$modpos,	0,			# NO 2ND STR
716*0Sstevel@tonic-gate			$modpos,	0,			# NO 2ND RDEL
717*0Sstevel@tonic-gate			$modpos,	$endpos-$modpos,	# MODIFIERS
718*0Sstevel@tonic-gate			$endpos, 	$textlen-$endpos,	# REMAINDER
719*0Sstevel@tonic-gate		       );
720*0Sstevel@tonic-gate	}
721*0Sstevel@tonic-gate
722*0Sstevel@tonic-gate	unless ($$textref =~ m{\G(\b(?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<)}gc)
723*0Sstevel@tonic-gate	{
724*0Sstevel@tonic-gate		_failmsg q{No quotelike operator found after prefix at "} .
725*0Sstevel@tonic-gate			     substr($$textref, pos($$textref), 20) .
726*0Sstevel@tonic-gate			     q{..."},
727*0Sstevel@tonic-gate		         pos $$textref;
728*0Sstevel@tonic-gate		pos $$textref = $startpos;
729*0Sstevel@tonic-gate		return;
730*0Sstevel@tonic-gate	}
731*0Sstevel@tonic-gate
732*0Sstevel@tonic-gate	my $op = $1;
733*0Sstevel@tonic-gate	$preld1pos = pos($$textref);
734*0Sstevel@tonic-gate	if ($op eq '<<') {
735*0Sstevel@tonic-gate		$ld1pos = pos($$textref);
736*0Sstevel@tonic-gate		my $label;
737*0Sstevel@tonic-gate		if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) {
738*0Sstevel@tonic-gate			$label = $1;
739*0Sstevel@tonic-gate		}
740*0Sstevel@tonic-gate		elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\]*)*) '
741*0Sstevel@tonic-gate				     | \G " ([^"\\]* (?:\\.[^"\\]*)*) "
742*0Sstevel@tonic-gate				     | \G ` ([^`\\]* (?:\\.[^`\\]*)*) `
743*0Sstevel@tonic-gate				     }gcsx) {
744*0Sstevel@tonic-gate			$label = $+;
745*0Sstevel@tonic-gate		}
746*0Sstevel@tonic-gate		else {
747*0Sstevel@tonic-gate			$label = "";
748*0Sstevel@tonic-gate		}
749*0Sstevel@tonic-gate		my $extrapos = pos($$textref);
750*0Sstevel@tonic-gate		$$textref =~ m{.*\n}gc;
751*0Sstevel@tonic-gate		$str1pos = pos($$textref);
752*0Sstevel@tonic-gate		unless ($$textref =~ m{.*?\n(?=$label\n)}gc) {
753*0Sstevel@tonic-gate			_failmsg qq{Missing here doc terminator ('$label') after "} .
754*0Sstevel@tonic-gate				     substr($$textref, $startpos, 20) .
755*0Sstevel@tonic-gate				     q{..."},
756*0Sstevel@tonic-gate				 pos $$textref;
757*0Sstevel@tonic-gate			pos $$textref = $startpos;
758*0Sstevel@tonic-gate			return;
759*0Sstevel@tonic-gate		}
760*0Sstevel@tonic-gate		$rd1pos = pos($$textref);
761*0Sstevel@tonic-gate		$$textref =~ m{$label\n}gc;
762*0Sstevel@tonic-gate		$ld2pos = pos($$textref);
763*0Sstevel@tonic-gate		return (
764*0Sstevel@tonic-gate			$startpos,	$oppos-$startpos,	# PREFIX
765*0Sstevel@tonic-gate			$oppos,		length($op),		# OPERATOR
766*0Sstevel@tonic-gate			$ld1pos,	$extrapos-$ld1pos,	# LEFT DEL
767*0Sstevel@tonic-gate			$str1pos,	$rd1pos-$str1pos,	# STR/PAT
768*0Sstevel@tonic-gate			$rd1pos,	$ld2pos-$rd1pos,	# RIGHT DEL
769*0Sstevel@tonic-gate			$ld2pos,	0,			# NO 2ND LDEL
770*0Sstevel@tonic-gate			$ld2pos,	0,                	# NO 2ND STR
771*0Sstevel@tonic-gate			$ld2pos,	0,	                # NO 2ND RDEL
772*0Sstevel@tonic-gate			$ld2pos,	0,                      # NO MODIFIERS
773*0Sstevel@tonic-gate			$ld2pos,	$textlen-$ld2pos,	# REMAINDER
774*0Sstevel@tonic-gate			$extrapos,      $str1pos-$extrapos,	# FILLETED BIT
775*0Sstevel@tonic-gate		       );
776*0Sstevel@tonic-gate	}
777*0Sstevel@tonic-gate
778*0Sstevel@tonic-gate	$$textref =~ m/\G\s*/gc;
779*0Sstevel@tonic-gate	$ld1pos = pos($$textref);
780*0Sstevel@tonic-gate	$str1pos = $ld1pos+1;
781*0Sstevel@tonic-gate
782*0Sstevel@tonic-gate	unless ($$textref =~ m/\G(\S)/gc)	# SHOULD USE LOOKAHEAD
783*0Sstevel@tonic-gate	{
784*0Sstevel@tonic-gate		_failmsg "No block delimiter found after quotelike $op",
785*0Sstevel@tonic-gate		         pos $$textref;
786*0Sstevel@tonic-gate		pos $$textref = $startpos;
787*0Sstevel@tonic-gate		return;
788*0Sstevel@tonic-gate	}
789*0Sstevel@tonic-gate	pos($$textref) = $ld1pos;	# HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN
790*0Sstevel@tonic-gate	my ($ldel1, $rdel1) = ("\Q$1","\Q$1");
791*0Sstevel@tonic-gate	if ($ldel1 =~ /[[(<{]/)
792*0Sstevel@tonic-gate	{
793*0Sstevel@tonic-gate		$rdel1 =~ tr/[({</])}>/;
794*0Sstevel@tonic-gate		_match_bracketed($textref,"",$ldel1,"","",$rdel1)
795*0Sstevel@tonic-gate		|| do { pos $$textref = $startpos; return };
796*0Sstevel@tonic-gate	}
797*0Sstevel@tonic-gate	else
798*0Sstevel@tonic-gate	{
799*0Sstevel@tonic-gate		$$textref =~ /$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gcs
800*0Sstevel@tonic-gate		|| do { pos $$textref = $startpos; return };
801*0Sstevel@tonic-gate	}
802*0Sstevel@tonic-gate	$ld2pos = $rd1pos = pos($$textref)-1;
803*0Sstevel@tonic-gate
804*0Sstevel@tonic-gate	my $second_arg = $op =~ /s|tr|y/ ? 1 : 0;
805*0Sstevel@tonic-gate	if ($second_arg)
806*0Sstevel@tonic-gate	{
807*0Sstevel@tonic-gate		my ($ldel2, $rdel2);
808*0Sstevel@tonic-gate		if ($ldel1 =~ /[[(<{]/)
809*0Sstevel@tonic-gate		{
810*0Sstevel@tonic-gate			unless ($$textref =~ /\G\s*(\S)/gc)	# SHOULD USE LOOKAHEAD
811*0Sstevel@tonic-gate			{
812*0Sstevel@tonic-gate				_failmsg "Missing second block for quotelike $op",
813*0Sstevel@tonic-gate					 pos $$textref;
814*0Sstevel@tonic-gate				pos $$textref = $startpos;
815*0Sstevel@tonic-gate				return;
816*0Sstevel@tonic-gate			}
817*0Sstevel@tonic-gate			$ldel2 = $rdel2 = "\Q$1";
818*0Sstevel@tonic-gate			$rdel2 =~ tr/[({</])}>/;
819*0Sstevel@tonic-gate		}
820*0Sstevel@tonic-gate		else
821*0Sstevel@tonic-gate		{
822*0Sstevel@tonic-gate			$ldel2 = $rdel2 = $ldel1;
823*0Sstevel@tonic-gate		}
824*0Sstevel@tonic-gate		$str2pos = $ld2pos+1;
825*0Sstevel@tonic-gate
826*0Sstevel@tonic-gate		if ($ldel2 =~ /[[(<{]/)
827*0Sstevel@tonic-gate		{
828*0Sstevel@tonic-gate			pos($$textref)--;	# OVERCOME BROKEN LOOKAHEAD
829*0Sstevel@tonic-gate			_match_bracketed($textref,"",$ldel2,"","",$rdel2)
830*0Sstevel@tonic-gate			|| do { pos $$textref = $startpos; return };
831*0Sstevel@tonic-gate		}
832*0Sstevel@tonic-gate		else
833*0Sstevel@tonic-gate		{
834*0Sstevel@tonic-gate			$$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gcs
835*0Sstevel@tonic-gate			|| do { pos $$textref = $startpos; return };
836*0Sstevel@tonic-gate		}
837*0Sstevel@tonic-gate		$rd2pos = pos($$textref)-1;
838*0Sstevel@tonic-gate	}
839*0Sstevel@tonic-gate	else
840*0Sstevel@tonic-gate	{
841*0Sstevel@tonic-gate		$ld2pos = $str2pos = $rd2pos = $rd1pos;
842*0Sstevel@tonic-gate	}
843*0Sstevel@tonic-gate
844*0Sstevel@tonic-gate	$modpos = pos $$textref;
845*0Sstevel@tonic-gate
846*0Sstevel@tonic-gate	$$textref =~ m/\G($mods{$op})/gc;
847*0Sstevel@tonic-gate	my $endpos = pos $$textref;
848*0Sstevel@tonic-gate
849*0Sstevel@tonic-gate	return (
850*0Sstevel@tonic-gate		$startpos,	$oppos-$startpos,	# PREFIX
851*0Sstevel@tonic-gate		$oppos,		length($op),		# OPERATOR
852*0Sstevel@tonic-gate		$ld1pos,	1,			# LEFT DEL
853*0Sstevel@tonic-gate		$str1pos,	$rd1pos-$str1pos,	# STR/PAT
854*0Sstevel@tonic-gate		$rd1pos,	1,			# RIGHT DEL
855*0Sstevel@tonic-gate		$ld2pos,	$second_arg,		# 2ND LDEL (MAYBE)
856*0Sstevel@tonic-gate		$str2pos,	$rd2pos-$str2pos,	# 2ND STR (MAYBE)
857*0Sstevel@tonic-gate		$rd2pos,	$second_arg,		# 2ND RDEL (MAYBE)
858*0Sstevel@tonic-gate		$modpos,	$endpos-$modpos,	# MODIFIERS
859*0Sstevel@tonic-gate		$endpos,	$textlen-$endpos,	# REMAINDER
860*0Sstevel@tonic-gate	       );
861*0Sstevel@tonic-gate}
862*0Sstevel@tonic-gate
863*0Sstevel@tonic-gatemy $def_func =
864*0Sstevel@tonic-gate[
865*0Sstevel@tonic-gate	sub { extract_variable($_[0], '') },
866*0Sstevel@tonic-gate	sub { extract_quotelike($_[0],'') },
867*0Sstevel@tonic-gate	sub { extract_codeblock($_[0],'{}','') },
868*0Sstevel@tonic-gate];
869*0Sstevel@tonic-gate
870*0Sstevel@tonic-gatesub extract_multiple (;$$$$)	# ($text, $functions_ref, $max_fields, $ignoreunknown)
871*0Sstevel@tonic-gate{
872*0Sstevel@tonic-gate	my $textref = defined($_[0]) ? \$_[0] : \$_;
873*0Sstevel@tonic-gate	my $posbug = pos;
874*0Sstevel@tonic-gate	my ($lastpos, $firstpos);
875*0Sstevel@tonic-gate	my @fields = ();
876*0Sstevel@tonic-gate
877*0Sstevel@tonic-gate	#for ($$textref)
878*0Sstevel@tonic-gate	{
879*0Sstevel@tonic-gate		my @func = defined $_[1] ? @{$_[1]} : @{$def_func};
880*0Sstevel@tonic-gate		my $max  = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000;
881*0Sstevel@tonic-gate		my $igunk = $_[3];
882*0Sstevel@tonic-gate
883*0Sstevel@tonic-gate		pos $$textref ||= 0;
884*0Sstevel@tonic-gate
885*0Sstevel@tonic-gate		unless (wantarray)
886*0Sstevel@tonic-gate		{
887*0Sstevel@tonic-gate			use Carp;
888*0Sstevel@tonic-gate			carp "extract_multiple reset maximal count to 1 in scalar context"
889*0Sstevel@tonic-gate				if $^W && defined($_[2]) && $max > 1;
890*0Sstevel@tonic-gate			$max = 1
891*0Sstevel@tonic-gate		}
892*0Sstevel@tonic-gate
893*0Sstevel@tonic-gate		my $unkpos;
894*0Sstevel@tonic-gate		my $func;
895*0Sstevel@tonic-gate		my $class;
896*0Sstevel@tonic-gate
897*0Sstevel@tonic-gate		my @class;
898*0Sstevel@tonic-gate		foreach $func ( @func )
899*0Sstevel@tonic-gate		{
900*0Sstevel@tonic-gate			if (ref($func) eq 'HASH')
901*0Sstevel@tonic-gate			{
902*0Sstevel@tonic-gate				push @class, (keys %$func)[0];
903*0Sstevel@tonic-gate				$func = (values %$func)[0];
904*0Sstevel@tonic-gate			}
905*0Sstevel@tonic-gate			else
906*0Sstevel@tonic-gate			{
907*0Sstevel@tonic-gate				push @class, undef;
908*0Sstevel@tonic-gate			}
909*0Sstevel@tonic-gate		}
910*0Sstevel@tonic-gate
911*0Sstevel@tonic-gate		FIELD: while (pos($$textref) < length($$textref))
912*0Sstevel@tonic-gate		{
913*0Sstevel@tonic-gate			my ($field, $rem);
914*0Sstevel@tonic-gate			my @bits;
915*0Sstevel@tonic-gate			foreach my $i ( 0..$#func )
916*0Sstevel@tonic-gate			{
917*0Sstevel@tonic-gate				my $pref;
918*0Sstevel@tonic-gate				$func = $func[$i];
919*0Sstevel@tonic-gate				$class = $class[$i];
920*0Sstevel@tonic-gate				$lastpos = pos $$textref;
921*0Sstevel@tonic-gate				if (ref($func) eq 'CODE')
922*0Sstevel@tonic-gate					{ ($field,$rem,$pref) = @bits = $func->($$textref);
923*0Sstevel@tonic-gate					# print "[$field|$rem]" if $field;
924*0Sstevel@tonic-gate					}
925*0Sstevel@tonic-gate				elsif (ref($func) eq 'Text::Balanced::Extractor')
926*0Sstevel@tonic-gate					{ @bits = $field = $func->extract($$textref) }
927*0Sstevel@tonic-gate				elsif( $$textref =~ m/\G$func/gc )
928*0Sstevel@tonic-gate					{ @bits = $field = defined($1) ? $1 : $& }
929*0Sstevel@tonic-gate				$pref ||= "";
930*0Sstevel@tonic-gate				if (defined($field) && length($field))
931*0Sstevel@tonic-gate				{
932*0Sstevel@tonic-gate					if (!$igunk) {
933*0Sstevel@tonic-gate						$unkpos = pos $$textref
934*0Sstevel@tonic-gate							if length($pref) && !defined($unkpos);
935*0Sstevel@tonic-gate						if (defined $unkpos)
936*0Sstevel@tonic-gate						{
937*0Sstevel@tonic-gate							push @fields, substr($$textref, $unkpos, $lastpos-$unkpos).$pref;
938*0Sstevel@tonic-gate							$firstpos = $unkpos unless defined $firstpos;
939*0Sstevel@tonic-gate							undef $unkpos;
940*0Sstevel@tonic-gate							last FIELD if @fields == $max;
941*0Sstevel@tonic-gate						}
942*0Sstevel@tonic-gate					}
943*0Sstevel@tonic-gate					push @fields, $class
944*0Sstevel@tonic-gate						? bless (\$field, $class)
945*0Sstevel@tonic-gate						: $field;
946*0Sstevel@tonic-gate					$firstpos = $lastpos unless defined $firstpos;
947*0Sstevel@tonic-gate					$lastpos = pos $$textref;
948*0Sstevel@tonic-gate					last FIELD if @fields == $max;
949*0Sstevel@tonic-gate					next FIELD;
950*0Sstevel@tonic-gate				}
951*0Sstevel@tonic-gate			}
952*0Sstevel@tonic-gate			if ($$textref =~ /\G(.)/gcs)
953*0Sstevel@tonic-gate			{
954*0Sstevel@tonic-gate				$unkpos = pos($$textref)-1
955*0Sstevel@tonic-gate					unless $igunk || defined $unkpos;
956*0Sstevel@tonic-gate			}
957*0Sstevel@tonic-gate		}
958*0Sstevel@tonic-gate
959*0Sstevel@tonic-gate		if (defined $unkpos)
960*0Sstevel@tonic-gate		{
961*0Sstevel@tonic-gate			push @fields, substr($$textref, $unkpos);
962*0Sstevel@tonic-gate			$firstpos = $unkpos unless defined $firstpos;
963*0Sstevel@tonic-gate			$lastpos = length $$textref;
964*0Sstevel@tonic-gate		}
965*0Sstevel@tonic-gate		last;
966*0Sstevel@tonic-gate	}
967*0Sstevel@tonic-gate
968*0Sstevel@tonic-gate	pos $$textref = $lastpos;
969*0Sstevel@tonic-gate	return @fields if wantarray;
970*0Sstevel@tonic-gate
971*0Sstevel@tonic-gate	$firstpos ||= 0;
972*0Sstevel@tonic-gate	eval { substr($$textref,$firstpos,$lastpos-$firstpos)="";
973*0Sstevel@tonic-gate	       pos $$textref = $firstpos };
974*0Sstevel@tonic-gate	return $fields[0];
975*0Sstevel@tonic-gate}
976*0Sstevel@tonic-gate
977*0Sstevel@tonic-gate
978*0Sstevel@tonic-gatesub gen_extract_tagged # ($opentag, $closetag, $pre, \%options)
979*0Sstevel@tonic-gate{
980*0Sstevel@tonic-gate	my $ldel    = $_[0];
981*0Sstevel@tonic-gate	my $rdel    = $_[1];
982*0Sstevel@tonic-gate	my $pre     = defined $_[2] ? $_[2] : '\s*';
983*0Sstevel@tonic-gate	my %options = defined $_[3] ? %{$_[3]} : ();
984*0Sstevel@tonic-gate	my $omode   = defined $options{fail} ? $options{fail} : '';
985*0Sstevel@tonic-gate	my $bad     = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
986*0Sstevel@tonic-gate		    : defined($options{reject})	       ? $options{reject}
987*0Sstevel@tonic-gate		    :					 ''
988*0Sstevel@tonic-gate		    ;
989*0Sstevel@tonic-gate	my $ignore  = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
990*0Sstevel@tonic-gate		    : defined($options{ignore})	       ? $options{ignore}
991*0Sstevel@tonic-gate		    :					 ''
992*0Sstevel@tonic-gate		    ;
993*0Sstevel@tonic-gate
994*0Sstevel@tonic-gate	if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
995*0Sstevel@tonic-gate
996*0Sstevel@tonic-gate	my $posbug = pos;
997*0Sstevel@tonic-gate	for ($ldel, $pre, $bad, $ignore) { $_ = qr/$_/ if $_ }
998*0Sstevel@tonic-gate	pos = $posbug;
999*0Sstevel@tonic-gate
1000*0Sstevel@tonic-gate	my $closure = sub
1001*0Sstevel@tonic-gate	{
1002*0Sstevel@tonic-gate		my $textref = defined $_[0] ? \$_[0] : \$_;
1003*0Sstevel@tonic-gate		my @match = Text::Balanced::_match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
1004*0Sstevel@tonic-gate
1005*0Sstevel@tonic-gate		return _fail(wantarray, $textref) unless @match;
1006*0Sstevel@tonic-gate		return _succeed wantarray, $textref,
1007*0Sstevel@tonic-gate				$match[2], $match[3]+$match[5]+$match[7],	# MATCH
1008*0Sstevel@tonic-gate				@match[8..9,0..1,2..7];				# REM, PRE, BITS
1009*0Sstevel@tonic-gate	};
1010*0Sstevel@tonic-gate
1011*0Sstevel@tonic-gate	bless $closure, 'Text::Balanced::Extractor';
1012*0Sstevel@tonic-gate}
1013*0Sstevel@tonic-gate
1014*0Sstevel@tonic-gatepackage Text::Balanced::Extractor;
1015*0Sstevel@tonic-gate
1016*0Sstevel@tonic-gatesub extract($$)	# ($self, $text)
1017*0Sstevel@tonic-gate{
1018*0Sstevel@tonic-gate	&{$_[0]}($_[1]);
1019*0Sstevel@tonic-gate}
1020*0Sstevel@tonic-gate
1021*0Sstevel@tonic-gatepackage Text::Balanced::ErrorMsg;
1022*0Sstevel@tonic-gate
1023*0Sstevel@tonic-gateuse overload '""' => sub { "$_[0]->{error}, detected at offset $_[0]->{pos}" };
1024*0Sstevel@tonic-gate
1025*0Sstevel@tonic-gate1;
1026*0Sstevel@tonic-gate
1027*0Sstevel@tonic-gate__END__
1028*0Sstevel@tonic-gate
1029*0Sstevel@tonic-gate=head1 NAME
1030*0Sstevel@tonic-gate
1031*0Sstevel@tonic-gateText::Balanced - Extract delimited text sequences from strings.
1032*0Sstevel@tonic-gate
1033*0Sstevel@tonic-gate
1034*0Sstevel@tonic-gate=head1 SYNOPSIS
1035*0Sstevel@tonic-gate
1036*0Sstevel@tonic-gate use Text::Balanced qw (
1037*0Sstevel@tonic-gate			extract_delimited
1038*0Sstevel@tonic-gate			extract_bracketed
1039*0Sstevel@tonic-gate			extract_quotelike
1040*0Sstevel@tonic-gate			extract_codeblock
1041*0Sstevel@tonic-gate			extract_variable
1042*0Sstevel@tonic-gate			extract_tagged
1043*0Sstevel@tonic-gate			extract_multiple
1044*0Sstevel@tonic-gate
1045*0Sstevel@tonic-gate			gen_delimited_pat
1046*0Sstevel@tonic-gate			gen_extract_tagged
1047*0Sstevel@tonic-gate		       );
1048*0Sstevel@tonic-gate
1049*0Sstevel@tonic-gate # Extract the initial substring of $text that is delimited by
1050*0Sstevel@tonic-gate # two (unescaped) instances of the first character in $delim.
1051*0Sstevel@tonic-gate
1052*0Sstevel@tonic-gate	($extracted, $remainder) = extract_delimited($text,$delim);
1053*0Sstevel@tonic-gate
1054*0Sstevel@tonic-gate
1055*0Sstevel@tonic-gate # Extract the initial substring of $text that is bracketed
1056*0Sstevel@tonic-gate # with a delimiter(s) specified by $delim (where the string
1057*0Sstevel@tonic-gate # in $delim contains one or more of '(){}[]<>').
1058*0Sstevel@tonic-gate
1059*0Sstevel@tonic-gate	($extracted, $remainder) = extract_bracketed($text,$delim);
1060*0Sstevel@tonic-gate
1061*0Sstevel@tonic-gate
1062*0Sstevel@tonic-gate # Extract the initial substring of $text that is bounded by
1063*0Sstevel@tonic-gate # an XML tag.
1064*0Sstevel@tonic-gate
1065*0Sstevel@tonic-gate	($extracted, $remainder) = extract_tagged($text);
1066*0Sstevel@tonic-gate
1067*0Sstevel@tonic-gate
1068*0Sstevel@tonic-gate # Extract the initial substring of $text that is bounded by
1069*0Sstevel@tonic-gate # a C<BEGIN>...C<END> pair. Don't allow nested C<BEGIN> tags
1070*0Sstevel@tonic-gate
1071*0Sstevel@tonic-gate	($extracted, $remainder) =
1072*0Sstevel@tonic-gate		extract_tagged($text,"BEGIN","END",undef,{bad=>["BEGIN"]});
1073*0Sstevel@tonic-gate
1074*0Sstevel@tonic-gate
1075*0Sstevel@tonic-gate # Extract the initial substring of $text that represents a
1076*0Sstevel@tonic-gate # Perl "quote or quote-like operation"
1077*0Sstevel@tonic-gate
1078*0Sstevel@tonic-gate	($extracted, $remainder) = extract_quotelike($text);
1079*0Sstevel@tonic-gate
1080*0Sstevel@tonic-gate
1081*0Sstevel@tonic-gate # Extract the initial substring of $text that represents a block
1082*0Sstevel@tonic-gate # of Perl code, bracketed by any of character(s) specified by $delim
1083*0Sstevel@tonic-gate # (where the string $delim contains one or more of '(){}[]<>').
1084*0Sstevel@tonic-gate
1085*0Sstevel@tonic-gate	($extracted, $remainder) = extract_codeblock($text,$delim);
1086*0Sstevel@tonic-gate
1087*0Sstevel@tonic-gate
1088*0Sstevel@tonic-gate # Extract the initial substrings of $text that would be extracted by
1089*0Sstevel@tonic-gate # one or more sequential applications of the specified functions
1090*0Sstevel@tonic-gate # or regular expressions
1091*0Sstevel@tonic-gate
1092*0Sstevel@tonic-gate	@extracted = extract_multiple($text,
1093*0Sstevel@tonic-gate				      [ \&extract_bracketed,
1094*0Sstevel@tonic-gate					\&extract_quotelike,
1095*0Sstevel@tonic-gate					\&some_other_extractor_sub,
1096*0Sstevel@tonic-gate					qr/[xyz]*/,
1097*0Sstevel@tonic-gate					'literal',
1098*0Sstevel@tonic-gate				      ]);
1099*0Sstevel@tonic-gate
1100*0Sstevel@tonic-gate# Create a string representing an optimized pattern (a la Friedl)
1101*0Sstevel@tonic-gate# that matches a substring delimited by any of the specified characters
1102*0Sstevel@tonic-gate# (in this case: any type of quote or a slash)
1103*0Sstevel@tonic-gate
1104*0Sstevel@tonic-gate	$patstring = gen_delimited_pat(q{'"`/});
1105*0Sstevel@tonic-gate
1106*0Sstevel@tonic-gate
1107*0Sstevel@tonic-gate# Generate a reference to an anonymous sub that is just like extract_tagged
1108*0Sstevel@tonic-gate# but pre-compiled and optimized for a specific pair of tags, and consequently
1109*0Sstevel@tonic-gate# much faster (i.e. 3 times faster). It uses qr// for better performance on
1110*0Sstevel@tonic-gate# repeated calls, so it only works under Perl 5.005 or later.
1111*0Sstevel@tonic-gate
1112*0Sstevel@tonic-gate	$extract_head = gen_extract_tagged('<HEAD>','</HEAD>');
1113*0Sstevel@tonic-gate
1114*0Sstevel@tonic-gate	($extracted, $remainder) = $extract_head->($text);
1115*0Sstevel@tonic-gate
1116*0Sstevel@tonic-gate
1117*0Sstevel@tonic-gate=head1 DESCRIPTION
1118*0Sstevel@tonic-gate
1119*0Sstevel@tonic-gateThe various C<extract_...> subroutines may be used to
1120*0Sstevel@tonic-gateextract a delimited substring, possibly after skipping a
1121*0Sstevel@tonic-gatespecified prefix string. By default, that prefix is
1122*0Sstevel@tonic-gateoptional whitespace (C</\s*/>), but you can change it to whatever
1123*0Sstevel@tonic-gateyou wish (see below).
1124*0Sstevel@tonic-gate
1125*0Sstevel@tonic-gateThe substring to be extracted must appear at the
1126*0Sstevel@tonic-gatecurrent C<pos> location of the string's variable
1127*0Sstevel@tonic-gate(or at index zero, if no C<pos> position is defined).
1128*0Sstevel@tonic-gateIn other words, the C<extract_...> subroutines I<don't>
1129*0Sstevel@tonic-gateextract the first occurance of a substring anywhere
1130*0Sstevel@tonic-gatein a string (like an unanchored regex would). Rather,
1131*0Sstevel@tonic-gatethey extract an occurance of the substring appearing
1132*0Sstevel@tonic-gateimmediately at the current matching position in the
1133*0Sstevel@tonic-gatestring (like a C<\G>-anchored regex would).
1134*0Sstevel@tonic-gate
1135*0Sstevel@tonic-gate
1136*0Sstevel@tonic-gate
1137*0Sstevel@tonic-gate=head2 General behaviour in list contexts
1138*0Sstevel@tonic-gate
1139*0Sstevel@tonic-gateIn a list context, all the subroutines return a list, the first three
1140*0Sstevel@tonic-gateelements of which are always:
1141*0Sstevel@tonic-gate
1142*0Sstevel@tonic-gate=over 4
1143*0Sstevel@tonic-gate
1144*0Sstevel@tonic-gate=item [0]
1145*0Sstevel@tonic-gate
1146*0Sstevel@tonic-gateThe extracted string, including the specified delimiters.
1147*0Sstevel@tonic-gateIf the extraction fails an empty string is returned.
1148*0Sstevel@tonic-gate
1149*0Sstevel@tonic-gate=item [1]
1150*0Sstevel@tonic-gate
1151*0Sstevel@tonic-gateThe remainder of the input string (i.e. the characters after the
1152*0Sstevel@tonic-gateextracted string). On failure, the entire string is returned.
1153*0Sstevel@tonic-gate
1154*0Sstevel@tonic-gate=item [2]
1155*0Sstevel@tonic-gate
1156*0Sstevel@tonic-gateThe skipped prefix (i.e. the characters before the extracted string).
1157*0Sstevel@tonic-gateOn failure, the empty string is returned.
1158*0Sstevel@tonic-gate
1159*0Sstevel@tonic-gate=back
1160*0Sstevel@tonic-gate
1161*0Sstevel@tonic-gateNote that in a list context, the contents of the original input text (the first
1162*0Sstevel@tonic-gateargument) are not modified in any way.
1163*0Sstevel@tonic-gate
1164*0Sstevel@tonic-gateHowever, if the input text was passed in a variable, that variable's
1165*0Sstevel@tonic-gateC<pos> value is updated to point at the first character after the
1166*0Sstevel@tonic-gateextracted text. That means that in a list context the various
1167*0Sstevel@tonic-gatesubroutines can be used much like regular expressions. For example:
1168*0Sstevel@tonic-gate
1169*0Sstevel@tonic-gate	while ( $next = (extract_quotelike($text))[0] )
1170*0Sstevel@tonic-gate	{
1171*0Sstevel@tonic-gate		# process next quote-like (in $next)
1172*0Sstevel@tonic-gate	}
1173*0Sstevel@tonic-gate
1174*0Sstevel@tonic-gate
1175*0Sstevel@tonic-gate=head2 General behaviour in scalar and void contexts
1176*0Sstevel@tonic-gate
1177*0Sstevel@tonic-gateIn a scalar context, the extracted string is returned, having first been
1178*0Sstevel@tonic-gateremoved from the input text. Thus, the following code also processes
1179*0Sstevel@tonic-gateeach quote-like operation, but actually removes them from $text:
1180*0Sstevel@tonic-gate
1181*0Sstevel@tonic-gate	while ( $next = extract_quotelike($text) )
1182*0Sstevel@tonic-gate	{
1183*0Sstevel@tonic-gate		# process next quote-like (in $next)
1184*0Sstevel@tonic-gate	}
1185*0Sstevel@tonic-gate
1186*0Sstevel@tonic-gateNote that if the input text is a read-only string (i.e. a literal),
1187*0Sstevel@tonic-gateno attempt is made to remove the extracted text.
1188*0Sstevel@tonic-gate
1189*0Sstevel@tonic-gateIn a void context the behaviour of the extraction subroutines is
1190*0Sstevel@tonic-gateexactly the same as in a scalar context, except (of course) that the
1191*0Sstevel@tonic-gateextracted substring is not returned.
1192*0Sstevel@tonic-gate
1193*0Sstevel@tonic-gate=head2 A note about prefixes
1194*0Sstevel@tonic-gate
1195*0Sstevel@tonic-gatePrefix patterns are matched without any trailing modifiers (C</gimsox> etc.)
1196*0Sstevel@tonic-gateThis can bite you if you're expecting a prefix specification like
1197*0Sstevel@tonic-gate'.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix
1198*0Sstevel@tonic-gatepattern will only succeed if the <H1> tag is on the current line, since
1199*0Sstevel@tonic-gate. normally doesn't match newlines.
1200*0Sstevel@tonic-gate
1201*0Sstevel@tonic-gateTo overcome this limitation, you need to turn on /s matching within
1202*0Sstevel@tonic-gatethe prefix pattern, using the C<(?s)> directive: '(?s).*?(?=<H1>)'
1203*0Sstevel@tonic-gate
1204*0Sstevel@tonic-gate
1205*0Sstevel@tonic-gate=head2 C<extract_delimited>
1206*0Sstevel@tonic-gate
1207*0Sstevel@tonic-gateThe C<extract_delimited> function formalizes the common idiom
1208*0Sstevel@tonic-gateof extracting a single-character-delimited substring from the start of
1209*0Sstevel@tonic-gatea string. For example, to extract a single-quote delimited string, the
1210*0Sstevel@tonic-gatefollowing code is typically used:
1211*0Sstevel@tonic-gate
1212*0Sstevel@tonic-gate	($remainder = $text) =~ s/\A('(\\.|[^'])*')//s;
1213*0Sstevel@tonic-gate	$extracted = $1;
1214*0Sstevel@tonic-gate
1215*0Sstevel@tonic-gatebut with C<extract_delimited> it can be simplified to:
1216*0Sstevel@tonic-gate
1217*0Sstevel@tonic-gate	($extracted,$remainder) = extract_delimited($text, "'");
1218*0Sstevel@tonic-gate
1219*0Sstevel@tonic-gateC<extract_delimited> takes up to four scalars (the input text, the
1220*0Sstevel@tonic-gatedelimiters, a prefix pattern to be skipped, and any escape characters)
1221*0Sstevel@tonic-gateand extracts the initial substring of the text that
1222*0Sstevel@tonic-gateis appropriately delimited. If the delimiter string has multiple
1223*0Sstevel@tonic-gatecharacters, the first one encountered in the text is taken to delimit
1224*0Sstevel@tonic-gatethe substring.
1225*0Sstevel@tonic-gateThe third argument specifies a prefix pattern that is to be skipped
1226*0Sstevel@tonic-gate(but must be present!) before the substring is extracted.
1227*0Sstevel@tonic-gateThe final argument specifies the escape character to be used for each
1228*0Sstevel@tonic-gatedelimiter.
1229*0Sstevel@tonic-gate
1230*0Sstevel@tonic-gateAll arguments are optional. If the escape characters are not specified,
1231*0Sstevel@tonic-gateevery delimiter is escaped with a backslash (C<\>).
1232*0Sstevel@tonic-gateIf the prefix is not specified, the
1233*0Sstevel@tonic-gatepattern C<'\s*'> - optional whitespace - is used. If the delimiter set
1234*0Sstevel@tonic-gateis also not specified, the set C</["'`]/> is used. If the text to be processed
1235*0Sstevel@tonic-gateis not specified either, C<$_> is used.
1236*0Sstevel@tonic-gate
1237*0Sstevel@tonic-gateIn list context, C<extract_delimited> returns a array of three
1238*0Sstevel@tonic-gateelements, the extracted substring (I<including the surrounding
1239*0Sstevel@tonic-gatedelimiters>), the remainder of the text, and the skipped prefix (if
1240*0Sstevel@tonic-gateany). If a suitable delimited substring is not found, the first
1241*0Sstevel@tonic-gateelement of the array is the empty string, the second is the complete
1242*0Sstevel@tonic-gateoriginal text, and the prefix returned in the third element is an
1243*0Sstevel@tonic-gateempty string.
1244*0Sstevel@tonic-gate
1245*0Sstevel@tonic-gateIn a scalar context, just the extracted substring is returned. In
1246*0Sstevel@tonic-gatea void context, the extracted substring (and any prefix) are simply
1247*0Sstevel@tonic-gateremoved from the beginning of the first argument.
1248*0Sstevel@tonic-gate
1249*0Sstevel@tonic-gateExamples:
1250*0Sstevel@tonic-gate
1251*0Sstevel@tonic-gate	# Remove a single-quoted substring from the very beginning of $text:
1252*0Sstevel@tonic-gate
1253*0Sstevel@tonic-gate		$substring = extract_delimited($text, "'", '');
1254*0Sstevel@tonic-gate
1255*0Sstevel@tonic-gate	# Remove a single-quoted Pascalish substring (i.e. one in which
1256*0Sstevel@tonic-gate	# doubling the quote character escapes it) from the very
1257*0Sstevel@tonic-gate	# beginning of $text:
1258*0Sstevel@tonic-gate
1259*0Sstevel@tonic-gate		$substring = extract_delimited($text, "'", '', "'");
1260*0Sstevel@tonic-gate
1261*0Sstevel@tonic-gate	# Extract a single- or double- quoted substring from the
1262*0Sstevel@tonic-gate	# beginning of $text, optionally after some whitespace
1263*0Sstevel@tonic-gate	# (note the list context to protect $text from modification):
1264*0Sstevel@tonic-gate
1265*0Sstevel@tonic-gate		($substring) = extract_delimited $text, q{"'};
1266*0Sstevel@tonic-gate
1267*0Sstevel@tonic-gate
1268*0Sstevel@tonic-gate	# Delete the substring delimited by the first '/' in $text:
1269*0Sstevel@tonic-gate
1270*0Sstevel@tonic-gate		$text = join '', (extract_delimited($text,'/','[^/]*')[2,1];
1271*0Sstevel@tonic-gate
1272*0Sstevel@tonic-gateNote that this last example is I<not> the same as deleting the first
1273*0Sstevel@tonic-gatequote-like pattern. For instance, if C<$text> contained the string:
1274*0Sstevel@tonic-gate
1275*0Sstevel@tonic-gate	"if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }"
1276*0Sstevel@tonic-gate
1277*0Sstevel@tonic-gatethen after the deletion it would contain:
1278*0Sstevel@tonic-gate
1279*0Sstevel@tonic-gate	"if ('.$UNIXCMD/s) { $cmd = $1; }"
1280*0Sstevel@tonic-gate
1281*0Sstevel@tonic-gatenot:
1282*0Sstevel@tonic-gate
1283*0Sstevel@tonic-gate	"if ('./cmd' =~ ms) { $cmd = $1; }"
1284*0Sstevel@tonic-gate
1285*0Sstevel@tonic-gate
1286*0Sstevel@tonic-gateSee L<"extract_quotelike"> for a (partial) solution to this problem.
1287*0Sstevel@tonic-gate
1288*0Sstevel@tonic-gate
1289*0Sstevel@tonic-gate=head2 C<extract_bracketed>
1290*0Sstevel@tonic-gate
1291*0Sstevel@tonic-gateLike C<"extract_delimited">, the C<extract_bracketed> function takes
1292*0Sstevel@tonic-gateup to three optional scalar arguments: a string to extract from, a delimiter
1293*0Sstevel@tonic-gatespecifier, and a prefix pattern. As before, a missing prefix defaults to
1294*0Sstevel@tonic-gateoptional whitespace and a missing text defaults to C<$_>. However, a missing
1295*0Sstevel@tonic-gatedelimiter specifier defaults to C<'{}()[]E<lt>E<gt>'> (see below).
1296*0Sstevel@tonic-gate
1297*0Sstevel@tonic-gateC<extract_bracketed> extracts a balanced-bracket-delimited
1298*0Sstevel@tonic-gatesubstring (using any one (or more) of the user-specified delimiter
1299*0Sstevel@tonic-gatebrackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also
1300*0Sstevel@tonic-gaterespect quoted unbalanced brackets (see below).
1301*0Sstevel@tonic-gate
1302*0Sstevel@tonic-gateA "delimiter bracket" is a bracket in list of delimiters passed as
1303*0Sstevel@tonic-gateC<extract_bracketed>'s second argument. Delimiter brackets are
1304*0Sstevel@tonic-gatespecified by giving either the left or right (or both!) versions
1305*0Sstevel@tonic-gateof the required bracket(s). Note that the order in which
1306*0Sstevel@tonic-gatetwo or more delimiter brackets are specified is not significant.
1307*0Sstevel@tonic-gate
1308*0Sstevel@tonic-gateA "balanced-bracket-delimited substring" is a substring bounded by
1309*0Sstevel@tonic-gatematched brackets, such that any other (left or right) delimiter
1310*0Sstevel@tonic-gatebracket I<within> the substring is also matched by an opposite
1311*0Sstevel@tonic-gate(right or left) delimiter bracket I<at the same level of nesting>. Any
1312*0Sstevel@tonic-gatetype of bracket not in the delimiter list is treated as an ordinary
1313*0Sstevel@tonic-gatecharacter.
1314*0Sstevel@tonic-gate
1315*0Sstevel@tonic-gateIn other words, each type of bracket specified as a delimiter must be
1316*0Sstevel@tonic-gatebalanced and correctly nested within the substring, and any other kind of
1317*0Sstevel@tonic-gate("non-delimiter") bracket in the substring is ignored.
1318*0Sstevel@tonic-gate
1319*0Sstevel@tonic-gateFor example, given the string:
1320*0Sstevel@tonic-gate
1321*0Sstevel@tonic-gate	$text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }";
1322*0Sstevel@tonic-gate
1323*0Sstevel@tonic-gatethen a call to C<extract_bracketed> in a list context:
1324*0Sstevel@tonic-gate
1325*0Sstevel@tonic-gate	@result = extract_bracketed( $text, '{}' );
1326*0Sstevel@tonic-gate
1327*0Sstevel@tonic-gatewould return:
1328*0Sstevel@tonic-gate
1329*0Sstevel@tonic-gate	( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" )
1330*0Sstevel@tonic-gate
1331*0Sstevel@tonic-gatesince both sets of C<'{..}'> brackets are properly nested and evenly balanced.
1332*0Sstevel@tonic-gate(In a scalar context just the first element of the array would be returned. In
1333*0Sstevel@tonic-gatea void context, C<$text> would be replaced by an empty string.)
1334*0Sstevel@tonic-gate
1335*0Sstevel@tonic-gateLikewise the call in:
1336*0Sstevel@tonic-gate
1337*0Sstevel@tonic-gate	@result = extract_bracketed( $text, '{[' );
1338*0Sstevel@tonic-gate
1339*0Sstevel@tonic-gatewould return the same result, since all sets of both types of specified
1340*0Sstevel@tonic-gatedelimiter brackets are correctly nested and balanced.
1341*0Sstevel@tonic-gate
1342*0Sstevel@tonic-gateHowever, the call in:
1343*0Sstevel@tonic-gate
1344*0Sstevel@tonic-gate	@result = extract_bracketed( $text, '{([<' );
1345*0Sstevel@tonic-gate
1346*0Sstevel@tonic-gatewould fail, returning:
1347*0Sstevel@tonic-gate
1348*0Sstevel@tonic-gate	( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }"  );
1349*0Sstevel@tonic-gate
1350*0Sstevel@tonic-gatebecause the embedded pairs of C<'(..)'>s and C<'[..]'>s are "cross-nested" and
1351*0Sstevel@tonic-gatethe embedded C<'E<gt>'> is unbalanced. (In a scalar context, this call would
1352*0Sstevel@tonic-gatereturn an empty string. In a void context, C<$text> would be unchanged.)
1353*0Sstevel@tonic-gate
1354*0Sstevel@tonic-gateNote that the embedded single-quotes in the string don't help in this
1355*0Sstevel@tonic-gatecase, since they have not been specified as acceptable delimiters and are
1356*0Sstevel@tonic-gatetherefore treated as non-delimiter characters (and ignored).
1357*0Sstevel@tonic-gate
1358*0Sstevel@tonic-gateHowever, if a particular species of quote character is included in the
1359*0Sstevel@tonic-gatedelimiter specification, then that type of quote will be correctly handled.
1360*0Sstevel@tonic-gatefor example, if C<$text> is:
1361*0Sstevel@tonic-gate
1362*0Sstevel@tonic-gate	$text = '<A HREF=">>>>">link</A>';
1363*0Sstevel@tonic-gate
1364*0Sstevel@tonic-gatethen
1365*0Sstevel@tonic-gate
1366*0Sstevel@tonic-gate	@result = extract_bracketed( $text, '<">' );
1367*0Sstevel@tonic-gate
1368*0Sstevel@tonic-gatereturns:
1369*0Sstevel@tonic-gate
1370*0Sstevel@tonic-gate	( '<A HREF=">>>>">', 'link</A>', "" )
1371*0Sstevel@tonic-gate
1372*0Sstevel@tonic-gateas expected. Without the specification of C<"> as an embedded quoter:
1373*0Sstevel@tonic-gate
1374*0Sstevel@tonic-gate	@result = extract_bracketed( $text, '<>' );
1375*0Sstevel@tonic-gate
1376*0Sstevel@tonic-gatethe result would be:
1377*0Sstevel@tonic-gate
1378*0Sstevel@tonic-gate	( '<A HREF=">', '>>>">link</A>', "" )
1379*0Sstevel@tonic-gate
1380*0Sstevel@tonic-gateIn addition to the quote delimiters C<'>, C<">, and C<`>, full Perl quote-like
1381*0Sstevel@tonic-gatequoting (i.e. q{string}, qq{string}, etc) can be specified by including the
1382*0Sstevel@tonic-gateletter 'q' as a delimiter. Hence:
1383*0Sstevel@tonic-gate
1384*0Sstevel@tonic-gate	@result = extract_bracketed( $text, '<q>' );
1385*0Sstevel@tonic-gate
1386*0Sstevel@tonic-gatewould correctly match something like this:
1387*0Sstevel@tonic-gate
1388*0Sstevel@tonic-gate	$text = '<leftop: conj /and/ conj>';
1389*0Sstevel@tonic-gate
1390*0Sstevel@tonic-gateSee also: C<"extract_quotelike"> and C<"extract_codeblock">.
1391*0Sstevel@tonic-gate
1392*0Sstevel@tonic-gate
1393*0Sstevel@tonic-gate=head2 C<extract_variable>
1394*0Sstevel@tonic-gate
1395*0Sstevel@tonic-gateC<extract_variable> extracts any valid Perl variable or
1396*0Sstevel@tonic-gatevariable-involved expression, including scalars, arrays, hashes, array
1397*0Sstevel@tonic-gateaccesses, hash look-ups, method calls through objects, subroutine calles
1398*0Sstevel@tonic-gatethrough subroutine references, etc.
1399*0Sstevel@tonic-gate
1400*0Sstevel@tonic-gateThe subroutine takes up to two optional arguments:
1401*0Sstevel@tonic-gate
1402*0Sstevel@tonic-gate=over 4
1403*0Sstevel@tonic-gate
1404*0Sstevel@tonic-gate=item 1.
1405*0Sstevel@tonic-gate
1406*0Sstevel@tonic-gateA string to be processed (C<$_> if the string is omitted or C<undef>)
1407*0Sstevel@tonic-gate
1408*0Sstevel@tonic-gate=item 2.
1409*0Sstevel@tonic-gate
1410*0Sstevel@tonic-gateA string specifying a pattern to be matched as a prefix (which is to be
1411*0Sstevel@tonic-gateskipped). If omitted, optional whitespace is skipped.
1412*0Sstevel@tonic-gate
1413*0Sstevel@tonic-gate=back
1414*0Sstevel@tonic-gate
1415*0Sstevel@tonic-gateOn success in a list context, an array of 3 elements is returned. The
1416*0Sstevel@tonic-gateelements are:
1417*0Sstevel@tonic-gate
1418*0Sstevel@tonic-gate=over 4
1419*0Sstevel@tonic-gate
1420*0Sstevel@tonic-gate=item [0]
1421*0Sstevel@tonic-gate
1422*0Sstevel@tonic-gatethe extracted variable, or variablish expression
1423*0Sstevel@tonic-gate
1424*0Sstevel@tonic-gate=item [1]
1425*0Sstevel@tonic-gate
1426*0Sstevel@tonic-gatethe remainder of the input text,
1427*0Sstevel@tonic-gate
1428*0Sstevel@tonic-gate=item [2]
1429*0Sstevel@tonic-gate
1430*0Sstevel@tonic-gatethe prefix substring (if any),
1431*0Sstevel@tonic-gate
1432*0Sstevel@tonic-gate=back
1433*0Sstevel@tonic-gate
1434*0Sstevel@tonic-gateOn failure, all of these values (except the remaining text) are C<undef>.
1435*0Sstevel@tonic-gate
1436*0Sstevel@tonic-gateIn a scalar context, C<extract_variable> returns just the complete
1437*0Sstevel@tonic-gatesubstring that matched a variablish expression. C<undef> is returned on
1438*0Sstevel@tonic-gatefailure. In addition, the original input text has the returned substring
1439*0Sstevel@tonic-gate(and any prefix) removed from it.
1440*0Sstevel@tonic-gate
1441*0Sstevel@tonic-gateIn a void context, the input text just has the matched substring (and
1442*0Sstevel@tonic-gateany specified prefix) removed.
1443*0Sstevel@tonic-gate
1444*0Sstevel@tonic-gate
1445*0Sstevel@tonic-gate=head2 C<extract_tagged>
1446*0Sstevel@tonic-gate
1447*0Sstevel@tonic-gateC<extract_tagged> extracts and segments text between (balanced)
1448*0Sstevel@tonic-gatespecified tags.
1449*0Sstevel@tonic-gate
1450*0Sstevel@tonic-gateThe subroutine takes up to five optional arguments:
1451*0Sstevel@tonic-gate
1452*0Sstevel@tonic-gate=over 4
1453*0Sstevel@tonic-gate
1454*0Sstevel@tonic-gate=item 1.
1455*0Sstevel@tonic-gate
1456*0Sstevel@tonic-gateA string to be processed (C<$_> if the string is omitted or C<undef>)
1457*0Sstevel@tonic-gate
1458*0Sstevel@tonic-gate=item 2.
1459*0Sstevel@tonic-gate
1460*0Sstevel@tonic-gateA string specifying a pattern to be matched as the opening tag.
1461*0Sstevel@tonic-gateIf the pattern string is omitted (or C<undef>) then a pattern
1462*0Sstevel@tonic-gatethat matches any standard XML tag is used.
1463*0Sstevel@tonic-gate
1464*0Sstevel@tonic-gate=item 3.
1465*0Sstevel@tonic-gate
1466*0Sstevel@tonic-gateA string specifying a pattern to be matched at the closing tag.
1467*0Sstevel@tonic-gateIf the pattern string is omitted (or C<undef>) then the closing
1468*0Sstevel@tonic-gatetag is constructed by inserting a C</> after any leading bracket
1469*0Sstevel@tonic-gatecharacters in the actual opening tag that was matched (I<not> the pattern
1470*0Sstevel@tonic-gatethat matched the tag). For example, if the opening tag pattern
1471*0Sstevel@tonic-gateis specified as C<'{{\w+}}'> and actually matched the opening tag
1472*0Sstevel@tonic-gateC<"{{DATA}}">, then the constructed closing tag would be C<"{{/DATA}}">.
1473*0Sstevel@tonic-gate
1474*0Sstevel@tonic-gate=item 4.
1475*0Sstevel@tonic-gate
1476*0Sstevel@tonic-gateA string specifying a pattern to be matched as a prefix (which is to be
1477*0Sstevel@tonic-gateskipped). If omitted, optional whitespace is skipped.
1478*0Sstevel@tonic-gate
1479*0Sstevel@tonic-gate=item 5.
1480*0Sstevel@tonic-gate
1481*0Sstevel@tonic-gateA hash reference containing various parsing options (see below)
1482*0Sstevel@tonic-gate
1483*0Sstevel@tonic-gate=back
1484*0Sstevel@tonic-gate
1485*0Sstevel@tonic-gateThe various options that can be specified are:
1486*0Sstevel@tonic-gate
1487*0Sstevel@tonic-gate=over 4
1488*0Sstevel@tonic-gate
1489*0Sstevel@tonic-gate=item C<reject =E<gt> $listref>
1490*0Sstevel@tonic-gate
1491*0Sstevel@tonic-gateThe list reference contains one or more strings specifying patterns
1492*0Sstevel@tonic-gatethat must I<not> appear within the tagged text.
1493*0Sstevel@tonic-gate
1494*0Sstevel@tonic-gateFor example, to extract
1495*0Sstevel@tonic-gatean HTML link (which should not contain nested links) use:
1496*0Sstevel@tonic-gate
1497*0Sstevel@tonic-gate        extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} );
1498*0Sstevel@tonic-gate
1499*0Sstevel@tonic-gate=item C<ignore =E<gt> $listref>
1500*0Sstevel@tonic-gate
1501*0Sstevel@tonic-gateThe list reference contains one or more strings specifying patterns
1502*0Sstevel@tonic-gatethat are I<not> be be treated as nested tags within the tagged text
1503*0Sstevel@tonic-gate(even if they would match the start tag pattern).
1504*0Sstevel@tonic-gate
1505*0Sstevel@tonic-gateFor example, to extract an arbitrary XML tag, but ignore "empty" elements:
1506*0Sstevel@tonic-gate
1507*0Sstevel@tonic-gate        extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} );
1508*0Sstevel@tonic-gate
1509*0Sstevel@tonic-gate(also see L<"gen_delimited_pat"> below).
1510*0Sstevel@tonic-gate
1511*0Sstevel@tonic-gate
1512*0Sstevel@tonic-gate=item C<fail =E<gt> $str>
1513*0Sstevel@tonic-gate
1514*0Sstevel@tonic-gateThe C<fail> option indicates the action to be taken if a matching end
1515*0Sstevel@tonic-gatetag is not encountered (i.e. before the end of the string or some
1516*0Sstevel@tonic-gateC<reject> pattern matches). By default, a failure to match a closing
1517*0Sstevel@tonic-gatetag causes C<extract_tagged> to immediately fail.
1518*0Sstevel@tonic-gate
1519*0Sstevel@tonic-gateHowever, if the string value associated with <reject> is "MAX", then
1520*0Sstevel@tonic-gateC<extract_tagged> returns the complete text up to the point of failure.
1521*0Sstevel@tonic-gateIf the string is "PARA", C<extract_tagged> returns only the first paragraph
1522*0Sstevel@tonic-gateafter the tag (up to the first line that is either empty or contains
1523*0Sstevel@tonic-gateonly whitespace characters).
1524*0Sstevel@tonic-gateIf the string is "", the the default behaviour (i.e. failure) is reinstated.
1525*0Sstevel@tonic-gate
1526*0Sstevel@tonic-gateFor example, suppose the start tag "/para" introduces a paragraph, which then
1527*0Sstevel@tonic-gatecontinues until the next "/endpara" tag or until another "/para" tag is
1528*0Sstevel@tonic-gateencountered:
1529*0Sstevel@tonic-gate
1530*0Sstevel@tonic-gate        $text = "/para line 1\n\nline 3\n/para line 4";
1531*0Sstevel@tonic-gate
1532*0Sstevel@tonic-gate        extract_tagged($text, '/para', '/endpara', undef,
1533*0Sstevel@tonic-gate                                {reject => '/para', fail => MAX );
1534*0Sstevel@tonic-gate
1535*0Sstevel@tonic-gate        # EXTRACTED: "/para line 1\n\nline 3\n"
1536*0Sstevel@tonic-gate
1537*0Sstevel@tonic-gateSuppose instead, that if no matching "/endpara" tag is found, the "/para"
1538*0Sstevel@tonic-gatetag refers only to the immediately following paragraph:
1539*0Sstevel@tonic-gate
1540*0Sstevel@tonic-gate        $text = "/para line 1\n\nline 3\n/para line 4";
1541*0Sstevel@tonic-gate
1542*0Sstevel@tonic-gate        extract_tagged($text, '/para', '/endpara', undef,
1543*0Sstevel@tonic-gate                        {reject => '/para', fail => MAX );
1544*0Sstevel@tonic-gate
1545*0Sstevel@tonic-gate        # EXTRACTED: "/para line 1\n"
1546*0Sstevel@tonic-gate
1547*0Sstevel@tonic-gateNote that the specified C<fail> behaviour applies to nested tags as well.
1548*0Sstevel@tonic-gate
1549*0Sstevel@tonic-gate=back
1550*0Sstevel@tonic-gate
1551*0Sstevel@tonic-gateOn success in a list context, an array of 6 elements is returned. The elements are:
1552*0Sstevel@tonic-gate
1553*0Sstevel@tonic-gate=over 4
1554*0Sstevel@tonic-gate
1555*0Sstevel@tonic-gate=item [0]
1556*0Sstevel@tonic-gate
1557*0Sstevel@tonic-gatethe extracted tagged substring (including the outermost tags),
1558*0Sstevel@tonic-gate
1559*0Sstevel@tonic-gate=item [1]
1560*0Sstevel@tonic-gate
1561*0Sstevel@tonic-gatethe remainder of the input text,
1562*0Sstevel@tonic-gate
1563*0Sstevel@tonic-gate=item [2]
1564*0Sstevel@tonic-gate
1565*0Sstevel@tonic-gatethe prefix substring (if any),
1566*0Sstevel@tonic-gate
1567*0Sstevel@tonic-gate=item [3]
1568*0Sstevel@tonic-gate
1569*0Sstevel@tonic-gatethe opening tag
1570*0Sstevel@tonic-gate
1571*0Sstevel@tonic-gate=item [4]
1572*0Sstevel@tonic-gate
1573*0Sstevel@tonic-gatethe text between the opening and closing tags
1574*0Sstevel@tonic-gate
1575*0Sstevel@tonic-gate=item [5]
1576*0Sstevel@tonic-gate
1577*0Sstevel@tonic-gatethe closing tag (or "" if no closing tag was found)
1578*0Sstevel@tonic-gate
1579*0Sstevel@tonic-gate=back
1580*0Sstevel@tonic-gate
1581*0Sstevel@tonic-gateOn failure, all of these values (except the remaining text) are C<undef>.
1582*0Sstevel@tonic-gate
1583*0Sstevel@tonic-gateIn a scalar context, C<extract_tagged> returns just the complete
1584*0Sstevel@tonic-gatesubstring that matched a tagged text (including the start and end
1585*0Sstevel@tonic-gatetags). C<undef> is returned on failure. In addition, the original input
1586*0Sstevel@tonic-gatetext has the returned substring (and any prefix) removed from it.
1587*0Sstevel@tonic-gate
1588*0Sstevel@tonic-gateIn a void context, the input text just has the matched substring (and
1589*0Sstevel@tonic-gateany specified prefix) removed.
1590*0Sstevel@tonic-gate
1591*0Sstevel@tonic-gate
1592*0Sstevel@tonic-gate=head2 C<gen_extract_tagged>
1593*0Sstevel@tonic-gate
1594*0Sstevel@tonic-gate(Note: This subroutine is only available under Perl5.005)
1595*0Sstevel@tonic-gate
1596*0Sstevel@tonic-gateC<gen_extract_tagged> generates a new anonymous subroutine which
1597*0Sstevel@tonic-gateextracts text between (balanced) specified tags. In other words,
1598*0Sstevel@tonic-gateit generates a function identical in function to C<extract_tagged>.
1599*0Sstevel@tonic-gate
1600*0Sstevel@tonic-gateThe difference between C<extract_tagged> and the anonymous
1601*0Sstevel@tonic-gatesubroutines generated by
1602*0Sstevel@tonic-gateC<gen_extract_tagged>, is that those generated subroutines:
1603*0Sstevel@tonic-gate
1604*0Sstevel@tonic-gate=over 4
1605*0Sstevel@tonic-gate
1606*0Sstevel@tonic-gate=item *
1607*0Sstevel@tonic-gate
1608*0Sstevel@tonic-gatedo not have to reparse tag specification or parsing options every time
1609*0Sstevel@tonic-gatethey are called (whereas C<extract_tagged> has to effectively rebuild
1610*0Sstevel@tonic-gateits tag parser on every call);
1611*0Sstevel@tonic-gate
1612*0Sstevel@tonic-gate=item *
1613*0Sstevel@tonic-gate
1614*0Sstevel@tonic-gatemake use of the new qr// construct to pre-compile the regexes they use
1615*0Sstevel@tonic-gate(whereas C<extract_tagged> uses standard string variable interpolation
1616*0Sstevel@tonic-gateto create tag-matching patterns).
1617*0Sstevel@tonic-gate
1618*0Sstevel@tonic-gate=back
1619*0Sstevel@tonic-gate
1620*0Sstevel@tonic-gateThe subroutine takes up to four optional arguments (the same set as
1621*0Sstevel@tonic-gateC<extract_tagged> except for the string to be processed). It returns
1622*0Sstevel@tonic-gatea reference to a subroutine which in turn takes a single argument (the text to
1623*0Sstevel@tonic-gatebe extracted from).
1624*0Sstevel@tonic-gate
1625*0Sstevel@tonic-gateIn other words, the implementation of C<extract_tagged> is exactly
1626*0Sstevel@tonic-gateequivalent to:
1627*0Sstevel@tonic-gate
1628*0Sstevel@tonic-gate        sub extract_tagged
1629*0Sstevel@tonic-gate        {
1630*0Sstevel@tonic-gate                my $text = shift;
1631*0Sstevel@tonic-gate                $extractor = gen_extract_tagged(@_);
1632*0Sstevel@tonic-gate                return $extractor->($text);
1633*0Sstevel@tonic-gate        }
1634*0Sstevel@tonic-gate
1635*0Sstevel@tonic-gate(although C<extract_tagged> is not currently implemented that way, in order
1636*0Sstevel@tonic-gateto preserve pre-5.005 compatibility).
1637*0Sstevel@tonic-gate
1638*0Sstevel@tonic-gateUsing C<gen_extract_tagged> to create extraction functions for specific tags
1639*0Sstevel@tonic-gateis a good idea if those functions are going to be called more than once, since
1640*0Sstevel@tonic-gatetheir performance is typically twice as good as the more general-purpose
1641*0Sstevel@tonic-gateC<extract_tagged>.
1642*0Sstevel@tonic-gate
1643*0Sstevel@tonic-gate
1644*0Sstevel@tonic-gate=head2 C<extract_quotelike>
1645*0Sstevel@tonic-gate
1646*0Sstevel@tonic-gateC<extract_quotelike> attempts to recognize, extract, and segment any
1647*0Sstevel@tonic-gateone of the various Perl quotes and quotelike operators (see
1648*0Sstevel@tonic-gateL<perlop(3)>) Nested backslashed delimiters, embedded balanced bracket
1649*0Sstevel@tonic-gatedelimiters (for the quotelike operators), and trailing modifiers are
1650*0Sstevel@tonic-gateall caught. For example, in:
1651*0Sstevel@tonic-gate
1652*0Sstevel@tonic-gate        extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #'
1653*0Sstevel@tonic-gate
1654*0Sstevel@tonic-gate        extract_quotelike '  "You said, \"Use sed\"."  '
1655*0Sstevel@tonic-gate
1656*0Sstevel@tonic-gate        extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; '
1657*0Sstevel@tonic-gate
1658*0Sstevel@tonic-gate        extract_quotelike ' tr/\\\/\\\\/\\\//ds; '
1659*0Sstevel@tonic-gate
1660*0Sstevel@tonic-gatethe full Perl quotelike operations are all extracted correctly.
1661*0Sstevel@tonic-gate
1662*0Sstevel@tonic-gateNote too that, when using the /x modifier on a regex, any comment
1663*0Sstevel@tonic-gatecontaining the current pattern delimiter will cause the regex to be
1664*0Sstevel@tonic-gateimmediately terminated. In other words:
1665*0Sstevel@tonic-gate
1666*0Sstevel@tonic-gate        'm /
1667*0Sstevel@tonic-gate                (?i)            # CASE INSENSITIVE
1668*0Sstevel@tonic-gate                [a-z_]          # LEADING ALPHABETIC/UNDERSCORE
1669*0Sstevel@tonic-gate                [a-z0-9]*       # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS
1670*0Sstevel@tonic-gate           /x'
1671*0Sstevel@tonic-gate
1672*0Sstevel@tonic-gatewill be extracted as if it were:
1673*0Sstevel@tonic-gate
1674*0Sstevel@tonic-gate        'm /
1675*0Sstevel@tonic-gate                (?i)            # CASE INSENSITIVE
1676*0Sstevel@tonic-gate                [a-z_]          # LEADING ALPHABETIC/'
1677*0Sstevel@tonic-gate
1678*0Sstevel@tonic-gateThis behaviour is identical to that of the actual compiler.
1679*0Sstevel@tonic-gate
1680*0Sstevel@tonic-gateC<extract_quotelike> takes two arguments: the text to be processed and
1681*0Sstevel@tonic-gatea prefix to be matched at the very beginning of the text. If no prefix
1682*0Sstevel@tonic-gateis specified, optional whitespace is the default. If no text is given,
1683*0Sstevel@tonic-gateC<$_> is used.
1684*0Sstevel@tonic-gate
1685*0Sstevel@tonic-gateIn a list context, an array of 11 elements is returned. The elements are:
1686*0Sstevel@tonic-gate
1687*0Sstevel@tonic-gate=over 4
1688*0Sstevel@tonic-gate
1689*0Sstevel@tonic-gate=item [0]
1690*0Sstevel@tonic-gate
1691*0Sstevel@tonic-gatethe extracted quotelike substring (including trailing modifiers),
1692*0Sstevel@tonic-gate
1693*0Sstevel@tonic-gate=item [1]
1694*0Sstevel@tonic-gate
1695*0Sstevel@tonic-gatethe remainder of the input text,
1696*0Sstevel@tonic-gate
1697*0Sstevel@tonic-gate=item [2]
1698*0Sstevel@tonic-gate
1699*0Sstevel@tonic-gatethe prefix substring (if any),
1700*0Sstevel@tonic-gate
1701*0Sstevel@tonic-gate=item [3]
1702*0Sstevel@tonic-gate
1703*0Sstevel@tonic-gatethe name of the quotelike operator (if any),
1704*0Sstevel@tonic-gate
1705*0Sstevel@tonic-gate=item [4]
1706*0Sstevel@tonic-gate
1707*0Sstevel@tonic-gatethe left delimiter of the first block of the operation,
1708*0Sstevel@tonic-gate
1709*0Sstevel@tonic-gate=item [5]
1710*0Sstevel@tonic-gate
1711*0Sstevel@tonic-gatethe text of the first block of the operation
1712*0Sstevel@tonic-gate(that is, the contents of
1713*0Sstevel@tonic-gatea quote, the regex of a match or substitution or the target list of a
1714*0Sstevel@tonic-gatetranslation),
1715*0Sstevel@tonic-gate
1716*0Sstevel@tonic-gate=item [6]
1717*0Sstevel@tonic-gate
1718*0Sstevel@tonic-gatethe right delimiter of the first block of the operation,
1719*0Sstevel@tonic-gate
1720*0Sstevel@tonic-gate=item [7]
1721*0Sstevel@tonic-gate
1722*0Sstevel@tonic-gatethe left delimiter of the second block of the operation
1723*0Sstevel@tonic-gate(that is, if it is a C<s>, C<tr>, or C<y>),
1724*0Sstevel@tonic-gate
1725*0Sstevel@tonic-gate=item [8]
1726*0Sstevel@tonic-gate
1727*0Sstevel@tonic-gatethe text of the second block of the operation
1728*0Sstevel@tonic-gate(that is, the replacement of a substitution or the translation list
1729*0Sstevel@tonic-gateof a translation),
1730*0Sstevel@tonic-gate
1731*0Sstevel@tonic-gate=item [9]
1732*0Sstevel@tonic-gate
1733*0Sstevel@tonic-gatethe right delimiter of the second block of the operation (if any),
1734*0Sstevel@tonic-gate
1735*0Sstevel@tonic-gate=item [10]
1736*0Sstevel@tonic-gate
1737*0Sstevel@tonic-gatethe trailing modifiers on the operation (if any).
1738*0Sstevel@tonic-gate
1739*0Sstevel@tonic-gate=back
1740*0Sstevel@tonic-gate
1741*0Sstevel@tonic-gateFor each of the fields marked "(if any)" the default value on success is
1742*0Sstevel@tonic-gatean empty string.
1743*0Sstevel@tonic-gateOn failure, all of these values (except the remaining text) are C<undef>.
1744*0Sstevel@tonic-gate
1745*0Sstevel@tonic-gate
1746*0Sstevel@tonic-gateIn a scalar context, C<extract_quotelike> returns just the complete substring
1747*0Sstevel@tonic-gatethat matched a quotelike operation (or C<undef> on failure). In a scalar or
1748*0Sstevel@tonic-gatevoid context, the input text has the same substring (and any specified
1749*0Sstevel@tonic-gateprefix) removed.
1750*0Sstevel@tonic-gate
1751*0Sstevel@tonic-gateExamples:
1752*0Sstevel@tonic-gate
1753*0Sstevel@tonic-gate        # Remove the first quotelike literal that appears in text
1754*0Sstevel@tonic-gate
1755*0Sstevel@tonic-gate                $quotelike = extract_quotelike($text,'.*?');
1756*0Sstevel@tonic-gate
1757*0Sstevel@tonic-gate        # Replace one or more leading whitespace-separated quotelike
1758*0Sstevel@tonic-gate        # literals in $_ with "<QLL>"
1759*0Sstevel@tonic-gate
1760*0Sstevel@tonic-gate                do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@;
1761*0Sstevel@tonic-gate
1762*0Sstevel@tonic-gate
1763*0Sstevel@tonic-gate        # Isolate the search pattern in a quotelike operation from $text
1764*0Sstevel@tonic-gate
1765*0Sstevel@tonic-gate                ($op,$pat) = (extract_quotelike $text)[3,5];
1766*0Sstevel@tonic-gate                if ($op =~ /[ms]/)
1767*0Sstevel@tonic-gate                {
1768*0Sstevel@tonic-gate                        print "search pattern: $pat\n";
1769*0Sstevel@tonic-gate                }
1770*0Sstevel@tonic-gate                else
1771*0Sstevel@tonic-gate                {
1772*0Sstevel@tonic-gate                        print "$op is not a pattern matching operation\n";
1773*0Sstevel@tonic-gate                }
1774*0Sstevel@tonic-gate
1775*0Sstevel@tonic-gate
1776*0Sstevel@tonic-gate=head2 C<extract_quotelike> and "here documents"
1777*0Sstevel@tonic-gate
1778*0Sstevel@tonic-gateC<extract_quotelike> can successfully extract "here documents" from an input
1779*0Sstevel@tonic-gatestring, but with an important caveat in list contexts.
1780*0Sstevel@tonic-gate
1781*0Sstevel@tonic-gateUnlike other types of quote-like literals, a here document is rarely
1782*0Sstevel@tonic-gatea contiguous substring. For example, a typical piece of code using
1783*0Sstevel@tonic-gatehere document might look like this:
1784*0Sstevel@tonic-gate
1785*0Sstevel@tonic-gate        <<'EOMSG' || die;
1786*0Sstevel@tonic-gate        This is the message.
1787*0Sstevel@tonic-gate        EOMSG
1788*0Sstevel@tonic-gate        exit;
1789*0Sstevel@tonic-gate
1790*0Sstevel@tonic-gateGiven this as an input string in a scalar context, C<extract_quotelike>
1791*0Sstevel@tonic-gatewould correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG",
1792*0Sstevel@tonic-gateleaving the string " || die;\nexit;" in the original variable. In other words,
1793*0Sstevel@tonic-gatethe two separate pieces of the here document are successfully extracted and
1794*0Sstevel@tonic-gateconcatenated.
1795*0Sstevel@tonic-gate
1796*0Sstevel@tonic-gateIn a list context, C<extract_quotelike> would return the list
1797*0Sstevel@tonic-gate
1798*0Sstevel@tonic-gate=over 4
1799*0Sstevel@tonic-gate
1800*0Sstevel@tonic-gate=item [0]
1801*0Sstevel@tonic-gate
1802*0Sstevel@tonic-gate"<<'EOMSG'\nThis is the message.\nEOMSG\n" (i.e. the full extracted here document,
1803*0Sstevel@tonic-gateincluding fore and aft delimiters),
1804*0Sstevel@tonic-gate
1805*0Sstevel@tonic-gate=item [1]
1806*0Sstevel@tonic-gate
1807*0Sstevel@tonic-gate" || die;\nexit;" (i.e. the remainder of the input text, concatenated),
1808*0Sstevel@tonic-gate
1809*0Sstevel@tonic-gate=item [2]
1810*0Sstevel@tonic-gate
1811*0Sstevel@tonic-gate"" (i.e. the prefix substring -- trivial in this case),
1812*0Sstevel@tonic-gate
1813*0Sstevel@tonic-gate=item [3]
1814*0Sstevel@tonic-gate
1815*0Sstevel@tonic-gate"<<" (i.e. the "name" of the quotelike operator)
1816*0Sstevel@tonic-gate
1817*0Sstevel@tonic-gate=item [4]
1818*0Sstevel@tonic-gate
1819*0Sstevel@tonic-gate"'EOMSG'" (i.e. the left delimiter of the here document, including any quotes),
1820*0Sstevel@tonic-gate
1821*0Sstevel@tonic-gate=item [5]
1822*0Sstevel@tonic-gate
1823*0Sstevel@tonic-gate"This is the message.\n" (i.e. the text of the here document),
1824*0Sstevel@tonic-gate
1825*0Sstevel@tonic-gate=item [6]
1826*0Sstevel@tonic-gate
1827*0Sstevel@tonic-gate"EOMSG" (i.e. the right delimiter of the here document),
1828*0Sstevel@tonic-gate
1829*0Sstevel@tonic-gate=item [7..10]
1830*0Sstevel@tonic-gate
1831*0Sstevel@tonic-gate"" (a here document has no second left delimiter, second text, second right
1832*0Sstevel@tonic-gatedelimiter, or trailing modifiers).
1833*0Sstevel@tonic-gate
1834*0Sstevel@tonic-gate=back
1835*0Sstevel@tonic-gate
1836*0Sstevel@tonic-gateHowever, the matching position of the input variable would be set to
1837*0Sstevel@tonic-gate"exit;" (i.e. I<after> the closing delimiter of the here document),
1838*0Sstevel@tonic-gatewhich would cause the earlier " || die;\nexit;" to be skipped in any
1839*0Sstevel@tonic-gatesequence of code fragment extractions.
1840*0Sstevel@tonic-gate
1841*0Sstevel@tonic-gateTo avoid this problem, when it encounters a here document whilst
1842*0Sstevel@tonic-gateextracting from a modifiable string, C<extract_quotelike> silently
1843*0Sstevel@tonic-gaterearranges the string to an equivalent piece of Perl:
1844*0Sstevel@tonic-gate
1845*0Sstevel@tonic-gate        <<'EOMSG'
1846*0Sstevel@tonic-gate        This is the message.
1847*0Sstevel@tonic-gate        EOMSG
1848*0Sstevel@tonic-gate        || die;
1849*0Sstevel@tonic-gate        exit;
1850*0Sstevel@tonic-gate
1851*0Sstevel@tonic-gatein which the here document I<is> contiguous. It still leaves the
1852*0Sstevel@tonic-gatematching position after the here document, but now the rest of the line
1853*0Sstevel@tonic-gateon which the here document starts is not skipped.
1854*0Sstevel@tonic-gate
1855*0Sstevel@tonic-gateTo prevent <extract_quotelike> from mucking about with the input in this way
1856*0Sstevel@tonic-gate(this is the only case where a list-context C<extract_quotelike> does so),
1857*0Sstevel@tonic-gateyou can pass the input variable as an interpolated literal:
1858*0Sstevel@tonic-gate
1859*0Sstevel@tonic-gate        $quotelike = extract_quotelike("$var");
1860*0Sstevel@tonic-gate
1861*0Sstevel@tonic-gate
1862*0Sstevel@tonic-gate=head2 C<extract_codeblock>
1863*0Sstevel@tonic-gate
1864*0Sstevel@tonic-gateC<extract_codeblock> attempts to recognize and extract a balanced
1865*0Sstevel@tonic-gatebracket delimited substring that may contain unbalanced brackets
1866*0Sstevel@tonic-gateinside Perl quotes or quotelike operations. That is, C<extract_codeblock>
1867*0Sstevel@tonic-gateis like a combination of C<"extract_bracketed"> and
1868*0Sstevel@tonic-gateC<"extract_quotelike">.
1869*0Sstevel@tonic-gate
1870*0Sstevel@tonic-gateC<extract_codeblock> takes the same initial three parameters as C<extract_bracketed>:
1871*0Sstevel@tonic-gatea text to process, a set of delimiter brackets to look for, and a prefix to
1872*0Sstevel@tonic-gatematch first. It also takes an optional fourth parameter, which allows the
1873*0Sstevel@tonic-gateoutermost delimiter brackets to be specified separately (see below).
1874*0Sstevel@tonic-gate
1875*0Sstevel@tonic-gateOmitting the first argument (input text) means process C<$_> instead.
1876*0Sstevel@tonic-gateOmitting the second argument (delimiter brackets) indicates that only C<'{'> is to be used.
1877*0Sstevel@tonic-gateOmitting the third argument (prefix argument) implies optional whitespace at the start.
1878*0Sstevel@tonic-gateOmitting the fourth argument (outermost delimiter brackets) indicates that the
1879*0Sstevel@tonic-gatevalue of the second argument is to be used for the outermost delimiters.
1880*0Sstevel@tonic-gate
1881*0Sstevel@tonic-gateOnce the prefix an dthe outermost opening delimiter bracket have been
1882*0Sstevel@tonic-gaterecognized, code blocks are extracted by stepping through the input text and
1883*0Sstevel@tonic-gatetrying the following alternatives in sequence:
1884*0Sstevel@tonic-gate
1885*0Sstevel@tonic-gate=over 4
1886*0Sstevel@tonic-gate
1887*0Sstevel@tonic-gate=item 1.
1888*0Sstevel@tonic-gate
1889*0Sstevel@tonic-gateTry and match a closing delimiter bracket. If the bracket was the same
1890*0Sstevel@tonic-gatespecies as the last opening bracket, return the substring to that
1891*0Sstevel@tonic-gatepoint. If the bracket was mismatched, return an error.
1892*0Sstevel@tonic-gate
1893*0Sstevel@tonic-gate=item 2.
1894*0Sstevel@tonic-gate
1895*0Sstevel@tonic-gateTry to match a quote or quotelike operator. If found, call
1896*0Sstevel@tonic-gateC<extract_quotelike> to eat it. If C<extract_quotelike> fails, return
1897*0Sstevel@tonic-gatethe error it returned. Otherwise go back to step 1.
1898*0Sstevel@tonic-gate
1899*0Sstevel@tonic-gate=item 3.
1900*0Sstevel@tonic-gate
1901*0Sstevel@tonic-gateTry to match an opening delimiter bracket. If found, call
1902*0Sstevel@tonic-gateC<extract_codeblock> recursively to eat the embedded block. If the
1903*0Sstevel@tonic-gaterecursive call fails, return an error. Otherwise, go back to step 1.
1904*0Sstevel@tonic-gate
1905*0Sstevel@tonic-gate=item 4.
1906*0Sstevel@tonic-gate
1907*0Sstevel@tonic-gateUnconditionally match a bareword or any other single character, and
1908*0Sstevel@tonic-gatethen go back to step 1.
1909*0Sstevel@tonic-gate
1910*0Sstevel@tonic-gate=back
1911*0Sstevel@tonic-gate
1912*0Sstevel@tonic-gate
1913*0Sstevel@tonic-gateExamples:
1914*0Sstevel@tonic-gate
1915*0Sstevel@tonic-gate        # Find a while loop in the text
1916*0Sstevel@tonic-gate
1917*0Sstevel@tonic-gate                if ($text =~ s/.*?while\s*\{/{/)
1918*0Sstevel@tonic-gate                {
1919*0Sstevel@tonic-gate                        $loop = "while " . extract_codeblock($text);
1920*0Sstevel@tonic-gate                }
1921*0Sstevel@tonic-gate
1922*0Sstevel@tonic-gate        # Remove the first round-bracketed list (which may include
1923*0Sstevel@tonic-gate        # round- or curly-bracketed code blocks or quotelike operators)
1924*0Sstevel@tonic-gate
1925*0Sstevel@tonic-gate                extract_codeblock $text, "(){}", '[^(]*';
1926*0Sstevel@tonic-gate
1927*0Sstevel@tonic-gate
1928*0Sstevel@tonic-gateThe ability to specify a different outermost delimiter bracket is useful
1929*0Sstevel@tonic-gatein some circumstances. For example, in the Parse::RecDescent module,
1930*0Sstevel@tonic-gateparser actions which are to be performed only on a successful parse
1931*0Sstevel@tonic-gateare specified using a C<E<lt>defer:...E<gt>> directive. For example:
1932*0Sstevel@tonic-gate
1933*0Sstevel@tonic-gate        sentence: subject verb object
1934*0Sstevel@tonic-gate                        <defer: {$::theVerb = $item{verb}} >
1935*0Sstevel@tonic-gate
1936*0Sstevel@tonic-gateParse::RecDescent uses C<extract_codeblock($text, '{}E<lt>E<gt>')> to extract the code
1937*0Sstevel@tonic-gatewithin the C<E<lt>defer:...E<gt>> directive, but there's a problem.
1938*0Sstevel@tonic-gate
1939*0Sstevel@tonic-gateA deferred action like this:
1940*0Sstevel@tonic-gate
1941*0Sstevel@tonic-gate                        <defer: {if ($count>10) {$count--}} >
1942*0Sstevel@tonic-gate
1943*0Sstevel@tonic-gatewill be incorrectly parsed as:
1944*0Sstevel@tonic-gate
1945*0Sstevel@tonic-gate                        <defer: {if ($count>
1946*0Sstevel@tonic-gate
1947*0Sstevel@tonic-gatebecause the "less than" operator is interpreted as a closing delimiter.
1948*0Sstevel@tonic-gate
1949*0Sstevel@tonic-gateBut, by extracting the directive using
1950*0Sstevel@tonic-gateS<C<extract_codeblock($text, '{}', undef, 'E<lt>E<gt>')>>
1951*0Sstevel@tonic-gatethe '>' character is only treated as a delimited at the outermost
1952*0Sstevel@tonic-gatelevel of the code block, so the directive is parsed correctly.
1953*0Sstevel@tonic-gate
1954*0Sstevel@tonic-gate=head2 C<extract_multiple>
1955*0Sstevel@tonic-gate
1956*0Sstevel@tonic-gateThe C<extract_multiple> subroutine takes a string to be processed and a
1957*0Sstevel@tonic-gatelist of extractors (subroutines or regular expressions) to apply to that string.
1958*0Sstevel@tonic-gate
1959*0Sstevel@tonic-gateIn an array context C<extract_multiple> returns an array of substrings
1960*0Sstevel@tonic-gateof the original string, as extracted by the specified extractors.
1961*0Sstevel@tonic-gateIn a scalar context, C<extract_multiple> returns the first
1962*0Sstevel@tonic-gatesubstring successfully extracted from the original string. In both
1963*0Sstevel@tonic-gatescalar and void contexts the original string has the first successfully
1964*0Sstevel@tonic-gateextracted substring removed from it. In all contexts
1965*0Sstevel@tonic-gateC<extract_multiple> starts at the current C<pos> of the string, and
1966*0Sstevel@tonic-gatesets that C<pos> appropriately after it matches.
1967*0Sstevel@tonic-gate
1968*0Sstevel@tonic-gateHence, the aim of of a call to C<extract_multiple> in a list context
1969*0Sstevel@tonic-gateis to split the processed string into as many non-overlapping fields as
1970*0Sstevel@tonic-gatepossible, by repeatedly applying each of the specified extractors
1971*0Sstevel@tonic-gateto the remainder of the string. Thus C<extract_multiple> is
1972*0Sstevel@tonic-gatea generalized form of Perl's C<split> subroutine.
1973*0Sstevel@tonic-gate
1974*0Sstevel@tonic-gateThe subroutine takes up to four optional arguments:
1975*0Sstevel@tonic-gate
1976*0Sstevel@tonic-gate=over 4
1977*0Sstevel@tonic-gate
1978*0Sstevel@tonic-gate=item 1.
1979*0Sstevel@tonic-gate
1980*0Sstevel@tonic-gateA string to be processed (C<$_> if the string is omitted or C<undef>)
1981*0Sstevel@tonic-gate
1982*0Sstevel@tonic-gate=item 2.
1983*0Sstevel@tonic-gate
1984*0Sstevel@tonic-gateA reference to a list of subroutine references and/or qr// objects and/or
1985*0Sstevel@tonic-gateliteral strings and/or hash references, specifying the extractors
1986*0Sstevel@tonic-gateto be used to split the string. If this argument is omitted (or
1987*0Sstevel@tonic-gateC<undef>) the list:
1988*0Sstevel@tonic-gate
1989*0Sstevel@tonic-gate        [
1990*0Sstevel@tonic-gate                sub { extract_variable($_[0], '') },
1991*0Sstevel@tonic-gate                sub { extract_quotelike($_[0],'') },
1992*0Sstevel@tonic-gate                sub { extract_codeblock($_[0],'{}','') },
1993*0Sstevel@tonic-gate        ]
1994*0Sstevel@tonic-gate
1995*0Sstevel@tonic-gateis used.
1996*0Sstevel@tonic-gate
1997*0Sstevel@tonic-gate
1998*0Sstevel@tonic-gate=item 3.
1999*0Sstevel@tonic-gate
2000*0Sstevel@tonic-gateAn number specifying the maximum number of fields to return. If this
2001*0Sstevel@tonic-gateargument is omitted (or C<undef>), split continues as long as possible.
2002*0Sstevel@tonic-gate
2003*0Sstevel@tonic-gateIf the third argument is I<N>, then extraction continues until I<N> fields
2004*0Sstevel@tonic-gatehave been successfully extracted, or until the string has been completely
2005*0Sstevel@tonic-gateprocessed.
2006*0Sstevel@tonic-gate
2007*0Sstevel@tonic-gateNote that in scalar and void contexts the value of this argument is
2008*0Sstevel@tonic-gateautomatically reset to 1 (under C<-w>, a warning is issued if the argument
2009*0Sstevel@tonic-gatehas to be reset).
2010*0Sstevel@tonic-gate
2011*0Sstevel@tonic-gate=item 4.
2012*0Sstevel@tonic-gate
2013*0Sstevel@tonic-gateA value indicating whether unmatched substrings (see below) within the
2014*0Sstevel@tonic-gatetext should be skipped or returned as fields. If the value is true,
2015*0Sstevel@tonic-gatesuch substrings are skipped. Otherwise, they are returned.
2016*0Sstevel@tonic-gate
2017*0Sstevel@tonic-gate=back
2018*0Sstevel@tonic-gate
2019*0Sstevel@tonic-gateThe extraction process works by applying each extractor in
2020*0Sstevel@tonic-gatesequence to the text string.
2021*0Sstevel@tonic-gate
2022*0Sstevel@tonic-gateIf the extractor is a subroutine it is called in a list context and is
2023*0Sstevel@tonic-gateexpected to return a list of a single element, namely the extracted
2024*0Sstevel@tonic-gatetext. It may optionally also return two further arguments: a string
2025*0Sstevel@tonic-gaterepresenting the text left after extraction (like $' for a pattern
2026*0Sstevel@tonic-gatematch), and a string representing any prefix skipped before the
2027*0Sstevel@tonic-gateextraction (like $` in a pattern match). Note that this is designed
2028*0Sstevel@tonic-gateto facilitate the use of other Text::Balanced subroutines with
2029*0Sstevel@tonic-gateC<extract_multiple>. Note too that the value returned by an extractor
2030*0Sstevel@tonic-gatesubroutine need not bear any relationship to the corresponding substring
2031*0Sstevel@tonic-gateof the original text (see examples below).
2032*0Sstevel@tonic-gate
2033*0Sstevel@tonic-gateIf the extractor is a precompiled regular expression or a string,
2034*0Sstevel@tonic-gateit is matched against the text in a scalar context with a leading
2035*0Sstevel@tonic-gate'\G' and the gc modifiers enabled. The extracted value is either
2036*0Sstevel@tonic-gate$1 if that variable is defined after the match, or else the
2037*0Sstevel@tonic-gatecomplete match (i.e. $&).
2038*0Sstevel@tonic-gate
2039*0Sstevel@tonic-gateIf the extractor is a hash reference, it must contain exactly one element.
2040*0Sstevel@tonic-gateThe value of that element is one of the
2041*0Sstevel@tonic-gateabove extractor types (subroutine reference, regular expression, or string).
2042*0Sstevel@tonic-gateThe key of that element is the name of a class into which the successful
2043*0Sstevel@tonic-gatereturn value of the extractor will be blessed.
2044*0Sstevel@tonic-gate
2045*0Sstevel@tonic-gateIf an extractor returns a defined value, that value is immediately
2046*0Sstevel@tonic-gatetreated as the next extracted field and pushed onto the list of fields.
2047*0Sstevel@tonic-gateIf the extractor was specified in a hash reference, the field is also
2048*0Sstevel@tonic-gateblessed into the appropriate class,
2049*0Sstevel@tonic-gate
2050*0Sstevel@tonic-gateIf the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is
2051*0Sstevel@tonic-gateassumed to have failed to extract.
2052*0Sstevel@tonic-gateIf none of the extractor subroutines succeeds, then one
2053*0Sstevel@tonic-gatecharacter is extracted from the start of the text and the extraction
2054*0Sstevel@tonic-gatesubroutines reapplied. Characters which are thus removed are accumulated and
2055*0Sstevel@tonic-gateeventually become the next field (unless the fourth argument is true, in which
2056*0Sstevel@tonic-gatecase they are disgarded).
2057*0Sstevel@tonic-gate
2058*0Sstevel@tonic-gateFor example, the following extracts substrings that are valid Perl variables:
2059*0Sstevel@tonic-gate
2060*0Sstevel@tonic-gate        @fields = extract_multiple($text,
2061*0Sstevel@tonic-gate                                   [ sub { extract_variable($_[0]) } ],
2062*0Sstevel@tonic-gate                                   undef, 1);
2063*0Sstevel@tonic-gate
2064*0Sstevel@tonic-gateThis example separates a text into fields which are quote delimited,
2065*0Sstevel@tonic-gatecurly bracketed, and anything else. The delimited and bracketed
2066*0Sstevel@tonic-gateparts are also blessed to identify them (the "anything else" is unblessed):
2067*0Sstevel@tonic-gate
2068*0Sstevel@tonic-gate        @fields = extract_multiple($text,
2069*0Sstevel@tonic-gate                   [
2070*0Sstevel@tonic-gate                        { Delim => sub { extract_delimited($_[0],q{'"}) } },
2071*0Sstevel@tonic-gate                        { Brack => sub { extract_bracketed($_[0],'{}') } },
2072*0Sstevel@tonic-gate                   ]);
2073*0Sstevel@tonic-gate
2074*0Sstevel@tonic-gateThis call extracts the next single substring that is a valid Perl quotelike
2075*0Sstevel@tonic-gateoperator (and removes it from $text):
2076*0Sstevel@tonic-gate
2077*0Sstevel@tonic-gate        $quotelike = extract_multiple($text,
2078*0Sstevel@tonic-gate                                      [
2079*0Sstevel@tonic-gate                                        sub { extract_quotelike($_[0]) },
2080*0Sstevel@tonic-gate                                      ], undef, 1);
2081*0Sstevel@tonic-gate
2082*0Sstevel@tonic-gateFinally, here is yet another way to do comma-separated value parsing:
2083*0Sstevel@tonic-gate
2084*0Sstevel@tonic-gate        @fields = extract_multiple($csv_text,
2085*0Sstevel@tonic-gate                                  [
2086*0Sstevel@tonic-gate                                        sub { extract_delimited($_[0],q{'"}) },
2087*0Sstevel@tonic-gate                                        qr/([^,]+)(.*)/,
2088*0Sstevel@tonic-gate                                  ],
2089*0Sstevel@tonic-gate                                  undef,1);
2090*0Sstevel@tonic-gate
2091*0Sstevel@tonic-gateThe list in the second argument means:
2092*0Sstevel@tonic-gateI<"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...">.
2093*0Sstevel@tonic-gateThe undef third argument means:
2094*0Sstevel@tonic-gateI<"...as many times as possible...">,
2095*0Sstevel@tonic-gateand the true value in the fourth argument means
2096*0Sstevel@tonic-gateI<"...discarding anything else that appears (i.e. the commas)">.
2097*0Sstevel@tonic-gate
2098*0Sstevel@tonic-gateIf you wanted the commas preserved as separate fields (i.e. like split
2099*0Sstevel@tonic-gatedoes if your split pattern has capturing parentheses), you would
2100*0Sstevel@tonic-gatejust make the last parameter undefined (or remove it).
2101*0Sstevel@tonic-gate
2102*0Sstevel@tonic-gate
2103*0Sstevel@tonic-gate=head2 C<gen_delimited_pat>
2104*0Sstevel@tonic-gate
2105*0Sstevel@tonic-gateThe C<gen_delimited_pat> subroutine takes a single (string) argument and
2106*0Sstevel@tonic-gate   > builds a Friedl-style optimized regex that matches a string delimited
2107*0Sstevel@tonic-gateby any one of the characters in the single argument. For example:
2108*0Sstevel@tonic-gate
2109*0Sstevel@tonic-gate        gen_delimited_pat(q{'"})
2110*0Sstevel@tonic-gate
2111*0Sstevel@tonic-gatereturns the regex:
2112*0Sstevel@tonic-gate
2113*0Sstevel@tonic-gate        (?:\"(?:\\\"|(?!\").)*\"|\'(?:\\\'|(?!\').)*\')
2114*0Sstevel@tonic-gate
2115*0Sstevel@tonic-gateNote that the specified delimiters are automatically quotemeta'd.
2116*0Sstevel@tonic-gate
2117*0Sstevel@tonic-gateA typical use of C<gen_delimited_pat> would be to build special purpose tags
2118*0Sstevel@tonic-gatefor C<extract_tagged>. For example, to properly ignore "empty" XML elements
2119*0Sstevel@tonic-gate(which might contain quoted strings):
2120*0Sstevel@tonic-gate
2121*0Sstevel@tonic-gate        my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>';
2122*0Sstevel@tonic-gate
2123*0Sstevel@tonic-gate        extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} );
2124*0Sstevel@tonic-gate
2125*0Sstevel@tonic-gate
2126*0Sstevel@tonic-gateC<gen_delimited_pat> may also be called with an optional second argument,
2127*0Sstevel@tonic-gatewhich specifies the "escape" character(s) to be used for each delimiter.
2128*0Sstevel@tonic-gateFor example to match a Pascal-style string (where ' is the delimiter
2129*0Sstevel@tonic-gateand '' is a literal ' within the string):
2130*0Sstevel@tonic-gate
2131*0Sstevel@tonic-gate        gen_delimited_pat(q{'},q{'});
2132*0Sstevel@tonic-gate
2133*0Sstevel@tonic-gateDifferent escape characters can be specified for different delimiters.
2134*0Sstevel@tonic-gateFor example, to specify that '/' is the escape for single quotes
2135*0Sstevel@tonic-gateand '%' is the escape for double quotes:
2136*0Sstevel@tonic-gate
2137*0Sstevel@tonic-gate        gen_delimited_pat(q{'"},q{/%});
2138*0Sstevel@tonic-gate
2139*0Sstevel@tonic-gateIf more delimiters than escape chars are specified, the last escape char
2140*0Sstevel@tonic-gateis used for the remaining delimiters.
2141*0Sstevel@tonic-gateIf no escape char is specified for a given specified delimiter, '\' is used.
2142*0Sstevel@tonic-gate
2143*0Sstevel@tonic-gateNote that
2144*0Sstevel@tonic-gateC<gen_delimited_pat> was previously called
2145*0Sstevel@tonic-gateC<delimited_pat>. That name may still be used, but is now deprecated.
2146*0Sstevel@tonic-gate
2147*0Sstevel@tonic-gate
2148*0Sstevel@tonic-gate=head1 DIAGNOSTICS
2149*0Sstevel@tonic-gate
2150*0Sstevel@tonic-gateIn a list context, all the functions return C<(undef,$original_text)>
2151*0Sstevel@tonic-gateon failure. In a scalar context, failure is indicated by returning C<undef>
2152*0Sstevel@tonic-gate(in this case the input text is not modified in any way).
2153*0Sstevel@tonic-gate
2154*0Sstevel@tonic-gateIn addition, on failure in I<any> context, the C<$@> variable is set.
2155*0Sstevel@tonic-gateAccessing C<$@-E<gt>{error}> returns one of the error diagnostics listed
2156*0Sstevel@tonic-gatebelow.
2157*0Sstevel@tonic-gateAccessing C<$@-E<gt>{pos}> returns the offset into the original string at
2158*0Sstevel@tonic-gatewhich the error was detected (although not necessarily where it occurred!)
2159*0Sstevel@tonic-gatePrinting C<$@> directly produces the error message, with the offset appended.
2160*0Sstevel@tonic-gateOn success, the C<$@> variable is guaranteed to be C<undef>.
2161*0Sstevel@tonic-gate
2162*0Sstevel@tonic-gateThe available diagnostics are:
2163*0Sstevel@tonic-gate
2164*0Sstevel@tonic-gate=over 4
2165*0Sstevel@tonic-gate
2166*0Sstevel@tonic-gate=item  C<Did not find a suitable bracket: "%s">
2167*0Sstevel@tonic-gate
2168*0Sstevel@tonic-gateThe delimiter provided to C<extract_bracketed> was not one of
2169*0Sstevel@tonic-gateC<'()[]E<lt>E<gt>{}'>.
2170*0Sstevel@tonic-gate
2171*0Sstevel@tonic-gate=item  C<Did not find prefix: /%s/>
2172*0Sstevel@tonic-gate
2173*0Sstevel@tonic-gateA non-optional prefix was specified but wasn't found at the start of the text.
2174*0Sstevel@tonic-gate
2175*0Sstevel@tonic-gate=item  C<Did not find opening bracket after prefix: "%s">
2176*0Sstevel@tonic-gate
2177*0Sstevel@tonic-gateC<extract_bracketed> or C<extract_codeblock> was expecting a
2178*0Sstevel@tonic-gateparticular kind of bracket at the start of the text, and didn't find it.
2179*0Sstevel@tonic-gate
2180*0Sstevel@tonic-gate=item  C<No quotelike operator found after prefix: "%s">
2181*0Sstevel@tonic-gate
2182*0Sstevel@tonic-gateC<extract_quotelike> didn't find one of the quotelike operators C<q>,
2183*0Sstevel@tonic-gateC<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> at the start of the substring
2184*0Sstevel@tonic-gateit was extracting.
2185*0Sstevel@tonic-gate
2186*0Sstevel@tonic-gate=item  C<Unmatched closing bracket: "%c">
2187*0Sstevel@tonic-gate
2188*0Sstevel@tonic-gateC<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> encountered
2189*0Sstevel@tonic-gatea closing bracket where none was expected.
2190*0Sstevel@tonic-gate
2191*0Sstevel@tonic-gate=item  C<Unmatched opening bracket(s): "%s">
2192*0Sstevel@tonic-gate
2193*0Sstevel@tonic-gateC<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> ran
2194*0Sstevel@tonic-gateout of characters in the text before closing one or more levels of nested
2195*0Sstevel@tonic-gatebrackets.
2196*0Sstevel@tonic-gate
2197*0Sstevel@tonic-gate=item C<Unmatched embedded quote (%s)>
2198*0Sstevel@tonic-gate
2199*0Sstevel@tonic-gateC<extract_bracketed> attempted to match an embedded quoted substring, but
2200*0Sstevel@tonic-gatefailed to find a closing quote to match it.
2201*0Sstevel@tonic-gate
2202*0Sstevel@tonic-gate=item C<Did not find closing delimiter to match '%s'>
2203*0Sstevel@tonic-gate
2204*0Sstevel@tonic-gateC<extract_quotelike> was unable to find a closing delimiter to match the
2205*0Sstevel@tonic-gateone that opened the quote-like operation.
2206*0Sstevel@tonic-gate
2207*0Sstevel@tonic-gate=item  C<Mismatched closing bracket: expected "%c" but found "%s">
2208*0Sstevel@tonic-gate
2209*0Sstevel@tonic-gateC<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> found
2210*0Sstevel@tonic-gatea valid bracket delimiter, but it was the wrong species. This usually
2211*0Sstevel@tonic-gateindicates a nesting error, but may indicate incorrect quoting or escaping.
2212*0Sstevel@tonic-gate
2213*0Sstevel@tonic-gate=item  C<No block delimiter found after quotelike "%s">
2214*0Sstevel@tonic-gate
2215*0Sstevel@tonic-gateC<extract_quotelike> or C<extract_codeblock> found one of the
2216*0Sstevel@tonic-gatequotelike operators C<q>, C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y>
2217*0Sstevel@tonic-gatewithout a suitable block after it.
2218*0Sstevel@tonic-gate
2219*0Sstevel@tonic-gate=item C<Did not find leading dereferencer>
2220*0Sstevel@tonic-gate
2221*0Sstevel@tonic-gateC<extract_variable> was expecting one of '$', '@', or '%' at the start of
2222*0Sstevel@tonic-gatea variable, but didn't find any of them.
2223*0Sstevel@tonic-gate
2224*0Sstevel@tonic-gate=item C<Bad identifier after dereferencer>
2225*0Sstevel@tonic-gate
2226*0Sstevel@tonic-gateC<extract_variable> found a '$', '@', or '%' indicating a variable, but that
2227*0Sstevel@tonic-gatecharacter was not followed by a legal Perl identifier.
2228*0Sstevel@tonic-gate
2229*0Sstevel@tonic-gate=item C<Did not find expected opening bracket at %s>
2230*0Sstevel@tonic-gate
2231*0Sstevel@tonic-gateC<extract_codeblock> failed to find any of the outermost opening brackets
2232*0Sstevel@tonic-gatethat were specified.
2233*0Sstevel@tonic-gate
2234*0Sstevel@tonic-gate=item C<Improperly nested codeblock at %s>
2235*0Sstevel@tonic-gate
2236*0Sstevel@tonic-gateA nested code block was found that started with a delimiter that was specified
2237*0Sstevel@tonic-gateas being only to be used as an outermost bracket.
2238*0Sstevel@tonic-gate
2239*0Sstevel@tonic-gate=item  C<Missing second block for quotelike "%s">
2240*0Sstevel@tonic-gate
2241*0Sstevel@tonic-gateC<extract_codeblock> or C<extract_quotelike> found one of the
2242*0Sstevel@tonic-gatequotelike operators C<s>, C<tr> or C<y> followed by only one block.
2243*0Sstevel@tonic-gate
2244*0Sstevel@tonic-gate=item C<No match found for opening bracket>
2245*0Sstevel@tonic-gate
2246*0Sstevel@tonic-gateC<extract_codeblock> failed to find a closing bracket to match the outermost
2247*0Sstevel@tonic-gateopening bracket.
2248*0Sstevel@tonic-gate
2249*0Sstevel@tonic-gate=item C<Did not find opening tag: /%s/>
2250*0Sstevel@tonic-gate
2251*0Sstevel@tonic-gateC<extract_tagged> did not find a suitable opening tag (after any specified
2252*0Sstevel@tonic-gateprefix was removed).
2253*0Sstevel@tonic-gate
2254*0Sstevel@tonic-gate=item C<Unable to construct closing tag to match: /%s/>
2255*0Sstevel@tonic-gate
2256*0Sstevel@tonic-gateC<extract_tagged> matched the specified opening tag and tried to
2257*0Sstevel@tonic-gatemodify the matched text to produce a matching closing tag (because
2258*0Sstevel@tonic-gatenone was specified). It failed to generate the closing tag, almost
2259*0Sstevel@tonic-gatecertainly because the opening tag did not start with a
2260*0Sstevel@tonic-gatebracket of some kind.
2261*0Sstevel@tonic-gate
2262*0Sstevel@tonic-gate=item C<Found invalid nested tag: %s>
2263*0Sstevel@tonic-gate
2264*0Sstevel@tonic-gateC<extract_tagged> found a nested tag that appeared in the "reject" list
2265*0Sstevel@tonic-gate(and the failure mode was not "MAX" or "PARA").
2266*0Sstevel@tonic-gate
2267*0Sstevel@tonic-gate=item C<Found unbalanced nested tag: %s>
2268*0Sstevel@tonic-gate
2269*0Sstevel@tonic-gateC<extract_tagged> found a nested opening tag that was not matched by a
2270*0Sstevel@tonic-gatecorresponding nested closing tag (and the failure mode was not "MAX" or "PARA").
2271*0Sstevel@tonic-gate
2272*0Sstevel@tonic-gate=item C<Did not find closing tag>
2273*0Sstevel@tonic-gate
2274*0Sstevel@tonic-gateC<extract_tagged> reached the end of the text without finding a closing tag
2275*0Sstevel@tonic-gateto match the original opening tag (and the failure mode was not
2276*0Sstevel@tonic-gate"MAX" or "PARA").
2277*0Sstevel@tonic-gate
2278*0Sstevel@tonic-gate
2279*0Sstevel@tonic-gate
2280*0Sstevel@tonic-gate
2281*0Sstevel@tonic-gate=back
2282*0Sstevel@tonic-gate
2283*0Sstevel@tonic-gate
2284*0Sstevel@tonic-gate=head1 AUTHOR
2285*0Sstevel@tonic-gate
2286*0Sstevel@tonic-gateDamian Conway (damian@conway.org)
2287*0Sstevel@tonic-gate
2288*0Sstevel@tonic-gate
2289*0Sstevel@tonic-gate=head1 BUGS AND IRRITATIONS
2290*0Sstevel@tonic-gate
2291*0Sstevel@tonic-gateThere are undoubtedly serious bugs lurking somewhere in this code, if
2292*0Sstevel@tonic-gateonly because parts of it give the impression of understanding a great deal
2293*0Sstevel@tonic-gatemore about Perl than they really do.
2294*0Sstevel@tonic-gate
2295*0Sstevel@tonic-gateBug reports and other feedback are most welcome.
2296*0Sstevel@tonic-gate
2297*0Sstevel@tonic-gate
2298*0Sstevel@tonic-gate=head1 COPYRIGHT
2299*0Sstevel@tonic-gate
2300*0Sstevel@tonic-gate Copyright (c) 1997-2001, Damian Conway. All Rights Reserved.
2301*0Sstevel@tonic-gate This module is free software. It may be used, redistributed
2302*0Sstevel@tonic-gate     and/or modified under the same terms as Perl itself.
2303