1*0Sstevel@tonic-gate# EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS. 2*0Sstevel@tonic-gate# FOR FULL DOCUMENTATION SEE Balanced.pod 3*0Sstevel@tonic-gate 4*0Sstevel@tonic-gateuse 5.005; 5*0Sstevel@tonic-gateuse strict; 6*0Sstevel@tonic-gate 7*0Sstevel@tonic-gatepackage Text::Balanced; 8*0Sstevel@tonic-gate 9*0Sstevel@tonic-gateuse Exporter; 10*0Sstevel@tonic-gateuse SelfLoader; 11*0Sstevel@tonic-gateuse vars qw { $VERSION @ISA %EXPORT_TAGS }; 12*0Sstevel@tonic-gate 13*0Sstevel@tonic-gate$VERSION = '1.95'; 14*0Sstevel@tonic-gate@ISA = qw ( Exporter ); 15*0Sstevel@tonic-gate 16*0Sstevel@tonic-gate%EXPORT_TAGS = ( ALL => [ qw( 17*0Sstevel@tonic-gate &extract_delimited 18*0Sstevel@tonic-gate &extract_bracketed 19*0Sstevel@tonic-gate &extract_quotelike 20*0Sstevel@tonic-gate &extract_codeblock 21*0Sstevel@tonic-gate &extract_variable 22*0Sstevel@tonic-gate &extract_tagged 23*0Sstevel@tonic-gate &extract_multiple 24*0Sstevel@tonic-gate 25*0Sstevel@tonic-gate &gen_delimited_pat 26*0Sstevel@tonic-gate &gen_extract_tagged 27*0Sstevel@tonic-gate 28*0Sstevel@tonic-gate &delimited_pat 29*0Sstevel@tonic-gate ) ] ); 30*0Sstevel@tonic-gate 31*0Sstevel@tonic-gateExporter::export_ok_tags('ALL'); 32*0Sstevel@tonic-gate 33*0Sstevel@tonic-gate# PROTOTYPES 34*0Sstevel@tonic-gate 35*0Sstevel@tonic-gatesub _match_bracketed($$$$$$); 36*0Sstevel@tonic-gatesub _match_variable($$); 37*0Sstevel@tonic-gatesub _match_codeblock($$$$$$$); 38*0Sstevel@tonic-gatesub _match_quotelike($$$$); 39*0Sstevel@tonic-gate 40*0Sstevel@tonic-gate# HANDLE RETURN VALUES IN VARIOUS CONTEXTS 41*0Sstevel@tonic-gate 42*0Sstevel@tonic-gatesub _failmsg { 43*0Sstevel@tonic-gate my ($message, $pos) = @_; 44*0Sstevel@tonic-gate $@ = bless { error=>$message, pos=>$pos }, "Text::Balanced::ErrorMsg"; 45*0Sstevel@tonic-gate} 46*0Sstevel@tonic-gate 47*0Sstevel@tonic-gatesub _fail 48*0Sstevel@tonic-gate{ 49*0Sstevel@tonic-gate my ($wantarray, $textref, $message, $pos) = @_; 50*0Sstevel@tonic-gate _failmsg $message, $pos if $message; 51*0Sstevel@tonic-gate return ("",$$textref,"") if $wantarray; 52*0Sstevel@tonic-gate return undef; 53*0Sstevel@tonic-gate} 54*0Sstevel@tonic-gate 55*0Sstevel@tonic-gatesub _succeed 56*0Sstevel@tonic-gate{ 57*0Sstevel@tonic-gate $@ = undef; 58*0Sstevel@tonic-gate my ($wantarray,$textref) = splice @_, 0, 2; 59*0Sstevel@tonic-gate my ($extrapos, $extralen) = @_>18 ? splice(@_, -2, 2) : (0,0); 60*0Sstevel@tonic-gate my ($startlen) = $_[5]; 61*0Sstevel@tonic-gate my $remainderpos = $_[2]; 62*0Sstevel@tonic-gate if ($wantarray) 63*0Sstevel@tonic-gate { 64*0Sstevel@tonic-gate my @res; 65*0Sstevel@tonic-gate while (my ($from, $len) = splice @_, 0, 2) 66*0Sstevel@tonic-gate { 67*0Sstevel@tonic-gate push @res, substr($$textref,$from,$len); 68*0Sstevel@tonic-gate } 69*0Sstevel@tonic-gate if ($extralen) { # CORRECT FILLET 70*0Sstevel@tonic-gate my $extra = substr($res[0], $extrapos-$startlen, $extralen, "\n"); 71*0Sstevel@tonic-gate $res[1] = "$extra$res[1]"; 72*0Sstevel@tonic-gate eval { substr($$textref,$remainderpos,0) = $extra; 73*0Sstevel@tonic-gate substr($$textref,$extrapos,$extralen,"\n")} ; 74*0Sstevel@tonic-gate #REARRANGE HERE DOC AND FILLET IF POSSIBLE 75*0Sstevel@tonic-gate pos($$textref) = $remainderpos-$extralen+1; # RESET \G 76*0Sstevel@tonic-gate } 77*0Sstevel@tonic-gate else { 78*0Sstevel@tonic-gate pos($$textref) = $remainderpos; # RESET \G 79*0Sstevel@tonic-gate } 80*0Sstevel@tonic-gate return @res; 81*0Sstevel@tonic-gate } 82*0Sstevel@tonic-gate else 83*0Sstevel@tonic-gate { 84*0Sstevel@tonic-gate my $match = substr($$textref,$_[0],$_[1]); 85*0Sstevel@tonic-gate substr($match,$extrapos-$_[0]-$startlen,$extralen,"") if $extralen; 86*0Sstevel@tonic-gate my $extra = $extralen 87*0Sstevel@tonic-gate ? substr($$textref, $extrapos, $extralen)."\n" : ""; 88*0Sstevel@tonic-gate eval {substr($$textref,$_[4],$_[1]+$_[5])=$extra} ; #CHOP OUT PREFIX & MATCH, IF POSSIBLE 89*0Sstevel@tonic-gate pos($$textref) = $_[4]; # RESET \G 90*0Sstevel@tonic-gate return $match; 91*0Sstevel@tonic-gate } 92*0Sstevel@tonic-gate} 93*0Sstevel@tonic-gate 94*0Sstevel@tonic-gate# BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING 95*0Sstevel@tonic-gate 96*0Sstevel@tonic-gatesub gen_delimited_pat($;$) # ($delimiters;$escapes) 97*0Sstevel@tonic-gate{ 98*0Sstevel@tonic-gate my ($dels, $escs) = @_; 99*0Sstevel@tonic-gate return "" unless $dels =~ /\S/; 100*0Sstevel@tonic-gate $escs = '\\' unless $escs; 101*0Sstevel@tonic-gate $escs .= substr($escs,-1) x (length($dels)-length($escs)); 102*0Sstevel@tonic-gate my @pat = (); 103*0Sstevel@tonic-gate my $i; 104*0Sstevel@tonic-gate for ($i=0; $i<length $dels; $i++) 105*0Sstevel@tonic-gate { 106*0Sstevel@tonic-gate my $del = quotemeta substr($dels,$i,1); 107*0Sstevel@tonic-gate my $esc = quotemeta substr($escs,$i,1); 108*0Sstevel@tonic-gate if ($del eq $esc) 109*0Sstevel@tonic-gate { 110*0Sstevel@tonic-gate push @pat, "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del"; 111*0Sstevel@tonic-gate } 112*0Sstevel@tonic-gate else 113*0Sstevel@tonic-gate { 114*0Sstevel@tonic-gate push @pat, "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del"; 115*0Sstevel@tonic-gate } 116*0Sstevel@tonic-gate } 117*0Sstevel@tonic-gate my $pat = join '|', @pat; 118*0Sstevel@tonic-gate return "(?:$pat)"; 119*0Sstevel@tonic-gate} 120*0Sstevel@tonic-gate 121*0Sstevel@tonic-gate*delimited_pat = \&gen_delimited_pat; 122*0Sstevel@tonic-gate 123*0Sstevel@tonic-gate 124*0Sstevel@tonic-gate# THE EXTRACTION FUNCTIONS 125*0Sstevel@tonic-gate 126*0Sstevel@tonic-gatesub extract_delimited (;$$$$) 127*0Sstevel@tonic-gate{ 128*0Sstevel@tonic-gate my $textref = defined $_[0] ? \$_[0] : \$_; 129*0Sstevel@tonic-gate my $wantarray = wantarray; 130*0Sstevel@tonic-gate my $del = defined $_[1] ? $_[1] : qq{\'\"\`}; 131*0Sstevel@tonic-gate my $pre = defined $_[2] ? $_[2] : '\s*'; 132*0Sstevel@tonic-gate my $esc = defined $_[3] ? $_[3] : qq{\\}; 133*0Sstevel@tonic-gate my $pat = gen_delimited_pat($del, $esc); 134*0Sstevel@tonic-gate my $startpos = pos $$textref || 0; 135*0Sstevel@tonic-gate return _fail($wantarray, $textref, "Not a delimited pattern", 0) 136*0Sstevel@tonic-gate unless $$textref =~ m/\G($pre)($pat)/gc; 137*0Sstevel@tonic-gate my $prelen = length($1); 138*0Sstevel@tonic-gate my $matchpos = $startpos+$prelen; 139*0Sstevel@tonic-gate my $endpos = pos $$textref; 140*0Sstevel@tonic-gate return _succeed $wantarray, $textref, 141*0Sstevel@tonic-gate $matchpos, $endpos-$matchpos, # MATCH 142*0Sstevel@tonic-gate $endpos, length($$textref)-$endpos, # REMAINDER 143*0Sstevel@tonic-gate $startpos, $prelen; # PREFIX 144*0Sstevel@tonic-gate} 145*0Sstevel@tonic-gate 146*0Sstevel@tonic-gatesub extract_bracketed (;$$$) 147*0Sstevel@tonic-gate{ 148*0Sstevel@tonic-gate my $textref = defined $_[0] ? \$_[0] : \$_; 149*0Sstevel@tonic-gate my $ldel = defined $_[1] ? $_[1] : '{([<'; 150*0Sstevel@tonic-gate my $pre = defined $_[2] ? $_[2] : '\s*'; 151*0Sstevel@tonic-gate my $wantarray = wantarray; 152*0Sstevel@tonic-gate my $qdel = ""; 153*0Sstevel@tonic-gate my $quotelike; 154*0Sstevel@tonic-gate $ldel =~ s/'//g and $qdel .= q{'}; 155*0Sstevel@tonic-gate $ldel =~ s/"//g and $qdel .= q{"}; 156*0Sstevel@tonic-gate $ldel =~ s/`//g and $qdel .= q{`}; 157*0Sstevel@tonic-gate $ldel =~ s/q//g and $quotelike = 1; 158*0Sstevel@tonic-gate $ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds; 159*0Sstevel@tonic-gate my $rdel = $ldel; 160*0Sstevel@tonic-gate unless ($rdel =~ tr/[({</])}>/) 161*0Sstevel@tonic-gate { 162*0Sstevel@tonic-gate return _fail $wantarray, $textref, 163*0Sstevel@tonic-gate "Did not find a suitable bracket in delimiter: \"$_[1]\"", 164*0Sstevel@tonic-gate 0; 165*0Sstevel@tonic-gate } 166*0Sstevel@tonic-gate my $posbug = pos; 167*0Sstevel@tonic-gate $ldel = join('|', map { quotemeta $_ } split('', $ldel)); 168*0Sstevel@tonic-gate $rdel = join('|', map { quotemeta $_ } split('', $rdel)); 169*0Sstevel@tonic-gate pos = $posbug; 170*0Sstevel@tonic-gate 171*0Sstevel@tonic-gate my $startpos = pos $$textref || 0; 172*0Sstevel@tonic-gate my @match = _match_bracketed($textref,$pre, $ldel, $qdel, $quotelike, $rdel); 173*0Sstevel@tonic-gate 174*0Sstevel@tonic-gate return _fail ($wantarray, $textref) unless @match; 175*0Sstevel@tonic-gate 176*0Sstevel@tonic-gate return _succeed ( $wantarray, $textref, 177*0Sstevel@tonic-gate $match[2], $match[5]+2, # MATCH 178*0Sstevel@tonic-gate @match[8,9], # REMAINDER 179*0Sstevel@tonic-gate @match[0,1], # PREFIX 180*0Sstevel@tonic-gate ); 181*0Sstevel@tonic-gate} 182*0Sstevel@tonic-gate 183*0Sstevel@tonic-gatesub _match_bracketed($$$$$$) # $textref, $pre, $ldel, $qdel, $quotelike, $rdel 184*0Sstevel@tonic-gate{ 185*0Sstevel@tonic-gate my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_; 186*0Sstevel@tonic-gate my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0); 187*0Sstevel@tonic-gate unless ($$textref =~ m/\G$pre/gc) 188*0Sstevel@tonic-gate { 189*0Sstevel@tonic-gate _failmsg "Did not find prefix: /$pre/", $startpos; 190*0Sstevel@tonic-gate return; 191*0Sstevel@tonic-gate } 192*0Sstevel@tonic-gate 193*0Sstevel@tonic-gate $ldelpos = pos $$textref; 194*0Sstevel@tonic-gate 195*0Sstevel@tonic-gate unless ($$textref =~ m/\G($ldel)/gc) 196*0Sstevel@tonic-gate { 197*0Sstevel@tonic-gate _failmsg "Did not find opening bracket after prefix: \"$pre\"", 198*0Sstevel@tonic-gate pos $$textref; 199*0Sstevel@tonic-gate pos $$textref = $startpos; 200*0Sstevel@tonic-gate return; 201*0Sstevel@tonic-gate } 202*0Sstevel@tonic-gate 203*0Sstevel@tonic-gate my @nesting = ( $1 ); 204*0Sstevel@tonic-gate my $textlen = length $$textref; 205*0Sstevel@tonic-gate while (pos $$textref < $textlen) 206*0Sstevel@tonic-gate { 207*0Sstevel@tonic-gate next if $$textref =~ m/\G\\./gcs; 208*0Sstevel@tonic-gate 209*0Sstevel@tonic-gate if ($$textref =~ m/\G($ldel)/gc) 210*0Sstevel@tonic-gate { 211*0Sstevel@tonic-gate push @nesting, $1; 212*0Sstevel@tonic-gate } 213*0Sstevel@tonic-gate elsif ($$textref =~ m/\G($rdel)/gc) 214*0Sstevel@tonic-gate { 215*0Sstevel@tonic-gate my ($found, $brackettype) = ($1, $1); 216*0Sstevel@tonic-gate if ($#nesting < 0) 217*0Sstevel@tonic-gate { 218*0Sstevel@tonic-gate _failmsg "Unmatched closing bracket: \"$found\"", 219*0Sstevel@tonic-gate pos $$textref; 220*0Sstevel@tonic-gate pos $$textref = $startpos; 221*0Sstevel@tonic-gate return; 222*0Sstevel@tonic-gate } 223*0Sstevel@tonic-gate my $expected = pop(@nesting); 224*0Sstevel@tonic-gate $expected =~ tr/({[</)}]>/; 225*0Sstevel@tonic-gate if ($expected ne $brackettype) 226*0Sstevel@tonic-gate { 227*0Sstevel@tonic-gate _failmsg qq{Mismatched closing bracket: expected "$expected" but found "$found"}, 228*0Sstevel@tonic-gate pos $$textref; 229*0Sstevel@tonic-gate pos $$textref = $startpos; 230*0Sstevel@tonic-gate return; 231*0Sstevel@tonic-gate } 232*0Sstevel@tonic-gate last if $#nesting < 0; 233*0Sstevel@tonic-gate } 234*0Sstevel@tonic-gate elsif ($qdel && $$textref =~ m/\G([$qdel])/gc) 235*0Sstevel@tonic-gate { 236*0Sstevel@tonic-gate $$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gsc and next; 237*0Sstevel@tonic-gate _failmsg "Unmatched embedded quote ($1)", 238*0Sstevel@tonic-gate pos $$textref; 239*0Sstevel@tonic-gate pos $$textref = $startpos; 240*0Sstevel@tonic-gate return; 241*0Sstevel@tonic-gate } 242*0Sstevel@tonic-gate elsif ($quotelike && _match_quotelike($textref,"",1,0)) 243*0Sstevel@tonic-gate { 244*0Sstevel@tonic-gate next; 245*0Sstevel@tonic-gate } 246*0Sstevel@tonic-gate 247*0Sstevel@tonic-gate else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs } 248*0Sstevel@tonic-gate } 249*0Sstevel@tonic-gate if ($#nesting>=0) 250*0Sstevel@tonic-gate { 251*0Sstevel@tonic-gate _failmsg "Unmatched opening bracket(s): " 252*0Sstevel@tonic-gate . join("..",@nesting)."..", 253*0Sstevel@tonic-gate pos $$textref; 254*0Sstevel@tonic-gate pos $$textref = $startpos; 255*0Sstevel@tonic-gate return; 256*0Sstevel@tonic-gate } 257*0Sstevel@tonic-gate 258*0Sstevel@tonic-gate $endpos = pos $$textref; 259*0Sstevel@tonic-gate 260*0Sstevel@tonic-gate return ( 261*0Sstevel@tonic-gate $startpos, $ldelpos-$startpos, # PREFIX 262*0Sstevel@tonic-gate $ldelpos, 1, # OPENING BRACKET 263*0Sstevel@tonic-gate $ldelpos+1, $endpos-$ldelpos-2, # CONTENTS 264*0Sstevel@tonic-gate $endpos-1, 1, # CLOSING BRACKET 265*0Sstevel@tonic-gate $endpos, length($$textref)-$endpos, # REMAINDER 266*0Sstevel@tonic-gate ); 267*0Sstevel@tonic-gate} 268*0Sstevel@tonic-gate 269*0Sstevel@tonic-gatesub revbracket($) 270*0Sstevel@tonic-gate{ 271*0Sstevel@tonic-gate my $brack = reverse $_[0]; 272*0Sstevel@tonic-gate $brack =~ tr/[({</])}>/; 273*0Sstevel@tonic-gate return $brack; 274*0Sstevel@tonic-gate} 275*0Sstevel@tonic-gate 276*0Sstevel@tonic-gatemy $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*}; 277*0Sstevel@tonic-gate 278*0Sstevel@tonic-gatesub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options) 279*0Sstevel@tonic-gate{ 280*0Sstevel@tonic-gate my $textref = defined $_[0] ? \$_[0] : \$_; 281*0Sstevel@tonic-gate my $ldel = $_[1]; 282*0Sstevel@tonic-gate my $rdel = $_[2]; 283*0Sstevel@tonic-gate my $pre = defined $_[3] ? $_[3] : '\s*'; 284*0Sstevel@tonic-gate my %options = defined $_[4] ? %{$_[4]} : (); 285*0Sstevel@tonic-gate my $omode = defined $options{fail} ? $options{fail} : ''; 286*0Sstevel@tonic-gate my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}}) 287*0Sstevel@tonic-gate : defined($options{reject}) ? $options{reject} 288*0Sstevel@tonic-gate : '' 289*0Sstevel@tonic-gate ; 290*0Sstevel@tonic-gate my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}}) 291*0Sstevel@tonic-gate : defined($options{ignore}) ? $options{ignore} 292*0Sstevel@tonic-gate : '' 293*0Sstevel@tonic-gate ; 294*0Sstevel@tonic-gate 295*0Sstevel@tonic-gate if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; } 296*0Sstevel@tonic-gate $@ = undef; 297*0Sstevel@tonic-gate 298*0Sstevel@tonic-gate my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore); 299*0Sstevel@tonic-gate 300*0Sstevel@tonic-gate return _fail(wantarray, $textref) unless @match; 301*0Sstevel@tonic-gate return _succeed wantarray, $textref, 302*0Sstevel@tonic-gate $match[2], $match[3]+$match[5]+$match[7], # MATCH 303*0Sstevel@tonic-gate @match[8..9,0..1,2..7]; # REM, PRE, BITS 304*0Sstevel@tonic-gate} 305*0Sstevel@tonic-gate 306*0Sstevel@tonic-gatesub _match_tagged # ($$$$$$$) 307*0Sstevel@tonic-gate{ 308*0Sstevel@tonic-gate my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_; 309*0Sstevel@tonic-gate my $rdelspec; 310*0Sstevel@tonic-gate 311*0Sstevel@tonic-gate my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 ); 312*0Sstevel@tonic-gate 313*0Sstevel@tonic-gate unless ($$textref =~ m/\G($pre)/gc) 314*0Sstevel@tonic-gate { 315*0Sstevel@tonic-gate _failmsg "Did not find prefix: /$pre/", pos $$textref; 316*0Sstevel@tonic-gate goto failed; 317*0Sstevel@tonic-gate } 318*0Sstevel@tonic-gate 319*0Sstevel@tonic-gate $opentagpos = pos($$textref); 320*0Sstevel@tonic-gate 321*0Sstevel@tonic-gate unless ($$textref =~ m/\G$ldel/gc) 322*0Sstevel@tonic-gate { 323*0Sstevel@tonic-gate _failmsg "Did not find opening tag: /$ldel/", pos $$textref; 324*0Sstevel@tonic-gate goto failed; 325*0Sstevel@tonic-gate } 326*0Sstevel@tonic-gate 327*0Sstevel@tonic-gate $textpos = pos($$textref); 328*0Sstevel@tonic-gate 329*0Sstevel@tonic-gate if (!defined $rdel) 330*0Sstevel@tonic-gate { 331*0Sstevel@tonic-gate $rdelspec = $&; 332*0Sstevel@tonic-gate unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". revbracket($1) /oes) 333*0Sstevel@tonic-gate { 334*0Sstevel@tonic-gate _failmsg "Unable to construct closing tag to match: $rdel", 335*0Sstevel@tonic-gate pos $$textref; 336*0Sstevel@tonic-gate goto failed; 337*0Sstevel@tonic-gate } 338*0Sstevel@tonic-gate } 339*0Sstevel@tonic-gate else 340*0Sstevel@tonic-gate { 341*0Sstevel@tonic-gate $rdelspec = eval "qq{$rdel}" || do { 342*0Sstevel@tonic-gate my $del; 343*0Sstevel@tonic-gate for (qw,~ ! ^ & * ) _ + - = } ] : " ; ' > . ? / | ',) 344*0Sstevel@tonic-gate { next if $rdel =~ /\Q$_/; $del = $_; last } 345*0Sstevel@tonic-gate unless ($del) { 346*0Sstevel@tonic-gate use Carp; 347*0Sstevel@tonic-gate croak "Can't interpolate right delimiter $rdel" 348*0Sstevel@tonic-gate } 349*0Sstevel@tonic-gate eval "qq$del$rdel$del"; 350*0Sstevel@tonic-gate }; 351*0Sstevel@tonic-gate } 352*0Sstevel@tonic-gate 353*0Sstevel@tonic-gate while (pos($$textref) < length($$textref)) 354*0Sstevel@tonic-gate { 355*0Sstevel@tonic-gate next if $$textref =~ m/\G\\./gc; 356*0Sstevel@tonic-gate 357*0Sstevel@tonic-gate if ($$textref =~ m/\G(\n[ \t]*\n)/gc ) 358*0Sstevel@tonic-gate { 359*0Sstevel@tonic-gate $parapos = pos($$textref) - length($1) 360*0Sstevel@tonic-gate unless defined $parapos; 361*0Sstevel@tonic-gate } 362*0Sstevel@tonic-gate elsif ($$textref =~ m/\G($rdelspec)/gc ) 363*0Sstevel@tonic-gate { 364*0Sstevel@tonic-gate $closetagpos = pos($$textref)-length($1); 365*0Sstevel@tonic-gate goto matched; 366*0Sstevel@tonic-gate } 367*0Sstevel@tonic-gate elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc) 368*0Sstevel@tonic-gate { 369*0Sstevel@tonic-gate next; 370*0Sstevel@tonic-gate } 371*0Sstevel@tonic-gate elsif ($bad && $$textref =~ m/\G($bad)/gcs) 372*0Sstevel@tonic-gate { 373*0Sstevel@tonic-gate pos($$textref) -= length($1); # CUT OFF WHATEVER CAUSED THE SHORTNESS 374*0Sstevel@tonic-gate goto short if ($omode eq 'PARA' || $omode eq 'MAX'); 375*0Sstevel@tonic-gate _failmsg "Found invalid nested tag: $1", pos $$textref; 376*0Sstevel@tonic-gate goto failed; 377*0Sstevel@tonic-gate } 378*0Sstevel@tonic-gate elsif ($$textref =~ m/\G($ldel)/gc) 379*0Sstevel@tonic-gate { 380*0Sstevel@tonic-gate my $tag = $1; 381*0Sstevel@tonic-gate pos($$textref) -= length($tag); # REWIND TO NESTED TAG 382*0Sstevel@tonic-gate unless (_match_tagged(@_)) # MATCH NESTED TAG 383*0Sstevel@tonic-gate { 384*0Sstevel@tonic-gate goto short if $omode eq 'PARA' || $omode eq 'MAX'; 385*0Sstevel@tonic-gate _failmsg "Found unbalanced nested tag: $tag", 386*0Sstevel@tonic-gate pos $$textref; 387*0Sstevel@tonic-gate goto failed; 388*0Sstevel@tonic-gate } 389*0Sstevel@tonic-gate } 390*0Sstevel@tonic-gate else { $$textref =~ m/./gcs } 391*0Sstevel@tonic-gate } 392*0Sstevel@tonic-gate 393*0Sstevel@tonic-gateshort: 394*0Sstevel@tonic-gate $closetagpos = pos($$textref); 395*0Sstevel@tonic-gate goto matched if $omode eq 'MAX'; 396*0Sstevel@tonic-gate goto failed unless $omode eq 'PARA'; 397*0Sstevel@tonic-gate 398*0Sstevel@tonic-gate if (defined $parapos) { pos($$textref) = $parapos } 399*0Sstevel@tonic-gate else { $parapos = pos($$textref) } 400*0Sstevel@tonic-gate 401*0Sstevel@tonic-gate return ( 402*0Sstevel@tonic-gate $startpos, $opentagpos-$startpos, # PREFIX 403*0Sstevel@tonic-gate $opentagpos, $textpos-$opentagpos, # OPENING TAG 404*0Sstevel@tonic-gate $textpos, $parapos-$textpos, # TEXT 405*0Sstevel@tonic-gate $parapos, 0, # NO CLOSING TAG 406*0Sstevel@tonic-gate $parapos, length($$textref)-$parapos, # REMAINDER 407*0Sstevel@tonic-gate ); 408*0Sstevel@tonic-gate 409*0Sstevel@tonic-gatematched: 410*0Sstevel@tonic-gate $endpos = pos($$textref); 411*0Sstevel@tonic-gate return ( 412*0Sstevel@tonic-gate $startpos, $opentagpos-$startpos, # PREFIX 413*0Sstevel@tonic-gate $opentagpos, $textpos-$opentagpos, # OPENING TAG 414*0Sstevel@tonic-gate $textpos, $closetagpos-$textpos, # TEXT 415*0Sstevel@tonic-gate $closetagpos, $endpos-$closetagpos, # CLOSING TAG 416*0Sstevel@tonic-gate $endpos, length($$textref)-$endpos, # REMAINDER 417*0Sstevel@tonic-gate ); 418*0Sstevel@tonic-gate 419*0Sstevel@tonic-gatefailed: 420*0Sstevel@tonic-gate _failmsg "Did not find closing tag", pos $$textref unless $@; 421*0Sstevel@tonic-gate pos($$textref) = $startpos; 422*0Sstevel@tonic-gate return; 423*0Sstevel@tonic-gate} 424*0Sstevel@tonic-gate 425*0Sstevel@tonic-gatesub extract_variable (;$$) 426*0Sstevel@tonic-gate{ 427*0Sstevel@tonic-gate my $textref = defined $_[0] ? \$_[0] : \$_; 428*0Sstevel@tonic-gate return ("","","") unless defined $$textref; 429*0Sstevel@tonic-gate my $pre = defined $_[1] ? $_[1] : '\s*'; 430*0Sstevel@tonic-gate 431*0Sstevel@tonic-gate my @match = _match_variable($textref,$pre); 432*0Sstevel@tonic-gate 433*0Sstevel@tonic-gate return _fail wantarray, $textref unless @match; 434*0Sstevel@tonic-gate 435*0Sstevel@tonic-gate return _succeed wantarray, $textref, 436*0Sstevel@tonic-gate @match[2..3,4..5,0..1]; # MATCH, REMAINDER, PREFIX 437*0Sstevel@tonic-gate} 438*0Sstevel@tonic-gate 439*0Sstevel@tonic-gatesub _match_variable($$) 440*0Sstevel@tonic-gate{ 441*0Sstevel@tonic-gate# $# 442*0Sstevel@tonic-gate# $^ 443*0Sstevel@tonic-gate# $$ 444*0Sstevel@tonic-gate my ($textref, $pre) = @_; 445*0Sstevel@tonic-gate my $startpos = pos($$textref) = pos($$textref)||0; 446*0Sstevel@tonic-gate unless ($$textref =~ m/\G($pre)/gc) 447*0Sstevel@tonic-gate { 448*0Sstevel@tonic-gate _failmsg "Did not find prefix: /$pre/", pos $$textref; 449*0Sstevel@tonic-gate return; 450*0Sstevel@tonic-gate } 451*0Sstevel@tonic-gate my $varpos = pos($$textref); 452*0Sstevel@tonic-gate unless ($$textref =~ m{\G\$\s*(?!::)(\d+|[][&`'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci) 453*0Sstevel@tonic-gate { 454*0Sstevel@tonic-gate unless ($$textref =~ m/\G((\$#?|[*\@\%]|\\&)+)/gc) 455*0Sstevel@tonic-gate { 456*0Sstevel@tonic-gate _failmsg "Did not find leading dereferencer", pos $$textref; 457*0Sstevel@tonic-gate pos $$textref = $startpos; 458*0Sstevel@tonic-gate return; 459*0Sstevel@tonic-gate } 460*0Sstevel@tonic-gate my $deref = $1; 461*0Sstevel@tonic-gate 462*0Sstevel@tonic-gate unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci 463*0Sstevel@tonic-gate or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0) 464*0Sstevel@tonic-gate or $deref eq '$#' or $deref eq '$$' ) 465*0Sstevel@tonic-gate { 466*0Sstevel@tonic-gate _failmsg "Bad identifier after dereferencer", pos $$textref; 467*0Sstevel@tonic-gate pos $$textref = $startpos; 468*0Sstevel@tonic-gate return; 469*0Sstevel@tonic-gate } 470*0Sstevel@tonic-gate } 471*0Sstevel@tonic-gate 472*0Sstevel@tonic-gate while (1) 473*0Sstevel@tonic-gate { 474*0Sstevel@tonic-gate next if $$textref =~ m/\G\s*(?:->)?\s*[{]\w+[}]/gc; 475*0Sstevel@tonic-gate next if _match_codeblock($textref, 476*0Sstevel@tonic-gate qr/\s*->\s*(?:[_a-zA-Z]\w+\s*)?/, 477*0Sstevel@tonic-gate qr/[({[]/, qr/[)}\]]/, 478*0Sstevel@tonic-gate qr/[({[]/, qr/[)}\]]/, 0); 479*0Sstevel@tonic-gate next if _match_codeblock($textref, 480*0Sstevel@tonic-gate qr/\s*/, qr/[{[]/, qr/[}\]]/, 481*0Sstevel@tonic-gate qr/[{[]/, qr/[}\]]/, 0); 482*0Sstevel@tonic-gate next if _match_variable($textref,'\s*->\s*'); 483*0Sstevel@tonic-gate next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc; 484*0Sstevel@tonic-gate last; 485*0Sstevel@tonic-gate } 486*0Sstevel@tonic-gate 487*0Sstevel@tonic-gate my $endpos = pos($$textref); 488*0Sstevel@tonic-gate return ($startpos, $varpos-$startpos, 489*0Sstevel@tonic-gate $varpos, $endpos-$varpos, 490*0Sstevel@tonic-gate $endpos, length($$textref)-$endpos 491*0Sstevel@tonic-gate ); 492*0Sstevel@tonic-gate} 493*0Sstevel@tonic-gate 494*0Sstevel@tonic-gatesub extract_codeblock (;$$$$$) 495*0Sstevel@tonic-gate{ 496*0Sstevel@tonic-gate my $textref = defined $_[0] ? \$_[0] : \$_; 497*0Sstevel@tonic-gate my $wantarray = wantarray; 498*0Sstevel@tonic-gate my $ldel_inner = defined $_[1] ? $_[1] : '{'; 499*0Sstevel@tonic-gate my $pre = defined $_[2] ? $_[2] : '\s*'; 500*0Sstevel@tonic-gate my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner; 501*0Sstevel@tonic-gate my $rd = $_[4]; 502*0Sstevel@tonic-gate my $rdel_inner = $ldel_inner; 503*0Sstevel@tonic-gate my $rdel_outer = $ldel_outer; 504*0Sstevel@tonic-gate my $posbug = pos; 505*0Sstevel@tonic-gate for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds } 506*0Sstevel@tonic-gate for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds } 507*0Sstevel@tonic-gate for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer) 508*0Sstevel@tonic-gate { 509*0Sstevel@tonic-gate $_ = '('.join('|',map { quotemeta $_ } split('',$_)).')' 510*0Sstevel@tonic-gate } 511*0Sstevel@tonic-gate pos = $posbug; 512*0Sstevel@tonic-gate 513*0Sstevel@tonic-gate my @match = _match_codeblock($textref, $pre, 514*0Sstevel@tonic-gate $ldel_outer, $rdel_outer, 515*0Sstevel@tonic-gate $ldel_inner, $rdel_inner, 516*0Sstevel@tonic-gate $rd); 517*0Sstevel@tonic-gate return _fail($wantarray, $textref) unless @match; 518*0Sstevel@tonic-gate return _succeed($wantarray, $textref, 519*0Sstevel@tonic-gate @match[2..3,4..5,0..1] # MATCH, REMAINDER, PREFIX 520*0Sstevel@tonic-gate ); 521*0Sstevel@tonic-gate 522*0Sstevel@tonic-gate} 523*0Sstevel@tonic-gate 524*0Sstevel@tonic-gatesub _match_codeblock($$$$$$$) 525*0Sstevel@tonic-gate{ 526*0Sstevel@tonic-gate my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd) = @_; 527*0Sstevel@tonic-gate my $startpos = pos($$textref) = pos($$textref) || 0; 528*0Sstevel@tonic-gate unless ($$textref =~ m/\G($pre)/gc) 529*0Sstevel@tonic-gate { 530*0Sstevel@tonic-gate _failmsg qq{Did not match prefix /$pre/ at"} . 531*0Sstevel@tonic-gate substr($$textref,pos($$textref),20) . 532*0Sstevel@tonic-gate q{..."}, 533*0Sstevel@tonic-gate pos $$textref; 534*0Sstevel@tonic-gate return; 535*0Sstevel@tonic-gate } 536*0Sstevel@tonic-gate my $codepos = pos($$textref); 537*0Sstevel@tonic-gate unless ($$textref =~ m/\G($ldel_outer)/gc) # OUTERMOST DELIMITER 538*0Sstevel@tonic-gate { 539*0Sstevel@tonic-gate _failmsg qq{Did not find expected opening bracket at "} . 540*0Sstevel@tonic-gate substr($$textref,pos($$textref),20) . 541*0Sstevel@tonic-gate q{..."}, 542*0Sstevel@tonic-gate pos $$textref; 543*0Sstevel@tonic-gate pos $$textref = $startpos; 544*0Sstevel@tonic-gate return; 545*0Sstevel@tonic-gate } 546*0Sstevel@tonic-gate my $closing = $1; 547*0Sstevel@tonic-gate $closing =~ tr/([<{/)]>}/; 548*0Sstevel@tonic-gate my $matched; 549*0Sstevel@tonic-gate my $patvalid = 1; 550*0Sstevel@tonic-gate while (pos($$textref) < length($$textref)) 551*0Sstevel@tonic-gate { 552*0Sstevel@tonic-gate $matched = ''; 553*0Sstevel@tonic-gate if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc) 554*0Sstevel@tonic-gate { 555*0Sstevel@tonic-gate $patvalid = 0; 556*0Sstevel@tonic-gate next; 557*0Sstevel@tonic-gate } 558*0Sstevel@tonic-gate 559*0Sstevel@tonic-gate if ($$textref =~ m/\G\s*#.*/gc) 560*0Sstevel@tonic-gate { 561*0Sstevel@tonic-gate next; 562*0Sstevel@tonic-gate } 563*0Sstevel@tonic-gate 564*0Sstevel@tonic-gate if ($$textref =~ m/\G\s*($rdel_outer)/gc) 565*0Sstevel@tonic-gate { 566*0Sstevel@tonic-gate unless ($matched = ($closing && $1 eq $closing) ) 567*0Sstevel@tonic-gate { 568*0Sstevel@tonic-gate next if $1 eq '>'; # MIGHT BE A "LESS THAN" 569*0Sstevel@tonic-gate _failmsg q{Mismatched closing bracket at "} . 570*0Sstevel@tonic-gate substr($$textref,pos($$textref),20) . 571*0Sstevel@tonic-gate qq{...". Expected '$closing'}, 572*0Sstevel@tonic-gate pos $$textref; 573*0Sstevel@tonic-gate } 574*0Sstevel@tonic-gate last; 575*0Sstevel@tonic-gate } 576*0Sstevel@tonic-gate 577*0Sstevel@tonic-gate if (_match_variable($textref,'\s*') || 578*0Sstevel@tonic-gate _match_quotelike($textref,'\s*',$patvalid,$patvalid) ) 579*0Sstevel@tonic-gate { 580*0Sstevel@tonic-gate $patvalid = 0; 581*0Sstevel@tonic-gate next; 582*0Sstevel@tonic-gate } 583*0Sstevel@tonic-gate 584*0Sstevel@tonic-gate 585*0Sstevel@tonic-gate # NEED TO COVER MANY MORE CASES HERE!!! 586*0Sstevel@tonic-gate if ($$textref =~ m#\G\s*(?!$ldel_inner) 587*0Sstevel@tonic-gate ( [-+*x/%^&|.]=? 588*0Sstevel@tonic-gate | [!=]~ 589*0Sstevel@tonic-gate | =(?!>) 590*0Sstevel@tonic-gate | (\*\*|&&|\|\||<<|>>)=? 591*0Sstevel@tonic-gate | split|grep|map|return 592*0Sstevel@tonic-gate | [([] 593*0Sstevel@tonic-gate )#gcx) 594*0Sstevel@tonic-gate { 595*0Sstevel@tonic-gate $patvalid = 1; 596*0Sstevel@tonic-gate next; 597*0Sstevel@tonic-gate } 598*0Sstevel@tonic-gate 599*0Sstevel@tonic-gate if ( _match_codeblock($textref, '\s*', $ldel_inner, $rdel_inner, $ldel_inner, $rdel_inner, $rd) ) 600*0Sstevel@tonic-gate { 601*0Sstevel@tonic-gate $patvalid = 1; 602*0Sstevel@tonic-gate next; 603*0Sstevel@tonic-gate } 604*0Sstevel@tonic-gate 605*0Sstevel@tonic-gate if ($$textref =~ m/\G\s*$ldel_outer/gc) 606*0Sstevel@tonic-gate { 607*0Sstevel@tonic-gate _failmsg q{Improperly nested codeblock at "} . 608*0Sstevel@tonic-gate substr($$textref,pos($$textref),20) . 609*0Sstevel@tonic-gate q{..."}, 610*0Sstevel@tonic-gate pos $$textref; 611*0Sstevel@tonic-gate last; 612*0Sstevel@tonic-gate } 613*0Sstevel@tonic-gate 614*0Sstevel@tonic-gate $patvalid = 0; 615*0Sstevel@tonic-gate $$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc; 616*0Sstevel@tonic-gate } 617*0Sstevel@tonic-gate continue { $@ = undef } 618*0Sstevel@tonic-gate 619*0Sstevel@tonic-gate unless ($matched) 620*0Sstevel@tonic-gate { 621*0Sstevel@tonic-gate _failmsg 'No match found for opening bracket', pos $$textref 622*0Sstevel@tonic-gate unless $@; 623*0Sstevel@tonic-gate return; 624*0Sstevel@tonic-gate } 625*0Sstevel@tonic-gate 626*0Sstevel@tonic-gate my $endpos = pos($$textref); 627*0Sstevel@tonic-gate return ( $startpos, $codepos-$startpos, 628*0Sstevel@tonic-gate $codepos, $endpos-$codepos, 629*0Sstevel@tonic-gate $endpos, length($$textref)-$endpos, 630*0Sstevel@tonic-gate ); 631*0Sstevel@tonic-gate} 632*0Sstevel@tonic-gate 633*0Sstevel@tonic-gate 634*0Sstevel@tonic-gatemy %mods = ( 635*0Sstevel@tonic-gate 'none' => '[cgimsox]*', 636*0Sstevel@tonic-gate 'm' => '[cgimsox]*', 637*0Sstevel@tonic-gate 's' => '[cegimsox]*', 638*0Sstevel@tonic-gate 'tr' => '[cds]*', 639*0Sstevel@tonic-gate 'y' => '[cds]*', 640*0Sstevel@tonic-gate 'qq' => '', 641*0Sstevel@tonic-gate 'qx' => '', 642*0Sstevel@tonic-gate 'qw' => '', 643*0Sstevel@tonic-gate 'qr' => '[imsx]*', 644*0Sstevel@tonic-gate 'q' => '', 645*0Sstevel@tonic-gate ); 646*0Sstevel@tonic-gate 647*0Sstevel@tonic-gatesub extract_quotelike (;$$) 648*0Sstevel@tonic-gate{ 649*0Sstevel@tonic-gate my $textref = $_[0] ? \$_[0] : \$_; 650*0Sstevel@tonic-gate my $wantarray = wantarray; 651*0Sstevel@tonic-gate my $pre = defined $_[1] ? $_[1] : '\s*'; 652*0Sstevel@tonic-gate 653*0Sstevel@tonic-gate my @match = _match_quotelike($textref,$pre,1,0); 654*0Sstevel@tonic-gate return _fail($wantarray, $textref) unless @match; 655*0Sstevel@tonic-gate return _succeed($wantarray, $textref, 656*0Sstevel@tonic-gate $match[2], $match[18]-$match[2], # MATCH 657*0Sstevel@tonic-gate @match[18,19], # REMAINDER 658*0Sstevel@tonic-gate @match[0,1], # PREFIX 659*0Sstevel@tonic-gate @match[2..17], # THE BITS 660*0Sstevel@tonic-gate @match[20,21], # ANY FILLET? 661*0Sstevel@tonic-gate ); 662*0Sstevel@tonic-gate}; 663*0Sstevel@tonic-gate 664*0Sstevel@tonic-gatesub _match_quotelike($$$$) # ($textref, $prepat, $allow_raw_match) 665*0Sstevel@tonic-gate{ 666*0Sstevel@tonic-gate my ($textref, $pre, $rawmatch, $qmark) = @_; 667*0Sstevel@tonic-gate 668*0Sstevel@tonic-gate my ($textlen,$startpos, 669*0Sstevel@tonic-gate $oppos, 670*0Sstevel@tonic-gate $preld1pos,$ld1pos,$str1pos,$rd1pos, 671*0Sstevel@tonic-gate $preld2pos,$ld2pos,$str2pos,$rd2pos, 672*0Sstevel@tonic-gate $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 ); 673*0Sstevel@tonic-gate 674*0Sstevel@tonic-gate unless ($$textref =~ m/\G($pre)/gc) 675*0Sstevel@tonic-gate { 676*0Sstevel@tonic-gate _failmsg qq{Did not find prefix /$pre/ at "} . 677*0Sstevel@tonic-gate substr($$textref, pos($$textref), 20) . 678*0Sstevel@tonic-gate q{..."}, 679*0Sstevel@tonic-gate pos $$textref; 680*0Sstevel@tonic-gate return; 681*0Sstevel@tonic-gate } 682*0Sstevel@tonic-gate $oppos = pos($$textref); 683*0Sstevel@tonic-gate 684*0Sstevel@tonic-gate my $initial = substr($$textref,$oppos,1); 685*0Sstevel@tonic-gate 686*0Sstevel@tonic-gate if ($initial && $initial =~ m|^[\"\'\`]| 687*0Sstevel@tonic-gate || $rawmatch && $initial =~ m|^/| 688*0Sstevel@tonic-gate || $qmark && $initial =~ m|^\?|) 689*0Sstevel@tonic-gate { 690*0Sstevel@tonic-gate unless ($$textref =~ m/ \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcsx) 691*0Sstevel@tonic-gate { 692*0Sstevel@tonic-gate _failmsg qq{Did not find closing delimiter to match '$initial' at "} . 693*0Sstevel@tonic-gate substr($$textref, $oppos, 20) . 694*0Sstevel@tonic-gate q{..."}, 695*0Sstevel@tonic-gate pos $$textref; 696*0Sstevel@tonic-gate pos $$textref = $startpos; 697*0Sstevel@tonic-gate return; 698*0Sstevel@tonic-gate } 699*0Sstevel@tonic-gate $modpos= pos($$textref); 700*0Sstevel@tonic-gate $rd1pos = $modpos-1; 701*0Sstevel@tonic-gate 702*0Sstevel@tonic-gate if ($initial eq '/' || $initial eq '?') 703*0Sstevel@tonic-gate { 704*0Sstevel@tonic-gate $$textref =~ m/\G$mods{none}/gc 705*0Sstevel@tonic-gate } 706*0Sstevel@tonic-gate 707*0Sstevel@tonic-gate my $endpos = pos($$textref); 708*0Sstevel@tonic-gate return ( 709*0Sstevel@tonic-gate $startpos, $oppos-$startpos, # PREFIX 710*0Sstevel@tonic-gate $oppos, 0, # NO OPERATOR 711*0Sstevel@tonic-gate $oppos, 1, # LEFT DEL 712*0Sstevel@tonic-gate $oppos+1, $rd1pos-$oppos-1, # STR/PAT 713*0Sstevel@tonic-gate $rd1pos, 1, # RIGHT DEL 714*0Sstevel@tonic-gate $modpos, 0, # NO 2ND LDEL 715*0Sstevel@tonic-gate $modpos, 0, # NO 2ND STR 716*0Sstevel@tonic-gate $modpos, 0, # NO 2ND RDEL 717*0Sstevel@tonic-gate $modpos, $endpos-$modpos, # MODIFIERS 718*0Sstevel@tonic-gate $endpos, $textlen-$endpos, # REMAINDER 719*0Sstevel@tonic-gate ); 720*0Sstevel@tonic-gate } 721*0Sstevel@tonic-gate 722*0Sstevel@tonic-gate unless ($$textref =~ m{\G(\b(?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<)}gc) 723*0Sstevel@tonic-gate { 724*0Sstevel@tonic-gate _failmsg q{No quotelike operator found after prefix at "} . 725*0Sstevel@tonic-gate substr($$textref, pos($$textref), 20) . 726*0Sstevel@tonic-gate q{..."}, 727*0Sstevel@tonic-gate pos $$textref; 728*0Sstevel@tonic-gate pos $$textref = $startpos; 729*0Sstevel@tonic-gate return; 730*0Sstevel@tonic-gate } 731*0Sstevel@tonic-gate 732*0Sstevel@tonic-gate my $op = $1; 733*0Sstevel@tonic-gate $preld1pos = pos($$textref); 734*0Sstevel@tonic-gate if ($op eq '<<') { 735*0Sstevel@tonic-gate $ld1pos = pos($$textref); 736*0Sstevel@tonic-gate my $label; 737*0Sstevel@tonic-gate if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) { 738*0Sstevel@tonic-gate $label = $1; 739*0Sstevel@tonic-gate } 740*0Sstevel@tonic-gate elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\]*)*) ' 741*0Sstevel@tonic-gate | \G " ([^"\\]* (?:\\.[^"\\]*)*) " 742*0Sstevel@tonic-gate | \G ` ([^`\\]* (?:\\.[^`\\]*)*) ` 743*0Sstevel@tonic-gate }gcsx) { 744*0Sstevel@tonic-gate $label = $+; 745*0Sstevel@tonic-gate } 746*0Sstevel@tonic-gate else { 747*0Sstevel@tonic-gate $label = ""; 748*0Sstevel@tonic-gate } 749*0Sstevel@tonic-gate my $extrapos = pos($$textref); 750*0Sstevel@tonic-gate $$textref =~ m{.*\n}gc; 751*0Sstevel@tonic-gate $str1pos = pos($$textref); 752*0Sstevel@tonic-gate unless ($$textref =~ m{.*?\n(?=$label\n)}gc) { 753*0Sstevel@tonic-gate _failmsg qq{Missing here doc terminator ('$label') after "} . 754*0Sstevel@tonic-gate substr($$textref, $startpos, 20) . 755*0Sstevel@tonic-gate q{..."}, 756*0Sstevel@tonic-gate pos $$textref; 757*0Sstevel@tonic-gate pos $$textref = $startpos; 758*0Sstevel@tonic-gate return; 759*0Sstevel@tonic-gate } 760*0Sstevel@tonic-gate $rd1pos = pos($$textref); 761*0Sstevel@tonic-gate $$textref =~ m{$label\n}gc; 762*0Sstevel@tonic-gate $ld2pos = pos($$textref); 763*0Sstevel@tonic-gate return ( 764*0Sstevel@tonic-gate $startpos, $oppos-$startpos, # PREFIX 765*0Sstevel@tonic-gate $oppos, length($op), # OPERATOR 766*0Sstevel@tonic-gate $ld1pos, $extrapos-$ld1pos, # LEFT DEL 767*0Sstevel@tonic-gate $str1pos, $rd1pos-$str1pos, # STR/PAT 768*0Sstevel@tonic-gate $rd1pos, $ld2pos-$rd1pos, # RIGHT DEL 769*0Sstevel@tonic-gate $ld2pos, 0, # NO 2ND LDEL 770*0Sstevel@tonic-gate $ld2pos, 0, # NO 2ND STR 771*0Sstevel@tonic-gate $ld2pos, 0, # NO 2ND RDEL 772*0Sstevel@tonic-gate $ld2pos, 0, # NO MODIFIERS 773*0Sstevel@tonic-gate $ld2pos, $textlen-$ld2pos, # REMAINDER 774*0Sstevel@tonic-gate $extrapos, $str1pos-$extrapos, # FILLETED BIT 775*0Sstevel@tonic-gate ); 776*0Sstevel@tonic-gate } 777*0Sstevel@tonic-gate 778*0Sstevel@tonic-gate $$textref =~ m/\G\s*/gc; 779*0Sstevel@tonic-gate $ld1pos = pos($$textref); 780*0Sstevel@tonic-gate $str1pos = $ld1pos+1; 781*0Sstevel@tonic-gate 782*0Sstevel@tonic-gate unless ($$textref =~ m/\G(\S)/gc) # SHOULD USE LOOKAHEAD 783*0Sstevel@tonic-gate { 784*0Sstevel@tonic-gate _failmsg "No block delimiter found after quotelike $op", 785*0Sstevel@tonic-gate pos $$textref; 786*0Sstevel@tonic-gate pos $$textref = $startpos; 787*0Sstevel@tonic-gate return; 788*0Sstevel@tonic-gate } 789*0Sstevel@tonic-gate pos($$textref) = $ld1pos; # HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN 790*0Sstevel@tonic-gate my ($ldel1, $rdel1) = ("\Q$1","\Q$1"); 791*0Sstevel@tonic-gate if ($ldel1 =~ /[[(<{]/) 792*0Sstevel@tonic-gate { 793*0Sstevel@tonic-gate $rdel1 =~ tr/[({</])}>/; 794*0Sstevel@tonic-gate _match_bracketed($textref,"",$ldel1,"","",$rdel1) 795*0Sstevel@tonic-gate || do { pos $$textref = $startpos; return }; 796*0Sstevel@tonic-gate } 797*0Sstevel@tonic-gate else 798*0Sstevel@tonic-gate { 799*0Sstevel@tonic-gate $$textref =~ /$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gcs 800*0Sstevel@tonic-gate || do { pos $$textref = $startpos; return }; 801*0Sstevel@tonic-gate } 802*0Sstevel@tonic-gate $ld2pos = $rd1pos = pos($$textref)-1; 803*0Sstevel@tonic-gate 804*0Sstevel@tonic-gate my $second_arg = $op =~ /s|tr|y/ ? 1 : 0; 805*0Sstevel@tonic-gate if ($second_arg) 806*0Sstevel@tonic-gate { 807*0Sstevel@tonic-gate my ($ldel2, $rdel2); 808*0Sstevel@tonic-gate if ($ldel1 =~ /[[(<{]/) 809*0Sstevel@tonic-gate { 810*0Sstevel@tonic-gate unless ($$textref =~ /\G\s*(\S)/gc) # SHOULD USE LOOKAHEAD 811*0Sstevel@tonic-gate { 812*0Sstevel@tonic-gate _failmsg "Missing second block for quotelike $op", 813*0Sstevel@tonic-gate pos $$textref; 814*0Sstevel@tonic-gate pos $$textref = $startpos; 815*0Sstevel@tonic-gate return; 816*0Sstevel@tonic-gate } 817*0Sstevel@tonic-gate $ldel2 = $rdel2 = "\Q$1"; 818*0Sstevel@tonic-gate $rdel2 =~ tr/[({</])}>/; 819*0Sstevel@tonic-gate } 820*0Sstevel@tonic-gate else 821*0Sstevel@tonic-gate { 822*0Sstevel@tonic-gate $ldel2 = $rdel2 = $ldel1; 823*0Sstevel@tonic-gate } 824*0Sstevel@tonic-gate $str2pos = $ld2pos+1; 825*0Sstevel@tonic-gate 826*0Sstevel@tonic-gate if ($ldel2 =~ /[[(<{]/) 827*0Sstevel@tonic-gate { 828*0Sstevel@tonic-gate pos($$textref)--; # OVERCOME BROKEN LOOKAHEAD 829*0Sstevel@tonic-gate _match_bracketed($textref,"",$ldel2,"","",$rdel2) 830*0Sstevel@tonic-gate || do { pos $$textref = $startpos; return }; 831*0Sstevel@tonic-gate } 832*0Sstevel@tonic-gate else 833*0Sstevel@tonic-gate { 834*0Sstevel@tonic-gate $$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gcs 835*0Sstevel@tonic-gate || do { pos $$textref = $startpos; return }; 836*0Sstevel@tonic-gate } 837*0Sstevel@tonic-gate $rd2pos = pos($$textref)-1; 838*0Sstevel@tonic-gate } 839*0Sstevel@tonic-gate else 840*0Sstevel@tonic-gate { 841*0Sstevel@tonic-gate $ld2pos = $str2pos = $rd2pos = $rd1pos; 842*0Sstevel@tonic-gate } 843*0Sstevel@tonic-gate 844*0Sstevel@tonic-gate $modpos = pos $$textref; 845*0Sstevel@tonic-gate 846*0Sstevel@tonic-gate $$textref =~ m/\G($mods{$op})/gc; 847*0Sstevel@tonic-gate my $endpos = pos $$textref; 848*0Sstevel@tonic-gate 849*0Sstevel@tonic-gate return ( 850*0Sstevel@tonic-gate $startpos, $oppos-$startpos, # PREFIX 851*0Sstevel@tonic-gate $oppos, length($op), # OPERATOR 852*0Sstevel@tonic-gate $ld1pos, 1, # LEFT DEL 853*0Sstevel@tonic-gate $str1pos, $rd1pos-$str1pos, # STR/PAT 854*0Sstevel@tonic-gate $rd1pos, 1, # RIGHT DEL 855*0Sstevel@tonic-gate $ld2pos, $second_arg, # 2ND LDEL (MAYBE) 856*0Sstevel@tonic-gate $str2pos, $rd2pos-$str2pos, # 2ND STR (MAYBE) 857*0Sstevel@tonic-gate $rd2pos, $second_arg, # 2ND RDEL (MAYBE) 858*0Sstevel@tonic-gate $modpos, $endpos-$modpos, # MODIFIERS 859*0Sstevel@tonic-gate $endpos, $textlen-$endpos, # REMAINDER 860*0Sstevel@tonic-gate ); 861*0Sstevel@tonic-gate} 862*0Sstevel@tonic-gate 863*0Sstevel@tonic-gatemy $def_func = 864*0Sstevel@tonic-gate[ 865*0Sstevel@tonic-gate sub { extract_variable($_[0], '') }, 866*0Sstevel@tonic-gate sub { extract_quotelike($_[0],'') }, 867*0Sstevel@tonic-gate sub { extract_codeblock($_[0],'{}','') }, 868*0Sstevel@tonic-gate]; 869*0Sstevel@tonic-gate 870*0Sstevel@tonic-gatesub extract_multiple (;$$$$) # ($text, $functions_ref, $max_fields, $ignoreunknown) 871*0Sstevel@tonic-gate{ 872*0Sstevel@tonic-gate my $textref = defined($_[0]) ? \$_[0] : \$_; 873*0Sstevel@tonic-gate my $posbug = pos; 874*0Sstevel@tonic-gate my ($lastpos, $firstpos); 875*0Sstevel@tonic-gate my @fields = (); 876*0Sstevel@tonic-gate 877*0Sstevel@tonic-gate #for ($$textref) 878*0Sstevel@tonic-gate { 879*0Sstevel@tonic-gate my @func = defined $_[1] ? @{$_[1]} : @{$def_func}; 880*0Sstevel@tonic-gate my $max = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000; 881*0Sstevel@tonic-gate my $igunk = $_[3]; 882*0Sstevel@tonic-gate 883*0Sstevel@tonic-gate pos $$textref ||= 0; 884*0Sstevel@tonic-gate 885*0Sstevel@tonic-gate unless (wantarray) 886*0Sstevel@tonic-gate { 887*0Sstevel@tonic-gate use Carp; 888*0Sstevel@tonic-gate carp "extract_multiple reset maximal count to 1 in scalar context" 889*0Sstevel@tonic-gate if $^W && defined($_[2]) && $max > 1; 890*0Sstevel@tonic-gate $max = 1 891*0Sstevel@tonic-gate } 892*0Sstevel@tonic-gate 893*0Sstevel@tonic-gate my $unkpos; 894*0Sstevel@tonic-gate my $func; 895*0Sstevel@tonic-gate my $class; 896*0Sstevel@tonic-gate 897*0Sstevel@tonic-gate my @class; 898*0Sstevel@tonic-gate foreach $func ( @func ) 899*0Sstevel@tonic-gate { 900*0Sstevel@tonic-gate if (ref($func) eq 'HASH') 901*0Sstevel@tonic-gate { 902*0Sstevel@tonic-gate push @class, (keys %$func)[0]; 903*0Sstevel@tonic-gate $func = (values %$func)[0]; 904*0Sstevel@tonic-gate } 905*0Sstevel@tonic-gate else 906*0Sstevel@tonic-gate { 907*0Sstevel@tonic-gate push @class, undef; 908*0Sstevel@tonic-gate } 909*0Sstevel@tonic-gate } 910*0Sstevel@tonic-gate 911*0Sstevel@tonic-gate FIELD: while (pos($$textref) < length($$textref)) 912*0Sstevel@tonic-gate { 913*0Sstevel@tonic-gate my ($field, $rem); 914*0Sstevel@tonic-gate my @bits; 915*0Sstevel@tonic-gate foreach my $i ( 0..$#func ) 916*0Sstevel@tonic-gate { 917*0Sstevel@tonic-gate my $pref; 918*0Sstevel@tonic-gate $func = $func[$i]; 919*0Sstevel@tonic-gate $class = $class[$i]; 920*0Sstevel@tonic-gate $lastpos = pos $$textref; 921*0Sstevel@tonic-gate if (ref($func) eq 'CODE') 922*0Sstevel@tonic-gate { ($field,$rem,$pref) = @bits = $func->($$textref); 923*0Sstevel@tonic-gate # print "[$field|$rem]" if $field; 924*0Sstevel@tonic-gate } 925*0Sstevel@tonic-gate elsif (ref($func) eq 'Text::Balanced::Extractor') 926*0Sstevel@tonic-gate { @bits = $field = $func->extract($$textref) } 927*0Sstevel@tonic-gate elsif( $$textref =~ m/\G$func/gc ) 928*0Sstevel@tonic-gate { @bits = $field = defined($1) ? $1 : $& } 929*0Sstevel@tonic-gate $pref ||= ""; 930*0Sstevel@tonic-gate if (defined($field) && length($field)) 931*0Sstevel@tonic-gate { 932*0Sstevel@tonic-gate if (!$igunk) { 933*0Sstevel@tonic-gate $unkpos = pos $$textref 934*0Sstevel@tonic-gate if length($pref) && !defined($unkpos); 935*0Sstevel@tonic-gate if (defined $unkpos) 936*0Sstevel@tonic-gate { 937*0Sstevel@tonic-gate push @fields, substr($$textref, $unkpos, $lastpos-$unkpos).$pref; 938*0Sstevel@tonic-gate $firstpos = $unkpos unless defined $firstpos; 939*0Sstevel@tonic-gate undef $unkpos; 940*0Sstevel@tonic-gate last FIELD if @fields == $max; 941*0Sstevel@tonic-gate } 942*0Sstevel@tonic-gate } 943*0Sstevel@tonic-gate push @fields, $class 944*0Sstevel@tonic-gate ? bless (\$field, $class) 945*0Sstevel@tonic-gate : $field; 946*0Sstevel@tonic-gate $firstpos = $lastpos unless defined $firstpos; 947*0Sstevel@tonic-gate $lastpos = pos $$textref; 948*0Sstevel@tonic-gate last FIELD if @fields == $max; 949*0Sstevel@tonic-gate next FIELD; 950*0Sstevel@tonic-gate } 951*0Sstevel@tonic-gate } 952*0Sstevel@tonic-gate if ($$textref =~ /\G(.)/gcs) 953*0Sstevel@tonic-gate { 954*0Sstevel@tonic-gate $unkpos = pos($$textref)-1 955*0Sstevel@tonic-gate unless $igunk || defined $unkpos; 956*0Sstevel@tonic-gate } 957*0Sstevel@tonic-gate } 958*0Sstevel@tonic-gate 959*0Sstevel@tonic-gate if (defined $unkpos) 960*0Sstevel@tonic-gate { 961*0Sstevel@tonic-gate push @fields, substr($$textref, $unkpos); 962*0Sstevel@tonic-gate $firstpos = $unkpos unless defined $firstpos; 963*0Sstevel@tonic-gate $lastpos = length $$textref; 964*0Sstevel@tonic-gate } 965*0Sstevel@tonic-gate last; 966*0Sstevel@tonic-gate } 967*0Sstevel@tonic-gate 968*0Sstevel@tonic-gate pos $$textref = $lastpos; 969*0Sstevel@tonic-gate return @fields if wantarray; 970*0Sstevel@tonic-gate 971*0Sstevel@tonic-gate $firstpos ||= 0; 972*0Sstevel@tonic-gate eval { substr($$textref,$firstpos,$lastpos-$firstpos)=""; 973*0Sstevel@tonic-gate pos $$textref = $firstpos }; 974*0Sstevel@tonic-gate return $fields[0]; 975*0Sstevel@tonic-gate} 976*0Sstevel@tonic-gate 977*0Sstevel@tonic-gate 978*0Sstevel@tonic-gatesub gen_extract_tagged # ($opentag, $closetag, $pre, \%options) 979*0Sstevel@tonic-gate{ 980*0Sstevel@tonic-gate my $ldel = $_[0]; 981*0Sstevel@tonic-gate my $rdel = $_[1]; 982*0Sstevel@tonic-gate my $pre = defined $_[2] ? $_[2] : '\s*'; 983*0Sstevel@tonic-gate my %options = defined $_[3] ? %{$_[3]} : (); 984*0Sstevel@tonic-gate my $omode = defined $options{fail} ? $options{fail} : ''; 985*0Sstevel@tonic-gate my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}}) 986*0Sstevel@tonic-gate : defined($options{reject}) ? $options{reject} 987*0Sstevel@tonic-gate : '' 988*0Sstevel@tonic-gate ; 989*0Sstevel@tonic-gate my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}}) 990*0Sstevel@tonic-gate : defined($options{ignore}) ? $options{ignore} 991*0Sstevel@tonic-gate : '' 992*0Sstevel@tonic-gate ; 993*0Sstevel@tonic-gate 994*0Sstevel@tonic-gate if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; } 995*0Sstevel@tonic-gate 996*0Sstevel@tonic-gate my $posbug = pos; 997*0Sstevel@tonic-gate for ($ldel, $pre, $bad, $ignore) { $_ = qr/$_/ if $_ } 998*0Sstevel@tonic-gate pos = $posbug; 999*0Sstevel@tonic-gate 1000*0Sstevel@tonic-gate my $closure = sub 1001*0Sstevel@tonic-gate { 1002*0Sstevel@tonic-gate my $textref = defined $_[0] ? \$_[0] : \$_; 1003*0Sstevel@tonic-gate my @match = Text::Balanced::_match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore); 1004*0Sstevel@tonic-gate 1005*0Sstevel@tonic-gate return _fail(wantarray, $textref) unless @match; 1006*0Sstevel@tonic-gate return _succeed wantarray, $textref, 1007*0Sstevel@tonic-gate $match[2], $match[3]+$match[5]+$match[7], # MATCH 1008*0Sstevel@tonic-gate @match[8..9,0..1,2..7]; # REM, PRE, BITS 1009*0Sstevel@tonic-gate }; 1010*0Sstevel@tonic-gate 1011*0Sstevel@tonic-gate bless $closure, 'Text::Balanced::Extractor'; 1012*0Sstevel@tonic-gate} 1013*0Sstevel@tonic-gate 1014*0Sstevel@tonic-gatepackage Text::Balanced::Extractor; 1015*0Sstevel@tonic-gate 1016*0Sstevel@tonic-gatesub extract($$) # ($self, $text) 1017*0Sstevel@tonic-gate{ 1018*0Sstevel@tonic-gate &{$_[0]}($_[1]); 1019*0Sstevel@tonic-gate} 1020*0Sstevel@tonic-gate 1021*0Sstevel@tonic-gatepackage Text::Balanced::ErrorMsg; 1022*0Sstevel@tonic-gate 1023*0Sstevel@tonic-gateuse overload '""' => sub { "$_[0]->{error}, detected at offset $_[0]->{pos}" }; 1024*0Sstevel@tonic-gate 1025*0Sstevel@tonic-gate1; 1026*0Sstevel@tonic-gate 1027*0Sstevel@tonic-gate__END__ 1028*0Sstevel@tonic-gate 1029*0Sstevel@tonic-gate=head1 NAME 1030*0Sstevel@tonic-gate 1031*0Sstevel@tonic-gateText::Balanced - Extract delimited text sequences from strings. 1032*0Sstevel@tonic-gate 1033*0Sstevel@tonic-gate 1034*0Sstevel@tonic-gate=head1 SYNOPSIS 1035*0Sstevel@tonic-gate 1036*0Sstevel@tonic-gate use Text::Balanced qw ( 1037*0Sstevel@tonic-gate extract_delimited 1038*0Sstevel@tonic-gate extract_bracketed 1039*0Sstevel@tonic-gate extract_quotelike 1040*0Sstevel@tonic-gate extract_codeblock 1041*0Sstevel@tonic-gate extract_variable 1042*0Sstevel@tonic-gate extract_tagged 1043*0Sstevel@tonic-gate extract_multiple 1044*0Sstevel@tonic-gate 1045*0Sstevel@tonic-gate gen_delimited_pat 1046*0Sstevel@tonic-gate gen_extract_tagged 1047*0Sstevel@tonic-gate ); 1048*0Sstevel@tonic-gate 1049*0Sstevel@tonic-gate # Extract the initial substring of $text that is delimited by 1050*0Sstevel@tonic-gate # two (unescaped) instances of the first character in $delim. 1051*0Sstevel@tonic-gate 1052*0Sstevel@tonic-gate ($extracted, $remainder) = extract_delimited($text,$delim); 1053*0Sstevel@tonic-gate 1054*0Sstevel@tonic-gate 1055*0Sstevel@tonic-gate # Extract the initial substring of $text that is bracketed 1056*0Sstevel@tonic-gate # with a delimiter(s) specified by $delim (where the string 1057*0Sstevel@tonic-gate # in $delim contains one or more of '(){}[]<>'). 1058*0Sstevel@tonic-gate 1059*0Sstevel@tonic-gate ($extracted, $remainder) = extract_bracketed($text,$delim); 1060*0Sstevel@tonic-gate 1061*0Sstevel@tonic-gate 1062*0Sstevel@tonic-gate # Extract the initial substring of $text that is bounded by 1063*0Sstevel@tonic-gate # an XML tag. 1064*0Sstevel@tonic-gate 1065*0Sstevel@tonic-gate ($extracted, $remainder) = extract_tagged($text); 1066*0Sstevel@tonic-gate 1067*0Sstevel@tonic-gate 1068*0Sstevel@tonic-gate # Extract the initial substring of $text that is bounded by 1069*0Sstevel@tonic-gate # a C<BEGIN>...C<END> pair. Don't allow nested C<BEGIN> tags 1070*0Sstevel@tonic-gate 1071*0Sstevel@tonic-gate ($extracted, $remainder) = 1072*0Sstevel@tonic-gate extract_tagged($text,"BEGIN","END",undef,{bad=>["BEGIN"]}); 1073*0Sstevel@tonic-gate 1074*0Sstevel@tonic-gate 1075*0Sstevel@tonic-gate # Extract the initial substring of $text that represents a 1076*0Sstevel@tonic-gate # Perl "quote or quote-like operation" 1077*0Sstevel@tonic-gate 1078*0Sstevel@tonic-gate ($extracted, $remainder) = extract_quotelike($text); 1079*0Sstevel@tonic-gate 1080*0Sstevel@tonic-gate 1081*0Sstevel@tonic-gate # Extract the initial substring of $text that represents a block 1082*0Sstevel@tonic-gate # of Perl code, bracketed by any of character(s) specified by $delim 1083*0Sstevel@tonic-gate # (where the string $delim contains one or more of '(){}[]<>'). 1084*0Sstevel@tonic-gate 1085*0Sstevel@tonic-gate ($extracted, $remainder) = extract_codeblock($text,$delim); 1086*0Sstevel@tonic-gate 1087*0Sstevel@tonic-gate 1088*0Sstevel@tonic-gate # Extract the initial substrings of $text that would be extracted by 1089*0Sstevel@tonic-gate # one or more sequential applications of the specified functions 1090*0Sstevel@tonic-gate # or regular expressions 1091*0Sstevel@tonic-gate 1092*0Sstevel@tonic-gate @extracted = extract_multiple($text, 1093*0Sstevel@tonic-gate [ \&extract_bracketed, 1094*0Sstevel@tonic-gate \&extract_quotelike, 1095*0Sstevel@tonic-gate \&some_other_extractor_sub, 1096*0Sstevel@tonic-gate qr/[xyz]*/, 1097*0Sstevel@tonic-gate 'literal', 1098*0Sstevel@tonic-gate ]); 1099*0Sstevel@tonic-gate 1100*0Sstevel@tonic-gate# Create a string representing an optimized pattern (a la Friedl) 1101*0Sstevel@tonic-gate# that matches a substring delimited by any of the specified characters 1102*0Sstevel@tonic-gate# (in this case: any type of quote or a slash) 1103*0Sstevel@tonic-gate 1104*0Sstevel@tonic-gate $patstring = gen_delimited_pat(q{'"`/}); 1105*0Sstevel@tonic-gate 1106*0Sstevel@tonic-gate 1107*0Sstevel@tonic-gate# Generate a reference to an anonymous sub that is just like extract_tagged 1108*0Sstevel@tonic-gate# but pre-compiled and optimized for a specific pair of tags, and consequently 1109*0Sstevel@tonic-gate# much faster (i.e. 3 times faster). It uses qr// for better performance on 1110*0Sstevel@tonic-gate# repeated calls, so it only works under Perl 5.005 or later. 1111*0Sstevel@tonic-gate 1112*0Sstevel@tonic-gate $extract_head = gen_extract_tagged('<HEAD>','</HEAD>'); 1113*0Sstevel@tonic-gate 1114*0Sstevel@tonic-gate ($extracted, $remainder) = $extract_head->($text); 1115*0Sstevel@tonic-gate 1116*0Sstevel@tonic-gate 1117*0Sstevel@tonic-gate=head1 DESCRIPTION 1118*0Sstevel@tonic-gate 1119*0Sstevel@tonic-gateThe various C<extract_...> subroutines may be used to 1120*0Sstevel@tonic-gateextract a delimited substring, possibly after skipping a 1121*0Sstevel@tonic-gatespecified prefix string. By default, that prefix is 1122*0Sstevel@tonic-gateoptional whitespace (C</\s*/>), but you can change it to whatever 1123*0Sstevel@tonic-gateyou wish (see below). 1124*0Sstevel@tonic-gate 1125*0Sstevel@tonic-gateThe substring to be extracted must appear at the 1126*0Sstevel@tonic-gatecurrent C<pos> location of the string's variable 1127*0Sstevel@tonic-gate(or at index zero, if no C<pos> position is defined). 1128*0Sstevel@tonic-gateIn other words, the C<extract_...> subroutines I<don't> 1129*0Sstevel@tonic-gateextract the first occurance of a substring anywhere 1130*0Sstevel@tonic-gatein a string (like an unanchored regex would). Rather, 1131*0Sstevel@tonic-gatethey extract an occurance of the substring appearing 1132*0Sstevel@tonic-gateimmediately at the current matching position in the 1133*0Sstevel@tonic-gatestring (like a C<\G>-anchored regex would). 1134*0Sstevel@tonic-gate 1135*0Sstevel@tonic-gate 1136*0Sstevel@tonic-gate 1137*0Sstevel@tonic-gate=head2 General behaviour in list contexts 1138*0Sstevel@tonic-gate 1139*0Sstevel@tonic-gateIn a list context, all the subroutines return a list, the first three 1140*0Sstevel@tonic-gateelements of which are always: 1141*0Sstevel@tonic-gate 1142*0Sstevel@tonic-gate=over 4 1143*0Sstevel@tonic-gate 1144*0Sstevel@tonic-gate=item [0] 1145*0Sstevel@tonic-gate 1146*0Sstevel@tonic-gateThe extracted string, including the specified delimiters. 1147*0Sstevel@tonic-gateIf the extraction fails an empty string is returned. 1148*0Sstevel@tonic-gate 1149*0Sstevel@tonic-gate=item [1] 1150*0Sstevel@tonic-gate 1151*0Sstevel@tonic-gateThe remainder of the input string (i.e. the characters after the 1152*0Sstevel@tonic-gateextracted string). On failure, the entire string is returned. 1153*0Sstevel@tonic-gate 1154*0Sstevel@tonic-gate=item [2] 1155*0Sstevel@tonic-gate 1156*0Sstevel@tonic-gateThe skipped prefix (i.e. the characters before the extracted string). 1157*0Sstevel@tonic-gateOn failure, the empty string is returned. 1158*0Sstevel@tonic-gate 1159*0Sstevel@tonic-gate=back 1160*0Sstevel@tonic-gate 1161*0Sstevel@tonic-gateNote that in a list context, the contents of the original input text (the first 1162*0Sstevel@tonic-gateargument) are not modified in any way. 1163*0Sstevel@tonic-gate 1164*0Sstevel@tonic-gateHowever, if the input text was passed in a variable, that variable's 1165*0Sstevel@tonic-gateC<pos> value is updated to point at the first character after the 1166*0Sstevel@tonic-gateextracted text. That means that in a list context the various 1167*0Sstevel@tonic-gatesubroutines can be used much like regular expressions. For example: 1168*0Sstevel@tonic-gate 1169*0Sstevel@tonic-gate while ( $next = (extract_quotelike($text))[0] ) 1170*0Sstevel@tonic-gate { 1171*0Sstevel@tonic-gate # process next quote-like (in $next) 1172*0Sstevel@tonic-gate } 1173*0Sstevel@tonic-gate 1174*0Sstevel@tonic-gate 1175*0Sstevel@tonic-gate=head2 General behaviour in scalar and void contexts 1176*0Sstevel@tonic-gate 1177*0Sstevel@tonic-gateIn a scalar context, the extracted string is returned, having first been 1178*0Sstevel@tonic-gateremoved from the input text. Thus, the following code also processes 1179*0Sstevel@tonic-gateeach quote-like operation, but actually removes them from $text: 1180*0Sstevel@tonic-gate 1181*0Sstevel@tonic-gate while ( $next = extract_quotelike($text) ) 1182*0Sstevel@tonic-gate { 1183*0Sstevel@tonic-gate # process next quote-like (in $next) 1184*0Sstevel@tonic-gate } 1185*0Sstevel@tonic-gate 1186*0Sstevel@tonic-gateNote that if the input text is a read-only string (i.e. a literal), 1187*0Sstevel@tonic-gateno attempt is made to remove the extracted text. 1188*0Sstevel@tonic-gate 1189*0Sstevel@tonic-gateIn a void context the behaviour of the extraction subroutines is 1190*0Sstevel@tonic-gateexactly the same as in a scalar context, except (of course) that the 1191*0Sstevel@tonic-gateextracted substring is not returned. 1192*0Sstevel@tonic-gate 1193*0Sstevel@tonic-gate=head2 A note about prefixes 1194*0Sstevel@tonic-gate 1195*0Sstevel@tonic-gatePrefix patterns are matched without any trailing modifiers (C</gimsox> etc.) 1196*0Sstevel@tonic-gateThis can bite you if you're expecting a prefix specification like 1197*0Sstevel@tonic-gate'.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix 1198*0Sstevel@tonic-gatepattern will only succeed if the <H1> tag is on the current line, since 1199*0Sstevel@tonic-gate. normally doesn't match newlines. 1200*0Sstevel@tonic-gate 1201*0Sstevel@tonic-gateTo overcome this limitation, you need to turn on /s matching within 1202*0Sstevel@tonic-gatethe prefix pattern, using the C<(?s)> directive: '(?s).*?(?=<H1>)' 1203*0Sstevel@tonic-gate 1204*0Sstevel@tonic-gate 1205*0Sstevel@tonic-gate=head2 C<extract_delimited> 1206*0Sstevel@tonic-gate 1207*0Sstevel@tonic-gateThe C<extract_delimited> function formalizes the common idiom 1208*0Sstevel@tonic-gateof extracting a single-character-delimited substring from the start of 1209*0Sstevel@tonic-gatea string. For example, to extract a single-quote delimited string, the 1210*0Sstevel@tonic-gatefollowing code is typically used: 1211*0Sstevel@tonic-gate 1212*0Sstevel@tonic-gate ($remainder = $text) =~ s/\A('(\\.|[^'])*')//s; 1213*0Sstevel@tonic-gate $extracted = $1; 1214*0Sstevel@tonic-gate 1215*0Sstevel@tonic-gatebut with C<extract_delimited> it can be simplified to: 1216*0Sstevel@tonic-gate 1217*0Sstevel@tonic-gate ($extracted,$remainder) = extract_delimited($text, "'"); 1218*0Sstevel@tonic-gate 1219*0Sstevel@tonic-gateC<extract_delimited> takes up to four scalars (the input text, the 1220*0Sstevel@tonic-gatedelimiters, a prefix pattern to be skipped, and any escape characters) 1221*0Sstevel@tonic-gateand extracts the initial substring of the text that 1222*0Sstevel@tonic-gateis appropriately delimited. If the delimiter string has multiple 1223*0Sstevel@tonic-gatecharacters, the first one encountered in the text is taken to delimit 1224*0Sstevel@tonic-gatethe substring. 1225*0Sstevel@tonic-gateThe third argument specifies a prefix pattern that is to be skipped 1226*0Sstevel@tonic-gate(but must be present!) before the substring is extracted. 1227*0Sstevel@tonic-gateThe final argument specifies the escape character to be used for each 1228*0Sstevel@tonic-gatedelimiter. 1229*0Sstevel@tonic-gate 1230*0Sstevel@tonic-gateAll arguments are optional. If the escape characters are not specified, 1231*0Sstevel@tonic-gateevery delimiter is escaped with a backslash (C<\>). 1232*0Sstevel@tonic-gateIf the prefix is not specified, the 1233*0Sstevel@tonic-gatepattern C<'\s*'> - optional whitespace - is used. If the delimiter set 1234*0Sstevel@tonic-gateis also not specified, the set C</["'`]/> is used. If the text to be processed 1235*0Sstevel@tonic-gateis not specified either, C<$_> is used. 1236*0Sstevel@tonic-gate 1237*0Sstevel@tonic-gateIn list context, C<extract_delimited> returns a array of three 1238*0Sstevel@tonic-gateelements, the extracted substring (I<including the surrounding 1239*0Sstevel@tonic-gatedelimiters>), the remainder of the text, and the skipped prefix (if 1240*0Sstevel@tonic-gateany). If a suitable delimited substring is not found, the first 1241*0Sstevel@tonic-gateelement of the array is the empty string, the second is the complete 1242*0Sstevel@tonic-gateoriginal text, and the prefix returned in the third element is an 1243*0Sstevel@tonic-gateempty string. 1244*0Sstevel@tonic-gate 1245*0Sstevel@tonic-gateIn a scalar context, just the extracted substring is returned. In 1246*0Sstevel@tonic-gatea void context, the extracted substring (and any prefix) are simply 1247*0Sstevel@tonic-gateremoved from the beginning of the first argument. 1248*0Sstevel@tonic-gate 1249*0Sstevel@tonic-gateExamples: 1250*0Sstevel@tonic-gate 1251*0Sstevel@tonic-gate # Remove a single-quoted substring from the very beginning of $text: 1252*0Sstevel@tonic-gate 1253*0Sstevel@tonic-gate $substring = extract_delimited($text, "'", ''); 1254*0Sstevel@tonic-gate 1255*0Sstevel@tonic-gate # Remove a single-quoted Pascalish substring (i.e. one in which 1256*0Sstevel@tonic-gate # doubling the quote character escapes it) from the very 1257*0Sstevel@tonic-gate # beginning of $text: 1258*0Sstevel@tonic-gate 1259*0Sstevel@tonic-gate $substring = extract_delimited($text, "'", '', "'"); 1260*0Sstevel@tonic-gate 1261*0Sstevel@tonic-gate # Extract a single- or double- quoted substring from the 1262*0Sstevel@tonic-gate # beginning of $text, optionally after some whitespace 1263*0Sstevel@tonic-gate # (note the list context to protect $text from modification): 1264*0Sstevel@tonic-gate 1265*0Sstevel@tonic-gate ($substring) = extract_delimited $text, q{"'}; 1266*0Sstevel@tonic-gate 1267*0Sstevel@tonic-gate 1268*0Sstevel@tonic-gate # Delete the substring delimited by the first '/' in $text: 1269*0Sstevel@tonic-gate 1270*0Sstevel@tonic-gate $text = join '', (extract_delimited($text,'/','[^/]*')[2,1]; 1271*0Sstevel@tonic-gate 1272*0Sstevel@tonic-gateNote that this last example is I<not> the same as deleting the first 1273*0Sstevel@tonic-gatequote-like pattern. For instance, if C<$text> contained the string: 1274*0Sstevel@tonic-gate 1275*0Sstevel@tonic-gate "if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }" 1276*0Sstevel@tonic-gate 1277*0Sstevel@tonic-gatethen after the deletion it would contain: 1278*0Sstevel@tonic-gate 1279*0Sstevel@tonic-gate "if ('.$UNIXCMD/s) { $cmd = $1; }" 1280*0Sstevel@tonic-gate 1281*0Sstevel@tonic-gatenot: 1282*0Sstevel@tonic-gate 1283*0Sstevel@tonic-gate "if ('./cmd' =~ ms) { $cmd = $1; }" 1284*0Sstevel@tonic-gate 1285*0Sstevel@tonic-gate 1286*0Sstevel@tonic-gateSee L<"extract_quotelike"> for a (partial) solution to this problem. 1287*0Sstevel@tonic-gate 1288*0Sstevel@tonic-gate 1289*0Sstevel@tonic-gate=head2 C<extract_bracketed> 1290*0Sstevel@tonic-gate 1291*0Sstevel@tonic-gateLike C<"extract_delimited">, the C<extract_bracketed> function takes 1292*0Sstevel@tonic-gateup to three optional scalar arguments: a string to extract from, a delimiter 1293*0Sstevel@tonic-gatespecifier, and a prefix pattern. As before, a missing prefix defaults to 1294*0Sstevel@tonic-gateoptional whitespace and a missing text defaults to C<$_>. However, a missing 1295*0Sstevel@tonic-gatedelimiter specifier defaults to C<'{}()[]E<lt>E<gt>'> (see below). 1296*0Sstevel@tonic-gate 1297*0Sstevel@tonic-gateC<extract_bracketed> extracts a balanced-bracket-delimited 1298*0Sstevel@tonic-gatesubstring (using any one (or more) of the user-specified delimiter 1299*0Sstevel@tonic-gatebrackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also 1300*0Sstevel@tonic-gaterespect quoted unbalanced brackets (see below). 1301*0Sstevel@tonic-gate 1302*0Sstevel@tonic-gateA "delimiter bracket" is a bracket in list of delimiters passed as 1303*0Sstevel@tonic-gateC<extract_bracketed>'s second argument. Delimiter brackets are 1304*0Sstevel@tonic-gatespecified by giving either the left or right (or both!) versions 1305*0Sstevel@tonic-gateof the required bracket(s). Note that the order in which 1306*0Sstevel@tonic-gatetwo or more delimiter brackets are specified is not significant. 1307*0Sstevel@tonic-gate 1308*0Sstevel@tonic-gateA "balanced-bracket-delimited substring" is a substring bounded by 1309*0Sstevel@tonic-gatematched brackets, such that any other (left or right) delimiter 1310*0Sstevel@tonic-gatebracket I<within> the substring is also matched by an opposite 1311*0Sstevel@tonic-gate(right or left) delimiter bracket I<at the same level of nesting>. Any 1312*0Sstevel@tonic-gatetype of bracket not in the delimiter list is treated as an ordinary 1313*0Sstevel@tonic-gatecharacter. 1314*0Sstevel@tonic-gate 1315*0Sstevel@tonic-gateIn other words, each type of bracket specified as a delimiter must be 1316*0Sstevel@tonic-gatebalanced and correctly nested within the substring, and any other kind of 1317*0Sstevel@tonic-gate("non-delimiter") bracket in the substring is ignored. 1318*0Sstevel@tonic-gate 1319*0Sstevel@tonic-gateFor example, given the string: 1320*0Sstevel@tonic-gate 1321*0Sstevel@tonic-gate $text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }"; 1322*0Sstevel@tonic-gate 1323*0Sstevel@tonic-gatethen a call to C<extract_bracketed> in a list context: 1324*0Sstevel@tonic-gate 1325*0Sstevel@tonic-gate @result = extract_bracketed( $text, '{}' ); 1326*0Sstevel@tonic-gate 1327*0Sstevel@tonic-gatewould return: 1328*0Sstevel@tonic-gate 1329*0Sstevel@tonic-gate ( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" ) 1330*0Sstevel@tonic-gate 1331*0Sstevel@tonic-gatesince both sets of C<'{..}'> brackets are properly nested and evenly balanced. 1332*0Sstevel@tonic-gate(In a scalar context just the first element of the array would be returned. In 1333*0Sstevel@tonic-gatea void context, C<$text> would be replaced by an empty string.) 1334*0Sstevel@tonic-gate 1335*0Sstevel@tonic-gateLikewise the call in: 1336*0Sstevel@tonic-gate 1337*0Sstevel@tonic-gate @result = extract_bracketed( $text, '{[' ); 1338*0Sstevel@tonic-gate 1339*0Sstevel@tonic-gatewould return the same result, since all sets of both types of specified 1340*0Sstevel@tonic-gatedelimiter brackets are correctly nested and balanced. 1341*0Sstevel@tonic-gate 1342*0Sstevel@tonic-gateHowever, the call in: 1343*0Sstevel@tonic-gate 1344*0Sstevel@tonic-gate @result = extract_bracketed( $text, '{([<' ); 1345*0Sstevel@tonic-gate 1346*0Sstevel@tonic-gatewould fail, returning: 1347*0Sstevel@tonic-gate 1348*0Sstevel@tonic-gate ( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }" ); 1349*0Sstevel@tonic-gate 1350*0Sstevel@tonic-gatebecause the embedded pairs of C<'(..)'>s and C<'[..]'>s are "cross-nested" and 1351*0Sstevel@tonic-gatethe embedded C<'E<gt>'> is unbalanced. (In a scalar context, this call would 1352*0Sstevel@tonic-gatereturn an empty string. In a void context, C<$text> would be unchanged.) 1353*0Sstevel@tonic-gate 1354*0Sstevel@tonic-gateNote that the embedded single-quotes in the string don't help in this 1355*0Sstevel@tonic-gatecase, since they have not been specified as acceptable delimiters and are 1356*0Sstevel@tonic-gatetherefore treated as non-delimiter characters (and ignored). 1357*0Sstevel@tonic-gate 1358*0Sstevel@tonic-gateHowever, if a particular species of quote character is included in the 1359*0Sstevel@tonic-gatedelimiter specification, then that type of quote will be correctly handled. 1360*0Sstevel@tonic-gatefor example, if C<$text> is: 1361*0Sstevel@tonic-gate 1362*0Sstevel@tonic-gate $text = '<A HREF=">>>>">link</A>'; 1363*0Sstevel@tonic-gate 1364*0Sstevel@tonic-gatethen 1365*0Sstevel@tonic-gate 1366*0Sstevel@tonic-gate @result = extract_bracketed( $text, '<">' ); 1367*0Sstevel@tonic-gate 1368*0Sstevel@tonic-gatereturns: 1369*0Sstevel@tonic-gate 1370*0Sstevel@tonic-gate ( '<A HREF=">>>>">', 'link</A>', "" ) 1371*0Sstevel@tonic-gate 1372*0Sstevel@tonic-gateas expected. Without the specification of C<"> as an embedded quoter: 1373*0Sstevel@tonic-gate 1374*0Sstevel@tonic-gate @result = extract_bracketed( $text, '<>' ); 1375*0Sstevel@tonic-gate 1376*0Sstevel@tonic-gatethe result would be: 1377*0Sstevel@tonic-gate 1378*0Sstevel@tonic-gate ( '<A HREF=">', '>>>">link</A>', "" ) 1379*0Sstevel@tonic-gate 1380*0Sstevel@tonic-gateIn addition to the quote delimiters C<'>, C<">, and C<`>, full Perl quote-like 1381*0Sstevel@tonic-gatequoting (i.e. q{string}, qq{string}, etc) can be specified by including the 1382*0Sstevel@tonic-gateletter 'q' as a delimiter. Hence: 1383*0Sstevel@tonic-gate 1384*0Sstevel@tonic-gate @result = extract_bracketed( $text, '<q>' ); 1385*0Sstevel@tonic-gate 1386*0Sstevel@tonic-gatewould correctly match something like this: 1387*0Sstevel@tonic-gate 1388*0Sstevel@tonic-gate $text = '<leftop: conj /and/ conj>'; 1389*0Sstevel@tonic-gate 1390*0Sstevel@tonic-gateSee also: C<"extract_quotelike"> and C<"extract_codeblock">. 1391*0Sstevel@tonic-gate 1392*0Sstevel@tonic-gate 1393*0Sstevel@tonic-gate=head2 C<extract_variable> 1394*0Sstevel@tonic-gate 1395*0Sstevel@tonic-gateC<extract_variable> extracts any valid Perl variable or 1396*0Sstevel@tonic-gatevariable-involved expression, including scalars, arrays, hashes, array 1397*0Sstevel@tonic-gateaccesses, hash look-ups, method calls through objects, subroutine calles 1398*0Sstevel@tonic-gatethrough subroutine references, etc. 1399*0Sstevel@tonic-gate 1400*0Sstevel@tonic-gateThe subroutine takes up to two optional arguments: 1401*0Sstevel@tonic-gate 1402*0Sstevel@tonic-gate=over 4 1403*0Sstevel@tonic-gate 1404*0Sstevel@tonic-gate=item 1. 1405*0Sstevel@tonic-gate 1406*0Sstevel@tonic-gateA string to be processed (C<$_> if the string is omitted or C<undef>) 1407*0Sstevel@tonic-gate 1408*0Sstevel@tonic-gate=item 2. 1409*0Sstevel@tonic-gate 1410*0Sstevel@tonic-gateA string specifying a pattern to be matched as a prefix (which is to be 1411*0Sstevel@tonic-gateskipped). If omitted, optional whitespace is skipped. 1412*0Sstevel@tonic-gate 1413*0Sstevel@tonic-gate=back 1414*0Sstevel@tonic-gate 1415*0Sstevel@tonic-gateOn success in a list context, an array of 3 elements is returned. The 1416*0Sstevel@tonic-gateelements are: 1417*0Sstevel@tonic-gate 1418*0Sstevel@tonic-gate=over 4 1419*0Sstevel@tonic-gate 1420*0Sstevel@tonic-gate=item [0] 1421*0Sstevel@tonic-gate 1422*0Sstevel@tonic-gatethe extracted variable, or variablish expression 1423*0Sstevel@tonic-gate 1424*0Sstevel@tonic-gate=item [1] 1425*0Sstevel@tonic-gate 1426*0Sstevel@tonic-gatethe remainder of the input text, 1427*0Sstevel@tonic-gate 1428*0Sstevel@tonic-gate=item [2] 1429*0Sstevel@tonic-gate 1430*0Sstevel@tonic-gatethe prefix substring (if any), 1431*0Sstevel@tonic-gate 1432*0Sstevel@tonic-gate=back 1433*0Sstevel@tonic-gate 1434*0Sstevel@tonic-gateOn failure, all of these values (except the remaining text) are C<undef>. 1435*0Sstevel@tonic-gate 1436*0Sstevel@tonic-gateIn a scalar context, C<extract_variable> returns just the complete 1437*0Sstevel@tonic-gatesubstring that matched a variablish expression. C<undef> is returned on 1438*0Sstevel@tonic-gatefailure. In addition, the original input text has the returned substring 1439*0Sstevel@tonic-gate(and any prefix) removed from it. 1440*0Sstevel@tonic-gate 1441*0Sstevel@tonic-gateIn a void context, the input text just has the matched substring (and 1442*0Sstevel@tonic-gateany specified prefix) removed. 1443*0Sstevel@tonic-gate 1444*0Sstevel@tonic-gate 1445*0Sstevel@tonic-gate=head2 C<extract_tagged> 1446*0Sstevel@tonic-gate 1447*0Sstevel@tonic-gateC<extract_tagged> extracts and segments text between (balanced) 1448*0Sstevel@tonic-gatespecified tags. 1449*0Sstevel@tonic-gate 1450*0Sstevel@tonic-gateThe subroutine takes up to five optional arguments: 1451*0Sstevel@tonic-gate 1452*0Sstevel@tonic-gate=over 4 1453*0Sstevel@tonic-gate 1454*0Sstevel@tonic-gate=item 1. 1455*0Sstevel@tonic-gate 1456*0Sstevel@tonic-gateA string to be processed (C<$_> if the string is omitted or C<undef>) 1457*0Sstevel@tonic-gate 1458*0Sstevel@tonic-gate=item 2. 1459*0Sstevel@tonic-gate 1460*0Sstevel@tonic-gateA string specifying a pattern to be matched as the opening tag. 1461*0Sstevel@tonic-gateIf the pattern string is omitted (or C<undef>) then a pattern 1462*0Sstevel@tonic-gatethat matches any standard XML tag is used. 1463*0Sstevel@tonic-gate 1464*0Sstevel@tonic-gate=item 3. 1465*0Sstevel@tonic-gate 1466*0Sstevel@tonic-gateA string specifying a pattern to be matched at the closing tag. 1467*0Sstevel@tonic-gateIf the pattern string is omitted (or C<undef>) then the closing 1468*0Sstevel@tonic-gatetag is constructed by inserting a C</> after any leading bracket 1469*0Sstevel@tonic-gatecharacters in the actual opening tag that was matched (I<not> the pattern 1470*0Sstevel@tonic-gatethat matched the tag). For example, if the opening tag pattern 1471*0Sstevel@tonic-gateis specified as C<'{{\w+}}'> and actually matched the opening tag 1472*0Sstevel@tonic-gateC<"{{DATA}}">, then the constructed closing tag would be C<"{{/DATA}}">. 1473*0Sstevel@tonic-gate 1474*0Sstevel@tonic-gate=item 4. 1475*0Sstevel@tonic-gate 1476*0Sstevel@tonic-gateA string specifying a pattern to be matched as a prefix (which is to be 1477*0Sstevel@tonic-gateskipped). If omitted, optional whitespace is skipped. 1478*0Sstevel@tonic-gate 1479*0Sstevel@tonic-gate=item 5. 1480*0Sstevel@tonic-gate 1481*0Sstevel@tonic-gateA hash reference containing various parsing options (see below) 1482*0Sstevel@tonic-gate 1483*0Sstevel@tonic-gate=back 1484*0Sstevel@tonic-gate 1485*0Sstevel@tonic-gateThe various options that can be specified are: 1486*0Sstevel@tonic-gate 1487*0Sstevel@tonic-gate=over 4 1488*0Sstevel@tonic-gate 1489*0Sstevel@tonic-gate=item C<reject =E<gt> $listref> 1490*0Sstevel@tonic-gate 1491*0Sstevel@tonic-gateThe list reference contains one or more strings specifying patterns 1492*0Sstevel@tonic-gatethat must I<not> appear within the tagged text. 1493*0Sstevel@tonic-gate 1494*0Sstevel@tonic-gateFor example, to extract 1495*0Sstevel@tonic-gatean HTML link (which should not contain nested links) use: 1496*0Sstevel@tonic-gate 1497*0Sstevel@tonic-gate extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} ); 1498*0Sstevel@tonic-gate 1499*0Sstevel@tonic-gate=item C<ignore =E<gt> $listref> 1500*0Sstevel@tonic-gate 1501*0Sstevel@tonic-gateThe list reference contains one or more strings specifying patterns 1502*0Sstevel@tonic-gatethat are I<not> be be treated as nested tags within the tagged text 1503*0Sstevel@tonic-gate(even if they would match the start tag pattern). 1504*0Sstevel@tonic-gate 1505*0Sstevel@tonic-gateFor example, to extract an arbitrary XML tag, but ignore "empty" elements: 1506*0Sstevel@tonic-gate 1507*0Sstevel@tonic-gate extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} ); 1508*0Sstevel@tonic-gate 1509*0Sstevel@tonic-gate(also see L<"gen_delimited_pat"> below). 1510*0Sstevel@tonic-gate 1511*0Sstevel@tonic-gate 1512*0Sstevel@tonic-gate=item C<fail =E<gt> $str> 1513*0Sstevel@tonic-gate 1514*0Sstevel@tonic-gateThe C<fail> option indicates the action to be taken if a matching end 1515*0Sstevel@tonic-gatetag is not encountered (i.e. before the end of the string or some 1516*0Sstevel@tonic-gateC<reject> pattern matches). By default, a failure to match a closing 1517*0Sstevel@tonic-gatetag causes C<extract_tagged> to immediately fail. 1518*0Sstevel@tonic-gate 1519*0Sstevel@tonic-gateHowever, if the string value associated with <reject> is "MAX", then 1520*0Sstevel@tonic-gateC<extract_tagged> returns the complete text up to the point of failure. 1521*0Sstevel@tonic-gateIf the string is "PARA", C<extract_tagged> returns only the first paragraph 1522*0Sstevel@tonic-gateafter the tag (up to the first line that is either empty or contains 1523*0Sstevel@tonic-gateonly whitespace characters). 1524*0Sstevel@tonic-gateIf the string is "", the the default behaviour (i.e. failure) is reinstated. 1525*0Sstevel@tonic-gate 1526*0Sstevel@tonic-gateFor example, suppose the start tag "/para" introduces a paragraph, which then 1527*0Sstevel@tonic-gatecontinues until the next "/endpara" tag or until another "/para" tag is 1528*0Sstevel@tonic-gateencountered: 1529*0Sstevel@tonic-gate 1530*0Sstevel@tonic-gate $text = "/para line 1\n\nline 3\n/para line 4"; 1531*0Sstevel@tonic-gate 1532*0Sstevel@tonic-gate extract_tagged($text, '/para', '/endpara', undef, 1533*0Sstevel@tonic-gate {reject => '/para', fail => MAX ); 1534*0Sstevel@tonic-gate 1535*0Sstevel@tonic-gate # EXTRACTED: "/para line 1\n\nline 3\n" 1536*0Sstevel@tonic-gate 1537*0Sstevel@tonic-gateSuppose instead, that if no matching "/endpara" tag is found, the "/para" 1538*0Sstevel@tonic-gatetag refers only to the immediately following paragraph: 1539*0Sstevel@tonic-gate 1540*0Sstevel@tonic-gate $text = "/para line 1\n\nline 3\n/para line 4"; 1541*0Sstevel@tonic-gate 1542*0Sstevel@tonic-gate extract_tagged($text, '/para', '/endpara', undef, 1543*0Sstevel@tonic-gate {reject => '/para', fail => MAX ); 1544*0Sstevel@tonic-gate 1545*0Sstevel@tonic-gate # EXTRACTED: "/para line 1\n" 1546*0Sstevel@tonic-gate 1547*0Sstevel@tonic-gateNote that the specified C<fail> behaviour applies to nested tags as well. 1548*0Sstevel@tonic-gate 1549*0Sstevel@tonic-gate=back 1550*0Sstevel@tonic-gate 1551*0Sstevel@tonic-gateOn success in a list context, an array of 6 elements is returned. The elements are: 1552*0Sstevel@tonic-gate 1553*0Sstevel@tonic-gate=over 4 1554*0Sstevel@tonic-gate 1555*0Sstevel@tonic-gate=item [0] 1556*0Sstevel@tonic-gate 1557*0Sstevel@tonic-gatethe extracted tagged substring (including the outermost tags), 1558*0Sstevel@tonic-gate 1559*0Sstevel@tonic-gate=item [1] 1560*0Sstevel@tonic-gate 1561*0Sstevel@tonic-gatethe remainder of the input text, 1562*0Sstevel@tonic-gate 1563*0Sstevel@tonic-gate=item [2] 1564*0Sstevel@tonic-gate 1565*0Sstevel@tonic-gatethe prefix substring (if any), 1566*0Sstevel@tonic-gate 1567*0Sstevel@tonic-gate=item [3] 1568*0Sstevel@tonic-gate 1569*0Sstevel@tonic-gatethe opening tag 1570*0Sstevel@tonic-gate 1571*0Sstevel@tonic-gate=item [4] 1572*0Sstevel@tonic-gate 1573*0Sstevel@tonic-gatethe text between the opening and closing tags 1574*0Sstevel@tonic-gate 1575*0Sstevel@tonic-gate=item [5] 1576*0Sstevel@tonic-gate 1577*0Sstevel@tonic-gatethe closing tag (or "" if no closing tag was found) 1578*0Sstevel@tonic-gate 1579*0Sstevel@tonic-gate=back 1580*0Sstevel@tonic-gate 1581*0Sstevel@tonic-gateOn failure, all of these values (except the remaining text) are C<undef>. 1582*0Sstevel@tonic-gate 1583*0Sstevel@tonic-gateIn a scalar context, C<extract_tagged> returns just the complete 1584*0Sstevel@tonic-gatesubstring that matched a tagged text (including the start and end 1585*0Sstevel@tonic-gatetags). C<undef> is returned on failure. In addition, the original input 1586*0Sstevel@tonic-gatetext has the returned substring (and any prefix) removed from it. 1587*0Sstevel@tonic-gate 1588*0Sstevel@tonic-gateIn a void context, the input text just has the matched substring (and 1589*0Sstevel@tonic-gateany specified prefix) removed. 1590*0Sstevel@tonic-gate 1591*0Sstevel@tonic-gate 1592*0Sstevel@tonic-gate=head2 C<gen_extract_tagged> 1593*0Sstevel@tonic-gate 1594*0Sstevel@tonic-gate(Note: This subroutine is only available under Perl5.005) 1595*0Sstevel@tonic-gate 1596*0Sstevel@tonic-gateC<gen_extract_tagged> generates a new anonymous subroutine which 1597*0Sstevel@tonic-gateextracts text between (balanced) specified tags. In other words, 1598*0Sstevel@tonic-gateit generates a function identical in function to C<extract_tagged>. 1599*0Sstevel@tonic-gate 1600*0Sstevel@tonic-gateThe difference between C<extract_tagged> and the anonymous 1601*0Sstevel@tonic-gatesubroutines generated by 1602*0Sstevel@tonic-gateC<gen_extract_tagged>, is that those generated subroutines: 1603*0Sstevel@tonic-gate 1604*0Sstevel@tonic-gate=over 4 1605*0Sstevel@tonic-gate 1606*0Sstevel@tonic-gate=item * 1607*0Sstevel@tonic-gate 1608*0Sstevel@tonic-gatedo not have to reparse tag specification or parsing options every time 1609*0Sstevel@tonic-gatethey are called (whereas C<extract_tagged> has to effectively rebuild 1610*0Sstevel@tonic-gateits tag parser on every call); 1611*0Sstevel@tonic-gate 1612*0Sstevel@tonic-gate=item * 1613*0Sstevel@tonic-gate 1614*0Sstevel@tonic-gatemake use of the new qr// construct to pre-compile the regexes they use 1615*0Sstevel@tonic-gate(whereas C<extract_tagged> uses standard string variable interpolation 1616*0Sstevel@tonic-gateto create tag-matching patterns). 1617*0Sstevel@tonic-gate 1618*0Sstevel@tonic-gate=back 1619*0Sstevel@tonic-gate 1620*0Sstevel@tonic-gateThe subroutine takes up to four optional arguments (the same set as 1621*0Sstevel@tonic-gateC<extract_tagged> except for the string to be processed). It returns 1622*0Sstevel@tonic-gatea reference to a subroutine which in turn takes a single argument (the text to 1623*0Sstevel@tonic-gatebe extracted from). 1624*0Sstevel@tonic-gate 1625*0Sstevel@tonic-gateIn other words, the implementation of C<extract_tagged> is exactly 1626*0Sstevel@tonic-gateequivalent to: 1627*0Sstevel@tonic-gate 1628*0Sstevel@tonic-gate sub extract_tagged 1629*0Sstevel@tonic-gate { 1630*0Sstevel@tonic-gate my $text = shift; 1631*0Sstevel@tonic-gate $extractor = gen_extract_tagged(@_); 1632*0Sstevel@tonic-gate return $extractor->($text); 1633*0Sstevel@tonic-gate } 1634*0Sstevel@tonic-gate 1635*0Sstevel@tonic-gate(although C<extract_tagged> is not currently implemented that way, in order 1636*0Sstevel@tonic-gateto preserve pre-5.005 compatibility). 1637*0Sstevel@tonic-gate 1638*0Sstevel@tonic-gateUsing C<gen_extract_tagged> to create extraction functions for specific tags 1639*0Sstevel@tonic-gateis a good idea if those functions are going to be called more than once, since 1640*0Sstevel@tonic-gatetheir performance is typically twice as good as the more general-purpose 1641*0Sstevel@tonic-gateC<extract_tagged>. 1642*0Sstevel@tonic-gate 1643*0Sstevel@tonic-gate 1644*0Sstevel@tonic-gate=head2 C<extract_quotelike> 1645*0Sstevel@tonic-gate 1646*0Sstevel@tonic-gateC<extract_quotelike> attempts to recognize, extract, and segment any 1647*0Sstevel@tonic-gateone of the various Perl quotes and quotelike operators (see 1648*0Sstevel@tonic-gateL<perlop(3)>) Nested backslashed delimiters, embedded balanced bracket 1649*0Sstevel@tonic-gatedelimiters (for the quotelike operators), and trailing modifiers are 1650*0Sstevel@tonic-gateall caught. For example, in: 1651*0Sstevel@tonic-gate 1652*0Sstevel@tonic-gate extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #' 1653*0Sstevel@tonic-gate 1654*0Sstevel@tonic-gate extract_quotelike ' "You said, \"Use sed\"." ' 1655*0Sstevel@tonic-gate 1656*0Sstevel@tonic-gate extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; ' 1657*0Sstevel@tonic-gate 1658*0Sstevel@tonic-gate extract_quotelike ' tr/\\\/\\\\/\\\//ds; ' 1659*0Sstevel@tonic-gate 1660*0Sstevel@tonic-gatethe full Perl quotelike operations are all extracted correctly. 1661*0Sstevel@tonic-gate 1662*0Sstevel@tonic-gateNote too that, when using the /x modifier on a regex, any comment 1663*0Sstevel@tonic-gatecontaining the current pattern delimiter will cause the regex to be 1664*0Sstevel@tonic-gateimmediately terminated. In other words: 1665*0Sstevel@tonic-gate 1666*0Sstevel@tonic-gate 'm / 1667*0Sstevel@tonic-gate (?i) # CASE INSENSITIVE 1668*0Sstevel@tonic-gate [a-z_] # LEADING ALPHABETIC/UNDERSCORE 1669*0Sstevel@tonic-gate [a-z0-9]* # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS 1670*0Sstevel@tonic-gate /x' 1671*0Sstevel@tonic-gate 1672*0Sstevel@tonic-gatewill be extracted as if it were: 1673*0Sstevel@tonic-gate 1674*0Sstevel@tonic-gate 'm / 1675*0Sstevel@tonic-gate (?i) # CASE INSENSITIVE 1676*0Sstevel@tonic-gate [a-z_] # LEADING ALPHABETIC/' 1677*0Sstevel@tonic-gate 1678*0Sstevel@tonic-gateThis behaviour is identical to that of the actual compiler. 1679*0Sstevel@tonic-gate 1680*0Sstevel@tonic-gateC<extract_quotelike> takes two arguments: the text to be processed and 1681*0Sstevel@tonic-gatea prefix to be matched at the very beginning of the text. If no prefix 1682*0Sstevel@tonic-gateis specified, optional whitespace is the default. If no text is given, 1683*0Sstevel@tonic-gateC<$_> is used. 1684*0Sstevel@tonic-gate 1685*0Sstevel@tonic-gateIn a list context, an array of 11 elements is returned. The elements are: 1686*0Sstevel@tonic-gate 1687*0Sstevel@tonic-gate=over 4 1688*0Sstevel@tonic-gate 1689*0Sstevel@tonic-gate=item [0] 1690*0Sstevel@tonic-gate 1691*0Sstevel@tonic-gatethe extracted quotelike substring (including trailing modifiers), 1692*0Sstevel@tonic-gate 1693*0Sstevel@tonic-gate=item [1] 1694*0Sstevel@tonic-gate 1695*0Sstevel@tonic-gatethe remainder of the input text, 1696*0Sstevel@tonic-gate 1697*0Sstevel@tonic-gate=item [2] 1698*0Sstevel@tonic-gate 1699*0Sstevel@tonic-gatethe prefix substring (if any), 1700*0Sstevel@tonic-gate 1701*0Sstevel@tonic-gate=item [3] 1702*0Sstevel@tonic-gate 1703*0Sstevel@tonic-gatethe name of the quotelike operator (if any), 1704*0Sstevel@tonic-gate 1705*0Sstevel@tonic-gate=item [4] 1706*0Sstevel@tonic-gate 1707*0Sstevel@tonic-gatethe left delimiter of the first block of the operation, 1708*0Sstevel@tonic-gate 1709*0Sstevel@tonic-gate=item [5] 1710*0Sstevel@tonic-gate 1711*0Sstevel@tonic-gatethe text of the first block of the operation 1712*0Sstevel@tonic-gate(that is, the contents of 1713*0Sstevel@tonic-gatea quote, the regex of a match or substitution or the target list of a 1714*0Sstevel@tonic-gatetranslation), 1715*0Sstevel@tonic-gate 1716*0Sstevel@tonic-gate=item [6] 1717*0Sstevel@tonic-gate 1718*0Sstevel@tonic-gatethe right delimiter of the first block of the operation, 1719*0Sstevel@tonic-gate 1720*0Sstevel@tonic-gate=item [7] 1721*0Sstevel@tonic-gate 1722*0Sstevel@tonic-gatethe left delimiter of the second block of the operation 1723*0Sstevel@tonic-gate(that is, if it is a C<s>, C<tr>, or C<y>), 1724*0Sstevel@tonic-gate 1725*0Sstevel@tonic-gate=item [8] 1726*0Sstevel@tonic-gate 1727*0Sstevel@tonic-gatethe text of the second block of the operation 1728*0Sstevel@tonic-gate(that is, the replacement of a substitution or the translation list 1729*0Sstevel@tonic-gateof a translation), 1730*0Sstevel@tonic-gate 1731*0Sstevel@tonic-gate=item [9] 1732*0Sstevel@tonic-gate 1733*0Sstevel@tonic-gatethe right delimiter of the second block of the operation (if any), 1734*0Sstevel@tonic-gate 1735*0Sstevel@tonic-gate=item [10] 1736*0Sstevel@tonic-gate 1737*0Sstevel@tonic-gatethe trailing modifiers on the operation (if any). 1738*0Sstevel@tonic-gate 1739*0Sstevel@tonic-gate=back 1740*0Sstevel@tonic-gate 1741*0Sstevel@tonic-gateFor each of the fields marked "(if any)" the default value on success is 1742*0Sstevel@tonic-gatean empty string. 1743*0Sstevel@tonic-gateOn failure, all of these values (except the remaining text) are C<undef>. 1744*0Sstevel@tonic-gate 1745*0Sstevel@tonic-gate 1746*0Sstevel@tonic-gateIn a scalar context, C<extract_quotelike> returns just the complete substring 1747*0Sstevel@tonic-gatethat matched a quotelike operation (or C<undef> on failure). In a scalar or 1748*0Sstevel@tonic-gatevoid context, the input text has the same substring (and any specified 1749*0Sstevel@tonic-gateprefix) removed. 1750*0Sstevel@tonic-gate 1751*0Sstevel@tonic-gateExamples: 1752*0Sstevel@tonic-gate 1753*0Sstevel@tonic-gate # Remove the first quotelike literal that appears in text 1754*0Sstevel@tonic-gate 1755*0Sstevel@tonic-gate $quotelike = extract_quotelike($text,'.*?'); 1756*0Sstevel@tonic-gate 1757*0Sstevel@tonic-gate # Replace one or more leading whitespace-separated quotelike 1758*0Sstevel@tonic-gate # literals in $_ with "<QLL>" 1759*0Sstevel@tonic-gate 1760*0Sstevel@tonic-gate do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@; 1761*0Sstevel@tonic-gate 1762*0Sstevel@tonic-gate 1763*0Sstevel@tonic-gate # Isolate the search pattern in a quotelike operation from $text 1764*0Sstevel@tonic-gate 1765*0Sstevel@tonic-gate ($op,$pat) = (extract_quotelike $text)[3,5]; 1766*0Sstevel@tonic-gate if ($op =~ /[ms]/) 1767*0Sstevel@tonic-gate { 1768*0Sstevel@tonic-gate print "search pattern: $pat\n"; 1769*0Sstevel@tonic-gate } 1770*0Sstevel@tonic-gate else 1771*0Sstevel@tonic-gate { 1772*0Sstevel@tonic-gate print "$op is not a pattern matching operation\n"; 1773*0Sstevel@tonic-gate } 1774*0Sstevel@tonic-gate 1775*0Sstevel@tonic-gate 1776*0Sstevel@tonic-gate=head2 C<extract_quotelike> and "here documents" 1777*0Sstevel@tonic-gate 1778*0Sstevel@tonic-gateC<extract_quotelike> can successfully extract "here documents" from an input 1779*0Sstevel@tonic-gatestring, but with an important caveat in list contexts. 1780*0Sstevel@tonic-gate 1781*0Sstevel@tonic-gateUnlike other types of quote-like literals, a here document is rarely 1782*0Sstevel@tonic-gatea contiguous substring. For example, a typical piece of code using 1783*0Sstevel@tonic-gatehere document might look like this: 1784*0Sstevel@tonic-gate 1785*0Sstevel@tonic-gate <<'EOMSG' || die; 1786*0Sstevel@tonic-gate This is the message. 1787*0Sstevel@tonic-gate EOMSG 1788*0Sstevel@tonic-gate exit; 1789*0Sstevel@tonic-gate 1790*0Sstevel@tonic-gateGiven this as an input string in a scalar context, C<extract_quotelike> 1791*0Sstevel@tonic-gatewould correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG", 1792*0Sstevel@tonic-gateleaving the string " || die;\nexit;" in the original variable. In other words, 1793*0Sstevel@tonic-gatethe two separate pieces of the here document are successfully extracted and 1794*0Sstevel@tonic-gateconcatenated. 1795*0Sstevel@tonic-gate 1796*0Sstevel@tonic-gateIn a list context, C<extract_quotelike> would return the list 1797*0Sstevel@tonic-gate 1798*0Sstevel@tonic-gate=over 4 1799*0Sstevel@tonic-gate 1800*0Sstevel@tonic-gate=item [0] 1801*0Sstevel@tonic-gate 1802*0Sstevel@tonic-gate"<<'EOMSG'\nThis is the message.\nEOMSG\n" (i.e. the full extracted here document, 1803*0Sstevel@tonic-gateincluding fore and aft delimiters), 1804*0Sstevel@tonic-gate 1805*0Sstevel@tonic-gate=item [1] 1806*0Sstevel@tonic-gate 1807*0Sstevel@tonic-gate" || die;\nexit;" (i.e. the remainder of the input text, concatenated), 1808*0Sstevel@tonic-gate 1809*0Sstevel@tonic-gate=item [2] 1810*0Sstevel@tonic-gate 1811*0Sstevel@tonic-gate"" (i.e. the prefix substring -- trivial in this case), 1812*0Sstevel@tonic-gate 1813*0Sstevel@tonic-gate=item [3] 1814*0Sstevel@tonic-gate 1815*0Sstevel@tonic-gate"<<" (i.e. the "name" of the quotelike operator) 1816*0Sstevel@tonic-gate 1817*0Sstevel@tonic-gate=item [4] 1818*0Sstevel@tonic-gate 1819*0Sstevel@tonic-gate"'EOMSG'" (i.e. the left delimiter of the here document, including any quotes), 1820*0Sstevel@tonic-gate 1821*0Sstevel@tonic-gate=item [5] 1822*0Sstevel@tonic-gate 1823*0Sstevel@tonic-gate"This is the message.\n" (i.e. the text of the here document), 1824*0Sstevel@tonic-gate 1825*0Sstevel@tonic-gate=item [6] 1826*0Sstevel@tonic-gate 1827*0Sstevel@tonic-gate"EOMSG" (i.e. the right delimiter of the here document), 1828*0Sstevel@tonic-gate 1829*0Sstevel@tonic-gate=item [7..10] 1830*0Sstevel@tonic-gate 1831*0Sstevel@tonic-gate"" (a here document has no second left delimiter, second text, second right 1832*0Sstevel@tonic-gatedelimiter, or trailing modifiers). 1833*0Sstevel@tonic-gate 1834*0Sstevel@tonic-gate=back 1835*0Sstevel@tonic-gate 1836*0Sstevel@tonic-gateHowever, the matching position of the input variable would be set to 1837*0Sstevel@tonic-gate"exit;" (i.e. I<after> the closing delimiter of the here document), 1838*0Sstevel@tonic-gatewhich would cause the earlier " || die;\nexit;" to be skipped in any 1839*0Sstevel@tonic-gatesequence of code fragment extractions. 1840*0Sstevel@tonic-gate 1841*0Sstevel@tonic-gateTo avoid this problem, when it encounters a here document whilst 1842*0Sstevel@tonic-gateextracting from a modifiable string, C<extract_quotelike> silently 1843*0Sstevel@tonic-gaterearranges the string to an equivalent piece of Perl: 1844*0Sstevel@tonic-gate 1845*0Sstevel@tonic-gate <<'EOMSG' 1846*0Sstevel@tonic-gate This is the message. 1847*0Sstevel@tonic-gate EOMSG 1848*0Sstevel@tonic-gate || die; 1849*0Sstevel@tonic-gate exit; 1850*0Sstevel@tonic-gate 1851*0Sstevel@tonic-gatein which the here document I<is> contiguous. It still leaves the 1852*0Sstevel@tonic-gatematching position after the here document, but now the rest of the line 1853*0Sstevel@tonic-gateon which the here document starts is not skipped. 1854*0Sstevel@tonic-gate 1855*0Sstevel@tonic-gateTo prevent <extract_quotelike> from mucking about with the input in this way 1856*0Sstevel@tonic-gate(this is the only case where a list-context C<extract_quotelike> does so), 1857*0Sstevel@tonic-gateyou can pass the input variable as an interpolated literal: 1858*0Sstevel@tonic-gate 1859*0Sstevel@tonic-gate $quotelike = extract_quotelike("$var"); 1860*0Sstevel@tonic-gate 1861*0Sstevel@tonic-gate 1862*0Sstevel@tonic-gate=head2 C<extract_codeblock> 1863*0Sstevel@tonic-gate 1864*0Sstevel@tonic-gateC<extract_codeblock> attempts to recognize and extract a balanced 1865*0Sstevel@tonic-gatebracket delimited substring that may contain unbalanced brackets 1866*0Sstevel@tonic-gateinside Perl quotes or quotelike operations. That is, C<extract_codeblock> 1867*0Sstevel@tonic-gateis like a combination of C<"extract_bracketed"> and 1868*0Sstevel@tonic-gateC<"extract_quotelike">. 1869*0Sstevel@tonic-gate 1870*0Sstevel@tonic-gateC<extract_codeblock> takes the same initial three parameters as C<extract_bracketed>: 1871*0Sstevel@tonic-gatea text to process, a set of delimiter brackets to look for, and a prefix to 1872*0Sstevel@tonic-gatematch first. It also takes an optional fourth parameter, which allows the 1873*0Sstevel@tonic-gateoutermost delimiter brackets to be specified separately (see below). 1874*0Sstevel@tonic-gate 1875*0Sstevel@tonic-gateOmitting the first argument (input text) means process C<$_> instead. 1876*0Sstevel@tonic-gateOmitting the second argument (delimiter brackets) indicates that only C<'{'> is to be used. 1877*0Sstevel@tonic-gateOmitting the third argument (prefix argument) implies optional whitespace at the start. 1878*0Sstevel@tonic-gateOmitting the fourth argument (outermost delimiter brackets) indicates that the 1879*0Sstevel@tonic-gatevalue of the second argument is to be used for the outermost delimiters. 1880*0Sstevel@tonic-gate 1881*0Sstevel@tonic-gateOnce the prefix an dthe outermost opening delimiter bracket have been 1882*0Sstevel@tonic-gaterecognized, code blocks are extracted by stepping through the input text and 1883*0Sstevel@tonic-gatetrying the following alternatives in sequence: 1884*0Sstevel@tonic-gate 1885*0Sstevel@tonic-gate=over 4 1886*0Sstevel@tonic-gate 1887*0Sstevel@tonic-gate=item 1. 1888*0Sstevel@tonic-gate 1889*0Sstevel@tonic-gateTry and match a closing delimiter bracket. If the bracket was the same 1890*0Sstevel@tonic-gatespecies as the last opening bracket, return the substring to that 1891*0Sstevel@tonic-gatepoint. If the bracket was mismatched, return an error. 1892*0Sstevel@tonic-gate 1893*0Sstevel@tonic-gate=item 2. 1894*0Sstevel@tonic-gate 1895*0Sstevel@tonic-gateTry to match a quote or quotelike operator. If found, call 1896*0Sstevel@tonic-gateC<extract_quotelike> to eat it. If C<extract_quotelike> fails, return 1897*0Sstevel@tonic-gatethe error it returned. Otherwise go back to step 1. 1898*0Sstevel@tonic-gate 1899*0Sstevel@tonic-gate=item 3. 1900*0Sstevel@tonic-gate 1901*0Sstevel@tonic-gateTry to match an opening delimiter bracket. If found, call 1902*0Sstevel@tonic-gateC<extract_codeblock> recursively to eat the embedded block. If the 1903*0Sstevel@tonic-gaterecursive call fails, return an error. Otherwise, go back to step 1. 1904*0Sstevel@tonic-gate 1905*0Sstevel@tonic-gate=item 4. 1906*0Sstevel@tonic-gate 1907*0Sstevel@tonic-gateUnconditionally match a bareword or any other single character, and 1908*0Sstevel@tonic-gatethen go back to step 1. 1909*0Sstevel@tonic-gate 1910*0Sstevel@tonic-gate=back 1911*0Sstevel@tonic-gate 1912*0Sstevel@tonic-gate 1913*0Sstevel@tonic-gateExamples: 1914*0Sstevel@tonic-gate 1915*0Sstevel@tonic-gate # Find a while loop in the text 1916*0Sstevel@tonic-gate 1917*0Sstevel@tonic-gate if ($text =~ s/.*?while\s*\{/{/) 1918*0Sstevel@tonic-gate { 1919*0Sstevel@tonic-gate $loop = "while " . extract_codeblock($text); 1920*0Sstevel@tonic-gate } 1921*0Sstevel@tonic-gate 1922*0Sstevel@tonic-gate # Remove the first round-bracketed list (which may include 1923*0Sstevel@tonic-gate # round- or curly-bracketed code blocks or quotelike operators) 1924*0Sstevel@tonic-gate 1925*0Sstevel@tonic-gate extract_codeblock $text, "(){}", '[^(]*'; 1926*0Sstevel@tonic-gate 1927*0Sstevel@tonic-gate 1928*0Sstevel@tonic-gateThe ability to specify a different outermost delimiter bracket is useful 1929*0Sstevel@tonic-gatein some circumstances. For example, in the Parse::RecDescent module, 1930*0Sstevel@tonic-gateparser actions which are to be performed only on a successful parse 1931*0Sstevel@tonic-gateare specified using a C<E<lt>defer:...E<gt>> directive. For example: 1932*0Sstevel@tonic-gate 1933*0Sstevel@tonic-gate sentence: subject verb object 1934*0Sstevel@tonic-gate <defer: {$::theVerb = $item{verb}} > 1935*0Sstevel@tonic-gate 1936*0Sstevel@tonic-gateParse::RecDescent uses C<extract_codeblock($text, '{}E<lt>E<gt>')> to extract the code 1937*0Sstevel@tonic-gatewithin the C<E<lt>defer:...E<gt>> directive, but there's a problem. 1938*0Sstevel@tonic-gate 1939*0Sstevel@tonic-gateA deferred action like this: 1940*0Sstevel@tonic-gate 1941*0Sstevel@tonic-gate <defer: {if ($count>10) {$count--}} > 1942*0Sstevel@tonic-gate 1943*0Sstevel@tonic-gatewill be incorrectly parsed as: 1944*0Sstevel@tonic-gate 1945*0Sstevel@tonic-gate <defer: {if ($count> 1946*0Sstevel@tonic-gate 1947*0Sstevel@tonic-gatebecause the "less than" operator is interpreted as a closing delimiter. 1948*0Sstevel@tonic-gate 1949*0Sstevel@tonic-gateBut, by extracting the directive using 1950*0Sstevel@tonic-gateS<C<extract_codeblock($text, '{}', undef, 'E<lt>E<gt>')>> 1951*0Sstevel@tonic-gatethe '>' character is only treated as a delimited at the outermost 1952*0Sstevel@tonic-gatelevel of the code block, so the directive is parsed correctly. 1953*0Sstevel@tonic-gate 1954*0Sstevel@tonic-gate=head2 C<extract_multiple> 1955*0Sstevel@tonic-gate 1956*0Sstevel@tonic-gateThe C<extract_multiple> subroutine takes a string to be processed and a 1957*0Sstevel@tonic-gatelist of extractors (subroutines or regular expressions) to apply to that string. 1958*0Sstevel@tonic-gate 1959*0Sstevel@tonic-gateIn an array context C<extract_multiple> returns an array of substrings 1960*0Sstevel@tonic-gateof the original string, as extracted by the specified extractors. 1961*0Sstevel@tonic-gateIn a scalar context, C<extract_multiple> returns the first 1962*0Sstevel@tonic-gatesubstring successfully extracted from the original string. In both 1963*0Sstevel@tonic-gatescalar and void contexts the original string has the first successfully 1964*0Sstevel@tonic-gateextracted substring removed from it. In all contexts 1965*0Sstevel@tonic-gateC<extract_multiple> starts at the current C<pos> of the string, and 1966*0Sstevel@tonic-gatesets that C<pos> appropriately after it matches. 1967*0Sstevel@tonic-gate 1968*0Sstevel@tonic-gateHence, the aim of of a call to C<extract_multiple> in a list context 1969*0Sstevel@tonic-gateis to split the processed string into as many non-overlapping fields as 1970*0Sstevel@tonic-gatepossible, by repeatedly applying each of the specified extractors 1971*0Sstevel@tonic-gateto the remainder of the string. Thus C<extract_multiple> is 1972*0Sstevel@tonic-gatea generalized form of Perl's C<split> subroutine. 1973*0Sstevel@tonic-gate 1974*0Sstevel@tonic-gateThe subroutine takes up to four optional arguments: 1975*0Sstevel@tonic-gate 1976*0Sstevel@tonic-gate=over 4 1977*0Sstevel@tonic-gate 1978*0Sstevel@tonic-gate=item 1. 1979*0Sstevel@tonic-gate 1980*0Sstevel@tonic-gateA string to be processed (C<$_> if the string is omitted or C<undef>) 1981*0Sstevel@tonic-gate 1982*0Sstevel@tonic-gate=item 2. 1983*0Sstevel@tonic-gate 1984*0Sstevel@tonic-gateA reference to a list of subroutine references and/or qr// objects and/or 1985*0Sstevel@tonic-gateliteral strings and/or hash references, specifying the extractors 1986*0Sstevel@tonic-gateto be used to split the string. If this argument is omitted (or 1987*0Sstevel@tonic-gateC<undef>) the list: 1988*0Sstevel@tonic-gate 1989*0Sstevel@tonic-gate [ 1990*0Sstevel@tonic-gate sub { extract_variable($_[0], '') }, 1991*0Sstevel@tonic-gate sub { extract_quotelike($_[0],'') }, 1992*0Sstevel@tonic-gate sub { extract_codeblock($_[0],'{}','') }, 1993*0Sstevel@tonic-gate ] 1994*0Sstevel@tonic-gate 1995*0Sstevel@tonic-gateis used. 1996*0Sstevel@tonic-gate 1997*0Sstevel@tonic-gate 1998*0Sstevel@tonic-gate=item 3. 1999*0Sstevel@tonic-gate 2000*0Sstevel@tonic-gateAn number specifying the maximum number of fields to return. If this 2001*0Sstevel@tonic-gateargument is omitted (or C<undef>), split continues as long as possible. 2002*0Sstevel@tonic-gate 2003*0Sstevel@tonic-gateIf the third argument is I<N>, then extraction continues until I<N> fields 2004*0Sstevel@tonic-gatehave been successfully extracted, or until the string has been completely 2005*0Sstevel@tonic-gateprocessed. 2006*0Sstevel@tonic-gate 2007*0Sstevel@tonic-gateNote that in scalar and void contexts the value of this argument is 2008*0Sstevel@tonic-gateautomatically reset to 1 (under C<-w>, a warning is issued if the argument 2009*0Sstevel@tonic-gatehas to be reset). 2010*0Sstevel@tonic-gate 2011*0Sstevel@tonic-gate=item 4. 2012*0Sstevel@tonic-gate 2013*0Sstevel@tonic-gateA value indicating whether unmatched substrings (see below) within the 2014*0Sstevel@tonic-gatetext should be skipped or returned as fields. If the value is true, 2015*0Sstevel@tonic-gatesuch substrings are skipped. Otherwise, they are returned. 2016*0Sstevel@tonic-gate 2017*0Sstevel@tonic-gate=back 2018*0Sstevel@tonic-gate 2019*0Sstevel@tonic-gateThe extraction process works by applying each extractor in 2020*0Sstevel@tonic-gatesequence to the text string. 2021*0Sstevel@tonic-gate 2022*0Sstevel@tonic-gateIf the extractor is a subroutine it is called in a list context and is 2023*0Sstevel@tonic-gateexpected to return a list of a single element, namely the extracted 2024*0Sstevel@tonic-gatetext. It may optionally also return two further arguments: a string 2025*0Sstevel@tonic-gaterepresenting the text left after extraction (like $' for a pattern 2026*0Sstevel@tonic-gatematch), and a string representing any prefix skipped before the 2027*0Sstevel@tonic-gateextraction (like $` in a pattern match). Note that this is designed 2028*0Sstevel@tonic-gateto facilitate the use of other Text::Balanced subroutines with 2029*0Sstevel@tonic-gateC<extract_multiple>. Note too that the value returned by an extractor 2030*0Sstevel@tonic-gatesubroutine need not bear any relationship to the corresponding substring 2031*0Sstevel@tonic-gateof the original text (see examples below). 2032*0Sstevel@tonic-gate 2033*0Sstevel@tonic-gateIf the extractor is a precompiled regular expression or a string, 2034*0Sstevel@tonic-gateit is matched against the text in a scalar context with a leading 2035*0Sstevel@tonic-gate'\G' and the gc modifiers enabled. The extracted value is either 2036*0Sstevel@tonic-gate$1 if that variable is defined after the match, or else the 2037*0Sstevel@tonic-gatecomplete match (i.e. $&). 2038*0Sstevel@tonic-gate 2039*0Sstevel@tonic-gateIf the extractor is a hash reference, it must contain exactly one element. 2040*0Sstevel@tonic-gateThe value of that element is one of the 2041*0Sstevel@tonic-gateabove extractor types (subroutine reference, regular expression, or string). 2042*0Sstevel@tonic-gateThe key of that element is the name of a class into which the successful 2043*0Sstevel@tonic-gatereturn value of the extractor will be blessed. 2044*0Sstevel@tonic-gate 2045*0Sstevel@tonic-gateIf an extractor returns a defined value, that value is immediately 2046*0Sstevel@tonic-gatetreated as the next extracted field and pushed onto the list of fields. 2047*0Sstevel@tonic-gateIf the extractor was specified in a hash reference, the field is also 2048*0Sstevel@tonic-gateblessed into the appropriate class, 2049*0Sstevel@tonic-gate 2050*0Sstevel@tonic-gateIf the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is 2051*0Sstevel@tonic-gateassumed to have failed to extract. 2052*0Sstevel@tonic-gateIf none of the extractor subroutines succeeds, then one 2053*0Sstevel@tonic-gatecharacter is extracted from the start of the text and the extraction 2054*0Sstevel@tonic-gatesubroutines reapplied. Characters which are thus removed are accumulated and 2055*0Sstevel@tonic-gateeventually become the next field (unless the fourth argument is true, in which 2056*0Sstevel@tonic-gatecase they are disgarded). 2057*0Sstevel@tonic-gate 2058*0Sstevel@tonic-gateFor example, the following extracts substrings that are valid Perl variables: 2059*0Sstevel@tonic-gate 2060*0Sstevel@tonic-gate @fields = extract_multiple($text, 2061*0Sstevel@tonic-gate [ sub { extract_variable($_[0]) } ], 2062*0Sstevel@tonic-gate undef, 1); 2063*0Sstevel@tonic-gate 2064*0Sstevel@tonic-gateThis example separates a text into fields which are quote delimited, 2065*0Sstevel@tonic-gatecurly bracketed, and anything else. The delimited and bracketed 2066*0Sstevel@tonic-gateparts are also blessed to identify them (the "anything else" is unblessed): 2067*0Sstevel@tonic-gate 2068*0Sstevel@tonic-gate @fields = extract_multiple($text, 2069*0Sstevel@tonic-gate [ 2070*0Sstevel@tonic-gate { Delim => sub { extract_delimited($_[0],q{'"}) } }, 2071*0Sstevel@tonic-gate { Brack => sub { extract_bracketed($_[0],'{}') } }, 2072*0Sstevel@tonic-gate ]); 2073*0Sstevel@tonic-gate 2074*0Sstevel@tonic-gateThis call extracts the next single substring that is a valid Perl quotelike 2075*0Sstevel@tonic-gateoperator (and removes it from $text): 2076*0Sstevel@tonic-gate 2077*0Sstevel@tonic-gate $quotelike = extract_multiple($text, 2078*0Sstevel@tonic-gate [ 2079*0Sstevel@tonic-gate sub { extract_quotelike($_[0]) }, 2080*0Sstevel@tonic-gate ], undef, 1); 2081*0Sstevel@tonic-gate 2082*0Sstevel@tonic-gateFinally, here is yet another way to do comma-separated value parsing: 2083*0Sstevel@tonic-gate 2084*0Sstevel@tonic-gate @fields = extract_multiple($csv_text, 2085*0Sstevel@tonic-gate [ 2086*0Sstevel@tonic-gate sub { extract_delimited($_[0],q{'"}) }, 2087*0Sstevel@tonic-gate qr/([^,]+)(.*)/, 2088*0Sstevel@tonic-gate ], 2089*0Sstevel@tonic-gate undef,1); 2090*0Sstevel@tonic-gate 2091*0Sstevel@tonic-gateThe list in the second argument means: 2092*0Sstevel@tonic-gateI<"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...">. 2093*0Sstevel@tonic-gateThe undef third argument means: 2094*0Sstevel@tonic-gateI<"...as many times as possible...">, 2095*0Sstevel@tonic-gateand the true value in the fourth argument means 2096*0Sstevel@tonic-gateI<"...discarding anything else that appears (i.e. the commas)">. 2097*0Sstevel@tonic-gate 2098*0Sstevel@tonic-gateIf you wanted the commas preserved as separate fields (i.e. like split 2099*0Sstevel@tonic-gatedoes if your split pattern has capturing parentheses), you would 2100*0Sstevel@tonic-gatejust make the last parameter undefined (or remove it). 2101*0Sstevel@tonic-gate 2102*0Sstevel@tonic-gate 2103*0Sstevel@tonic-gate=head2 C<gen_delimited_pat> 2104*0Sstevel@tonic-gate 2105*0Sstevel@tonic-gateThe C<gen_delimited_pat> subroutine takes a single (string) argument and 2106*0Sstevel@tonic-gate > builds a Friedl-style optimized regex that matches a string delimited 2107*0Sstevel@tonic-gateby any one of the characters in the single argument. For example: 2108*0Sstevel@tonic-gate 2109*0Sstevel@tonic-gate gen_delimited_pat(q{'"}) 2110*0Sstevel@tonic-gate 2111*0Sstevel@tonic-gatereturns the regex: 2112*0Sstevel@tonic-gate 2113*0Sstevel@tonic-gate (?:\"(?:\\\"|(?!\").)*\"|\'(?:\\\'|(?!\').)*\') 2114*0Sstevel@tonic-gate 2115*0Sstevel@tonic-gateNote that the specified delimiters are automatically quotemeta'd. 2116*0Sstevel@tonic-gate 2117*0Sstevel@tonic-gateA typical use of C<gen_delimited_pat> would be to build special purpose tags 2118*0Sstevel@tonic-gatefor C<extract_tagged>. For example, to properly ignore "empty" XML elements 2119*0Sstevel@tonic-gate(which might contain quoted strings): 2120*0Sstevel@tonic-gate 2121*0Sstevel@tonic-gate my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>'; 2122*0Sstevel@tonic-gate 2123*0Sstevel@tonic-gate extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} ); 2124*0Sstevel@tonic-gate 2125*0Sstevel@tonic-gate 2126*0Sstevel@tonic-gateC<gen_delimited_pat> may also be called with an optional second argument, 2127*0Sstevel@tonic-gatewhich specifies the "escape" character(s) to be used for each delimiter. 2128*0Sstevel@tonic-gateFor example to match a Pascal-style string (where ' is the delimiter 2129*0Sstevel@tonic-gateand '' is a literal ' within the string): 2130*0Sstevel@tonic-gate 2131*0Sstevel@tonic-gate gen_delimited_pat(q{'},q{'}); 2132*0Sstevel@tonic-gate 2133*0Sstevel@tonic-gateDifferent escape characters can be specified for different delimiters. 2134*0Sstevel@tonic-gateFor example, to specify that '/' is the escape for single quotes 2135*0Sstevel@tonic-gateand '%' is the escape for double quotes: 2136*0Sstevel@tonic-gate 2137*0Sstevel@tonic-gate gen_delimited_pat(q{'"},q{/%}); 2138*0Sstevel@tonic-gate 2139*0Sstevel@tonic-gateIf more delimiters than escape chars are specified, the last escape char 2140*0Sstevel@tonic-gateis used for the remaining delimiters. 2141*0Sstevel@tonic-gateIf no escape char is specified for a given specified delimiter, '\' is used. 2142*0Sstevel@tonic-gate 2143*0Sstevel@tonic-gateNote that 2144*0Sstevel@tonic-gateC<gen_delimited_pat> was previously called 2145*0Sstevel@tonic-gateC<delimited_pat>. That name may still be used, but is now deprecated. 2146*0Sstevel@tonic-gate 2147*0Sstevel@tonic-gate 2148*0Sstevel@tonic-gate=head1 DIAGNOSTICS 2149*0Sstevel@tonic-gate 2150*0Sstevel@tonic-gateIn a list context, all the functions return C<(undef,$original_text)> 2151*0Sstevel@tonic-gateon failure. In a scalar context, failure is indicated by returning C<undef> 2152*0Sstevel@tonic-gate(in this case the input text is not modified in any way). 2153*0Sstevel@tonic-gate 2154*0Sstevel@tonic-gateIn addition, on failure in I<any> context, the C<$@> variable is set. 2155*0Sstevel@tonic-gateAccessing C<$@-E<gt>{error}> returns one of the error diagnostics listed 2156*0Sstevel@tonic-gatebelow. 2157*0Sstevel@tonic-gateAccessing C<$@-E<gt>{pos}> returns the offset into the original string at 2158*0Sstevel@tonic-gatewhich the error was detected (although not necessarily where it occurred!) 2159*0Sstevel@tonic-gatePrinting C<$@> directly produces the error message, with the offset appended. 2160*0Sstevel@tonic-gateOn success, the C<$@> variable is guaranteed to be C<undef>. 2161*0Sstevel@tonic-gate 2162*0Sstevel@tonic-gateThe available diagnostics are: 2163*0Sstevel@tonic-gate 2164*0Sstevel@tonic-gate=over 4 2165*0Sstevel@tonic-gate 2166*0Sstevel@tonic-gate=item C<Did not find a suitable bracket: "%s"> 2167*0Sstevel@tonic-gate 2168*0Sstevel@tonic-gateThe delimiter provided to C<extract_bracketed> was not one of 2169*0Sstevel@tonic-gateC<'()[]E<lt>E<gt>{}'>. 2170*0Sstevel@tonic-gate 2171*0Sstevel@tonic-gate=item C<Did not find prefix: /%s/> 2172*0Sstevel@tonic-gate 2173*0Sstevel@tonic-gateA non-optional prefix was specified but wasn't found at the start of the text. 2174*0Sstevel@tonic-gate 2175*0Sstevel@tonic-gate=item C<Did not find opening bracket after prefix: "%s"> 2176*0Sstevel@tonic-gate 2177*0Sstevel@tonic-gateC<extract_bracketed> or C<extract_codeblock> was expecting a 2178*0Sstevel@tonic-gateparticular kind of bracket at the start of the text, and didn't find it. 2179*0Sstevel@tonic-gate 2180*0Sstevel@tonic-gate=item C<No quotelike operator found after prefix: "%s"> 2181*0Sstevel@tonic-gate 2182*0Sstevel@tonic-gateC<extract_quotelike> didn't find one of the quotelike operators C<q>, 2183*0Sstevel@tonic-gateC<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> at the start of the substring 2184*0Sstevel@tonic-gateit was extracting. 2185*0Sstevel@tonic-gate 2186*0Sstevel@tonic-gate=item C<Unmatched closing bracket: "%c"> 2187*0Sstevel@tonic-gate 2188*0Sstevel@tonic-gateC<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> encountered 2189*0Sstevel@tonic-gatea closing bracket where none was expected. 2190*0Sstevel@tonic-gate 2191*0Sstevel@tonic-gate=item C<Unmatched opening bracket(s): "%s"> 2192*0Sstevel@tonic-gate 2193*0Sstevel@tonic-gateC<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> ran 2194*0Sstevel@tonic-gateout of characters in the text before closing one or more levels of nested 2195*0Sstevel@tonic-gatebrackets. 2196*0Sstevel@tonic-gate 2197*0Sstevel@tonic-gate=item C<Unmatched embedded quote (%s)> 2198*0Sstevel@tonic-gate 2199*0Sstevel@tonic-gateC<extract_bracketed> attempted to match an embedded quoted substring, but 2200*0Sstevel@tonic-gatefailed to find a closing quote to match it. 2201*0Sstevel@tonic-gate 2202*0Sstevel@tonic-gate=item C<Did not find closing delimiter to match '%s'> 2203*0Sstevel@tonic-gate 2204*0Sstevel@tonic-gateC<extract_quotelike> was unable to find a closing delimiter to match the 2205*0Sstevel@tonic-gateone that opened the quote-like operation. 2206*0Sstevel@tonic-gate 2207*0Sstevel@tonic-gate=item C<Mismatched closing bracket: expected "%c" but found "%s"> 2208*0Sstevel@tonic-gate 2209*0Sstevel@tonic-gateC<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> found 2210*0Sstevel@tonic-gatea valid bracket delimiter, but it was the wrong species. This usually 2211*0Sstevel@tonic-gateindicates a nesting error, but may indicate incorrect quoting or escaping. 2212*0Sstevel@tonic-gate 2213*0Sstevel@tonic-gate=item C<No block delimiter found after quotelike "%s"> 2214*0Sstevel@tonic-gate 2215*0Sstevel@tonic-gateC<extract_quotelike> or C<extract_codeblock> found one of the 2216*0Sstevel@tonic-gatequotelike operators C<q>, C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> 2217*0Sstevel@tonic-gatewithout a suitable block after it. 2218*0Sstevel@tonic-gate 2219*0Sstevel@tonic-gate=item C<Did not find leading dereferencer> 2220*0Sstevel@tonic-gate 2221*0Sstevel@tonic-gateC<extract_variable> was expecting one of '$', '@', or '%' at the start of 2222*0Sstevel@tonic-gatea variable, but didn't find any of them. 2223*0Sstevel@tonic-gate 2224*0Sstevel@tonic-gate=item C<Bad identifier after dereferencer> 2225*0Sstevel@tonic-gate 2226*0Sstevel@tonic-gateC<extract_variable> found a '$', '@', or '%' indicating a variable, but that 2227*0Sstevel@tonic-gatecharacter was not followed by a legal Perl identifier. 2228*0Sstevel@tonic-gate 2229*0Sstevel@tonic-gate=item C<Did not find expected opening bracket at %s> 2230*0Sstevel@tonic-gate 2231*0Sstevel@tonic-gateC<extract_codeblock> failed to find any of the outermost opening brackets 2232*0Sstevel@tonic-gatethat were specified. 2233*0Sstevel@tonic-gate 2234*0Sstevel@tonic-gate=item C<Improperly nested codeblock at %s> 2235*0Sstevel@tonic-gate 2236*0Sstevel@tonic-gateA nested code block was found that started with a delimiter that was specified 2237*0Sstevel@tonic-gateas being only to be used as an outermost bracket. 2238*0Sstevel@tonic-gate 2239*0Sstevel@tonic-gate=item C<Missing second block for quotelike "%s"> 2240*0Sstevel@tonic-gate 2241*0Sstevel@tonic-gateC<extract_codeblock> or C<extract_quotelike> found one of the 2242*0Sstevel@tonic-gatequotelike operators C<s>, C<tr> or C<y> followed by only one block. 2243*0Sstevel@tonic-gate 2244*0Sstevel@tonic-gate=item C<No match found for opening bracket> 2245*0Sstevel@tonic-gate 2246*0Sstevel@tonic-gateC<extract_codeblock> failed to find a closing bracket to match the outermost 2247*0Sstevel@tonic-gateopening bracket. 2248*0Sstevel@tonic-gate 2249*0Sstevel@tonic-gate=item C<Did not find opening tag: /%s/> 2250*0Sstevel@tonic-gate 2251*0Sstevel@tonic-gateC<extract_tagged> did not find a suitable opening tag (after any specified 2252*0Sstevel@tonic-gateprefix was removed). 2253*0Sstevel@tonic-gate 2254*0Sstevel@tonic-gate=item C<Unable to construct closing tag to match: /%s/> 2255*0Sstevel@tonic-gate 2256*0Sstevel@tonic-gateC<extract_tagged> matched the specified opening tag and tried to 2257*0Sstevel@tonic-gatemodify the matched text to produce a matching closing tag (because 2258*0Sstevel@tonic-gatenone was specified). It failed to generate the closing tag, almost 2259*0Sstevel@tonic-gatecertainly because the opening tag did not start with a 2260*0Sstevel@tonic-gatebracket of some kind. 2261*0Sstevel@tonic-gate 2262*0Sstevel@tonic-gate=item C<Found invalid nested tag: %s> 2263*0Sstevel@tonic-gate 2264*0Sstevel@tonic-gateC<extract_tagged> found a nested tag that appeared in the "reject" list 2265*0Sstevel@tonic-gate(and the failure mode was not "MAX" or "PARA"). 2266*0Sstevel@tonic-gate 2267*0Sstevel@tonic-gate=item C<Found unbalanced nested tag: %s> 2268*0Sstevel@tonic-gate 2269*0Sstevel@tonic-gateC<extract_tagged> found a nested opening tag that was not matched by a 2270*0Sstevel@tonic-gatecorresponding nested closing tag (and the failure mode was not "MAX" or "PARA"). 2271*0Sstevel@tonic-gate 2272*0Sstevel@tonic-gate=item C<Did not find closing tag> 2273*0Sstevel@tonic-gate 2274*0Sstevel@tonic-gateC<extract_tagged> reached the end of the text without finding a closing tag 2275*0Sstevel@tonic-gateto match the original opening tag (and the failure mode was not 2276*0Sstevel@tonic-gate"MAX" or "PARA"). 2277*0Sstevel@tonic-gate 2278*0Sstevel@tonic-gate 2279*0Sstevel@tonic-gate 2280*0Sstevel@tonic-gate 2281*0Sstevel@tonic-gate=back 2282*0Sstevel@tonic-gate 2283*0Sstevel@tonic-gate 2284*0Sstevel@tonic-gate=head1 AUTHOR 2285*0Sstevel@tonic-gate 2286*0Sstevel@tonic-gateDamian Conway (damian@conway.org) 2287*0Sstevel@tonic-gate 2288*0Sstevel@tonic-gate 2289*0Sstevel@tonic-gate=head1 BUGS AND IRRITATIONS 2290*0Sstevel@tonic-gate 2291*0Sstevel@tonic-gateThere are undoubtedly serious bugs lurking somewhere in this code, if 2292*0Sstevel@tonic-gateonly because parts of it give the impression of understanding a great deal 2293*0Sstevel@tonic-gatemore about Perl than they really do. 2294*0Sstevel@tonic-gate 2295*0Sstevel@tonic-gateBug reports and other feedback are most welcome. 2296*0Sstevel@tonic-gate 2297*0Sstevel@tonic-gate 2298*0Sstevel@tonic-gate=head1 COPYRIGHT 2299*0Sstevel@tonic-gate 2300*0Sstevel@tonic-gate Copyright (c) 1997-2001, Damian Conway. All Rights Reserved. 2301*0Sstevel@tonic-gate This module is free software. It may be used, redistributed 2302*0Sstevel@tonic-gate and/or modified under the same terms as Perl itself. 2303