1eac174f2Safresh1# Copyright (C) 1997-2001 Damian Conway. All rights reserved. 2eac174f2Safresh1# Copyright (C) 2009 Adam Kennedy. 3*e0680481Safresh1# Copyright (C) 2015, 2022 Steve Hay and other contributors. All rights 4*e0680481Safresh1# reserved. 5eac174f2Safresh1 6eac174f2Safresh1# This module is free software; you can redistribute it and/or modify it under 7eac174f2Safresh1# the same terms as Perl itself, i.e. under the terms of either the GNU General 8eac174f2Safresh1# Public License or the Artistic License, as specified in the F<LICENCE> file. 9eac174f2Safresh1 10b39c5158Smillertpackage Text::Balanced; 11b39c5158Smillert 12b39c5158Smillert# EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS. 13b39c5158Smillert# FOR FULL DOCUMENTATION SEE Balanced.pod 14b39c5158Smillert 15eac174f2Safresh1use 5.008001; 16b39c5158Smillertuse strict; 17b39c5158Smillertuse Exporter (); 18b39c5158Smillert 19b39c5158Smillertuse vars qw { $VERSION @ISA %EXPORT_TAGS }; 20b39c5158SmillertBEGIN { 21*e0680481Safresh1 $VERSION = '2.06'; 22b39c5158Smillert @ISA = 'Exporter'; 23b39c5158Smillert %EXPORT_TAGS = ( 24b39c5158Smillert ALL => [ qw{ 25b39c5158Smillert &extract_delimited 26b39c5158Smillert &extract_bracketed 27b39c5158Smillert &extract_quotelike 28b39c5158Smillert &extract_codeblock 29b39c5158Smillert &extract_variable 30b39c5158Smillert &extract_tagged 31b39c5158Smillert &extract_multiple 32b39c5158Smillert &gen_delimited_pat 33b39c5158Smillert &gen_extract_tagged 34b39c5158Smillert &delimited_pat 35b39c5158Smillert } ], 36b39c5158Smillert ); 37b39c5158Smillert} 38b39c5158Smillert 39b39c5158SmillertExporter::export_ok_tags('ALL'); 40b39c5158Smillert 41*e0680481Safresh1our $RE_PREREGEX_PAT = qr#( 42*e0680481Safresh1 [!=]~ 43*e0680481Safresh1 | split|grep|map 44*e0680481Safresh1 | not|and|or|xor 45*e0680481Safresh1)#x; 46*e0680481Safresh1our $RE_EXPR_PAT = qr#( 47*e0680481Safresh1 (?:\*\*|&&|\|\||<<|>>|//|[-+*x%^&|.])=? 48*e0680481Safresh1 | /(?:[^/]) 49*e0680481Safresh1 | =(?!>) 50*e0680481Safresh1 | return 51*e0680481Safresh1 | [\(\[] 52*e0680481Safresh1)#x; 53*e0680481Safresh1our $RE_NUM = qr/\s*[+\-.0-9][+\-.0-9e]*/i; # numerical constant 54eac174f2Safresh1 55*e0680481Safresh1our %ref2slashvalid; # is quotelike /.../ pattern valid here for given textref? 56*e0680481Safresh1our %ref2qmarkvalid; # is quotelike ?...? pattern valid here for given textref? 57b39c5158Smillert 58b39c5158Smillert# HANDLE RETURN VALUES IN VARIOUS CONTEXTS 59b39c5158Smillert 60b39c5158Smillertsub _failmsg { 61b39c5158Smillert my ($message, $pos) = @_; 62b39c5158Smillert $@ = bless { 63b39c5158Smillert error => $message, 64b39c5158Smillert pos => $pos, 65b39c5158Smillert }, 'Text::Balanced::ErrorMsg'; 66b39c5158Smillert} 67b39c5158Smillert 68b39c5158Smillertsub _fail { 69b39c5158Smillert my ($wantarray, $textref, $message, $pos) = @_; 70b39c5158Smillert _failmsg $message, $pos if $message; 71b39c5158Smillert return (undef, $$textref, undef) if $wantarray; 72eac174f2Safresh1 return; 73b39c5158Smillert} 74b39c5158Smillert 75b39c5158Smillertsub _succeed { 76b39c5158Smillert $@ = undef; 77b39c5158Smillert my ($wantarray,$textref) = splice @_, 0, 2; 78b39c5158Smillert my ($extrapos, $extralen) = @_ > 18 79b39c5158Smillert ? splice(@_, -2, 2) 80b39c5158Smillert : (0, 0); 81b39c5158Smillert my ($startlen, $oppos) = @_[5,6]; 82b39c5158Smillert my $remainderpos = $_[2]; 83b39c5158Smillert if ( $wantarray ) { 84b39c5158Smillert my @res; 85b39c5158Smillert while (my ($from, $len) = splice @_, 0, 2) { 86b39c5158Smillert push @res, substr($$textref, $from, $len); 87b39c5158Smillert } 88b39c5158Smillert if ( $extralen ) { # CORRECT FILLET 89b39c5158Smillert my $extra = substr($res[0], $extrapos-$oppos, $extralen, "\n"); 90b39c5158Smillert $res[1] = "$extra$res[1]"; 91b39c5158Smillert eval { substr($$textref,$remainderpos,0) = $extra; 92b39c5158Smillert substr($$textref,$extrapos,$extralen,"\n")} ; 93b39c5158Smillert #REARRANGE HERE DOC AND FILLET IF POSSIBLE 94b39c5158Smillert pos($$textref) = $remainderpos-$extralen+1; # RESET \G 95b39c5158Smillert } else { 96b39c5158Smillert pos($$textref) = $remainderpos; # RESET \G 97b39c5158Smillert } 98b39c5158Smillert return @res; 99b39c5158Smillert } else { 100b39c5158Smillert my $match = substr($$textref,$_[0],$_[1]); 101b39c5158Smillert substr($match,$extrapos-$_[0]-$startlen,$extralen,"") if $extralen; 102b39c5158Smillert my $extra = $extralen 103b39c5158Smillert ? substr($$textref, $extrapos, $extralen)."\n" : ""; 104b39c5158Smillert eval {substr($$textref,$_[4],$_[1]+$_[5])=$extra} ; #CHOP OUT PREFIX & MATCH, IF POSSIBLE 105b39c5158Smillert pos($$textref) = $_[4]; # RESET \G 106b39c5158Smillert return $match; 107b39c5158Smillert } 108b39c5158Smillert} 109b39c5158Smillert 110b39c5158Smillert# BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING 111*e0680481Safresh1## no critic (Subroutines::ProhibitSubroutinePrototypes) 112b39c5158Smillert 113b39c5158Smillertsub gen_delimited_pat($;$) # ($delimiters;$escapes) 114b39c5158Smillert{ 115b39c5158Smillert my ($dels, $escs) = @_; 116b39c5158Smillert return "" unless $dels =~ /\S/; 117b39c5158Smillert $escs = '\\' unless $escs; 118b39c5158Smillert $escs .= substr($escs,-1) x (length($dels)-length($escs)); 119b39c5158Smillert my @pat = (); 120b39c5158Smillert my $i; 121b39c5158Smillert for ($i=0; $i<length $dels; $i++) 122b39c5158Smillert { 123b39c5158Smillert my $del = quotemeta substr($dels,$i,1); 124b39c5158Smillert my $esc = quotemeta substr($escs,$i,1); 125b39c5158Smillert if ($del eq $esc) 126b39c5158Smillert { 127b39c5158Smillert push @pat, "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del"; 128b39c5158Smillert } 129b39c5158Smillert else 130b39c5158Smillert { 131b39c5158Smillert push @pat, "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del"; 132b39c5158Smillert } 133b39c5158Smillert } 134b39c5158Smillert my $pat = join '|', @pat; 135b39c5158Smillert return "(?:$pat)"; 136b39c5158Smillert} 137b39c5158Smillert 138b39c5158Smillert*delimited_pat = \&gen_delimited_pat; 139b39c5158Smillert 140b39c5158Smillert# THE EXTRACTION FUNCTIONS 141b39c5158Smillert 142b39c5158Smillertsub extract_delimited (;$$$$) 143b39c5158Smillert{ 144b39c5158Smillert my $textref = defined $_[0] ? \$_[0] : \$_; 145*e0680481Safresh1 $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0 if !pos($$textref); # reset 146b39c5158Smillert my $wantarray = wantarray; 147b39c5158Smillert my $del = defined $_[1] ? $_[1] : qq{\'\"\`}; 148b39c5158Smillert my $pre = defined $_[2] ? $_[2] : '\s*'; 149b39c5158Smillert my $esc = defined $_[3] ? $_[3] : qq{\\}; 150b39c5158Smillert my $pat = gen_delimited_pat($del, $esc); 151b39c5158Smillert my $startpos = pos $$textref || 0; 152b39c5158Smillert return _fail($wantarray, $textref, "Not a delimited pattern", 0) 153b39c5158Smillert unless $$textref =~ m/\G($pre)($pat)/gc; 154b39c5158Smillert my $prelen = length($1); 155b39c5158Smillert my $matchpos = $startpos+$prelen; 156b39c5158Smillert my $endpos = pos $$textref; 157b39c5158Smillert return _succeed $wantarray, $textref, 158b39c5158Smillert $matchpos, $endpos-$matchpos, # MATCH 159b39c5158Smillert $endpos, length($$textref)-$endpos, # REMAINDER 160b39c5158Smillert $startpos, $prelen; # PREFIX 161b39c5158Smillert} 162b39c5158Smillert 163*e0680481Safresh1my %eb_delim_cache; 164*e0680481Safresh1sub _eb_delims { 165*e0680481Safresh1 my ($ldel_orig) = @_; 166*e0680481Safresh1 return @{ $eb_delim_cache{$ldel_orig} } if $eb_delim_cache{$ldel_orig}; 167b39c5158Smillert my $qdel = ""; 168b39c5158Smillert my $quotelike; 169*e0680481Safresh1 my $ldel = $ldel_orig; 170b39c5158Smillert $ldel =~ s/'//g and $qdel .= q{'}; 171b39c5158Smillert $ldel =~ s/"//g and $qdel .= q{"}; 172b39c5158Smillert $ldel =~ s/`//g and $qdel .= q{`}; 173b39c5158Smillert $ldel =~ s/q//g and $quotelike = 1; 174b39c5158Smillert $ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds; 175b39c5158Smillert my $rdel = $ldel; 176*e0680481Safresh1 return @{ $eb_delim_cache{$ldel_orig} = [] } unless $rdel =~ tr/[({</])}>/; 177*e0680481Safresh1 my $posbug = pos; 178*e0680481Safresh1 $ldel = join('|', map { quotemeta $_ } split('', $ldel)); 179*e0680481Safresh1 $rdel = join('|', map { quotemeta $_ } split('', $rdel)); 180*e0680481Safresh1 pos = $posbug; 181*e0680481Safresh1 @{ $eb_delim_cache{$ldel_orig} = [ 182*e0680481Safresh1 qr/\G($ldel)/, $qdel && qr/\G([$qdel])/, $quotelike, qr/\G($rdel)/ 183*e0680481Safresh1 ] }; 184*e0680481Safresh1} 185*e0680481Safresh1sub extract_bracketed (;$$$) 186*e0680481Safresh1{ 187*e0680481Safresh1 my $textref = defined $_[0] ? \$_[0] : \$_; 188*e0680481Safresh1 $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0 if !pos($$textref); # reset 189*e0680481Safresh1 my $ldel = defined $_[1] ? $_[1] : '{([<'; 190*e0680481Safresh1 my $pre = defined $_[2] ? qr/\G$_[2]/ : qr/\G\s*/; 191*e0680481Safresh1 my $wantarray = wantarray; 192*e0680481Safresh1 my @ret = _eb_delims($ldel); 193*e0680481Safresh1 unless (@ret) 194b39c5158Smillert { 195b39c5158Smillert return _fail $wantarray, $textref, 196b39c5158Smillert "Did not find a suitable bracket in delimiter: \"$_[1]\"", 197b39c5158Smillert 0; 198b39c5158Smillert } 199b39c5158Smillert 200b39c5158Smillert my $startpos = pos $$textref || 0; 201*e0680481Safresh1 my @match = _match_bracketed($textref, $pre, @ret); 202b39c5158Smillert 203b39c5158Smillert return _fail ($wantarray, $textref) unless @match; 204b39c5158Smillert 205b39c5158Smillert return _succeed ( $wantarray, $textref, 206b39c5158Smillert $match[2], $match[5]+2, # MATCH 207b39c5158Smillert @match[8,9], # REMAINDER 208b39c5158Smillert @match[0,1], # PREFIX 209b39c5158Smillert ); 210b39c5158Smillert} 211b39c5158Smillert 212*e0680481Safresh1sub _match_bracketed # $textref, $pre, $ldel, $qdel, $quotelike, $rdel 213b39c5158Smillert{ 214b39c5158Smillert my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_; 215b39c5158Smillert my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0); 216*e0680481Safresh1 unless ($$textref =~ m/$pre/gc) 217b39c5158Smillert { 218b39c5158Smillert _failmsg "Did not find prefix: /$pre/", $startpos; 219b39c5158Smillert return; 220b39c5158Smillert } 221b39c5158Smillert 222b39c5158Smillert $ldelpos = pos $$textref; 223b39c5158Smillert 224*e0680481Safresh1 unless ($$textref =~ m/$ldel/gc) 225b39c5158Smillert { 226b39c5158Smillert _failmsg "Did not find opening bracket after prefix: \"$pre\"", 227b39c5158Smillert pos $$textref; 228b39c5158Smillert pos $$textref = $startpos; 229b39c5158Smillert return; 230b39c5158Smillert } 231b39c5158Smillert 232b39c5158Smillert my @nesting = ( $1 ); 233b39c5158Smillert my $textlen = length $$textref; 234b39c5158Smillert while (pos $$textref < $textlen) 235b39c5158Smillert { 236b39c5158Smillert next if $$textref =~ m/\G\\./gcs; 237b39c5158Smillert 238*e0680481Safresh1 if ($$textref =~ m/$ldel/gc) 239b39c5158Smillert { 240b39c5158Smillert push @nesting, $1; 241b39c5158Smillert } 242*e0680481Safresh1 elsif ($$textref =~ m/$rdel/gc) 243b39c5158Smillert { 244b39c5158Smillert my ($found, $brackettype) = ($1, $1); 245b39c5158Smillert if ($#nesting < 0) 246b39c5158Smillert { 247b39c5158Smillert _failmsg "Unmatched closing bracket: \"$found\"", 248b39c5158Smillert pos $$textref; 249b39c5158Smillert pos $$textref = $startpos; 250b39c5158Smillert return; 251b39c5158Smillert } 252b39c5158Smillert my $expected = pop(@nesting); 253b39c5158Smillert $expected =~ tr/({[</)}]>/; 254b39c5158Smillert if ($expected ne $brackettype) 255b39c5158Smillert { 256b39c5158Smillert _failmsg qq{Mismatched closing bracket: expected "$expected" but found "$found"}, 257b39c5158Smillert pos $$textref; 258b39c5158Smillert pos $$textref = $startpos; 259b39c5158Smillert return; 260b39c5158Smillert } 261b39c5158Smillert last if $#nesting < 0; 262b39c5158Smillert } 263*e0680481Safresh1 elsif ($qdel && $$textref =~ m/$qdel/gc) 264b39c5158Smillert { 265b39c5158Smillert $$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gsc and next; 266b39c5158Smillert _failmsg "Unmatched embedded quote ($1)", 267b39c5158Smillert pos $$textref; 268b39c5158Smillert pos $$textref = $startpos; 269b39c5158Smillert return; 270b39c5158Smillert } 271*e0680481Safresh1 elsif ($quotelike && _match_quotelike($textref,qr/\G()/,$ref2slashvalid{$textref},$ref2qmarkvalid{$textref})) 272b39c5158Smillert { 273*e0680481Safresh1 $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 1; # back-compat 274b39c5158Smillert next; 275b39c5158Smillert } 276b39c5158Smillert 277b39c5158Smillert else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs } 278b39c5158Smillert } 279b39c5158Smillert if ($#nesting>=0) 280b39c5158Smillert { 281b39c5158Smillert _failmsg "Unmatched opening bracket(s): " 282b39c5158Smillert . join("..",@nesting)."..", 283b39c5158Smillert pos $$textref; 284b39c5158Smillert pos $$textref = $startpos; 285b39c5158Smillert return; 286b39c5158Smillert } 287b39c5158Smillert 288b39c5158Smillert $endpos = pos $$textref; 289b39c5158Smillert 290b39c5158Smillert return ( 291b39c5158Smillert $startpos, $ldelpos-$startpos, # PREFIX 292b39c5158Smillert $ldelpos, 1, # OPENING BRACKET 293b39c5158Smillert $ldelpos+1, $endpos-$ldelpos-2, # CONTENTS 294b39c5158Smillert $endpos-1, 1, # CLOSING BRACKET 295b39c5158Smillert $endpos, length($$textref)-$endpos, # REMAINDER 296b39c5158Smillert ); 297b39c5158Smillert} 298b39c5158Smillert 299b39c5158Smillertsub _revbracket($) 300b39c5158Smillert{ 301b39c5158Smillert my $brack = reverse $_[0]; 302b39c5158Smillert $brack =~ tr/[({</])}>/; 303b39c5158Smillert return $brack; 304b39c5158Smillert} 305b39c5158Smillert 306b39c5158Smillertmy $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*}; 307b39c5158Smillert 308*e0680481Safresh1my $et_default_ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; 309b39c5158Smillertsub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options) 310b39c5158Smillert{ 311b39c5158Smillert my $textref = defined $_[0] ? \$_[0] : \$_; 312*e0680481Safresh1 $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0 if !pos($$textref); # reset 313b39c5158Smillert my $ldel = $_[1]; 314b39c5158Smillert my $rdel = $_[2]; 315*e0680481Safresh1 my $pre = defined $_[3] ? qr/\G$_[3]/ : qr/\G\s*/; 316b39c5158Smillert my %options = defined $_[4] ? %{$_[4]} : (); 317b39c5158Smillert my $omode = defined $options{fail} ? $options{fail} : ''; 318b39c5158Smillert my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}}) 319b39c5158Smillert : defined($options{reject}) ? $options{reject} 320b39c5158Smillert : '' 321b39c5158Smillert ; 322b39c5158Smillert my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}}) 323b39c5158Smillert : defined($options{ignore}) ? $options{ignore} 324b39c5158Smillert : '' 325b39c5158Smillert ; 326b39c5158Smillert 327*e0680481Safresh1 $ldel = $et_default_ldel if !defined $ldel; 328b39c5158Smillert $@ = undef; 329b39c5158Smillert 330b39c5158Smillert my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore); 331b39c5158Smillert 332b39c5158Smillert return _fail(wantarray, $textref) unless @match; 333b39c5158Smillert return _succeed wantarray, $textref, 334b39c5158Smillert $match[2], $match[3]+$match[5]+$match[7], # MATCH 335b39c5158Smillert @match[8..9,0..1,2..7]; # REM, PRE, BITS 336b39c5158Smillert} 337b39c5158Smillert 338b39c5158Smillertsub _match_tagged # ($$$$$$$) 339b39c5158Smillert{ 340b39c5158Smillert my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_; 341b39c5158Smillert my $rdelspec; 342b39c5158Smillert 343b39c5158Smillert my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 ); 344b39c5158Smillert 345*e0680481Safresh1 unless ($$textref =~ m/$pre/gc) 346b39c5158Smillert { 347b39c5158Smillert _failmsg "Did not find prefix: /$pre/", pos $$textref; 348b39c5158Smillert goto failed; 349b39c5158Smillert } 350b39c5158Smillert 351b39c5158Smillert $opentagpos = pos($$textref); 352b39c5158Smillert 353b39c5158Smillert unless ($$textref =~ m/\G$ldel/gc) 354b39c5158Smillert { 355b39c5158Smillert _failmsg "Did not find opening tag: /$ldel/", pos $$textref; 356b39c5158Smillert goto failed; 357b39c5158Smillert } 358b39c5158Smillert 359b39c5158Smillert $textpos = pos($$textref); 360b39c5158Smillert 361b39c5158Smillert if (!defined $rdel) 362b39c5158Smillert { 363b39c5158Smillert $rdelspec = substr($$textref, $-[0], $+[0] - $-[0]); 364b39c5158Smillert unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". _revbracket($1) /oes) 365b39c5158Smillert { 366b39c5158Smillert _failmsg "Unable to construct closing tag to match: $rdel", 367b39c5158Smillert pos $$textref; 368b39c5158Smillert goto failed; 369b39c5158Smillert } 370b39c5158Smillert } 371b39c5158Smillert else 372b39c5158Smillert { 373eac174f2Safresh1 ## no critic (BuiltinFunctions::ProhibitStringyEval) 374b39c5158Smillert $rdelspec = eval "qq{$rdel}" || do { 375b39c5158Smillert my $del; 376b39c5158Smillert for (qw,~ ! ^ & * ) _ + - = } ] : " ; ' > . ? / | ',) 377b39c5158Smillert { next if $rdel =~ /\Q$_/; $del = $_; last } 378b39c5158Smillert unless ($del) { 379b39c5158Smillert use Carp; 380b39c5158Smillert croak "Can't interpolate right delimiter $rdel" 381b39c5158Smillert } 382b39c5158Smillert eval "qq$del$rdel$del"; 383b39c5158Smillert }; 384b39c5158Smillert } 385b39c5158Smillert 386b39c5158Smillert while (pos($$textref) < length($$textref)) 387b39c5158Smillert { 388b39c5158Smillert next if $$textref =~ m/\G\\./gc; 389b39c5158Smillert 390b39c5158Smillert if ($$textref =~ m/\G(\n[ \t]*\n)/gc ) 391b39c5158Smillert { 392b39c5158Smillert $parapos = pos($$textref) - length($1) 393b39c5158Smillert unless defined $parapos; 394b39c5158Smillert } 395b39c5158Smillert elsif ($$textref =~ m/\G($rdelspec)/gc ) 396b39c5158Smillert { 397b39c5158Smillert $closetagpos = pos($$textref)-length($1); 398b39c5158Smillert goto matched; 399b39c5158Smillert } 400b39c5158Smillert elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc) 401b39c5158Smillert { 402b39c5158Smillert next; 403b39c5158Smillert } 404b39c5158Smillert elsif ($bad && $$textref =~ m/\G($bad)/gcs) 405b39c5158Smillert { 406b39c5158Smillert pos($$textref) -= length($1); # CUT OFF WHATEVER CAUSED THE SHORTNESS 407b39c5158Smillert goto short if ($omode eq 'PARA' || $omode eq 'MAX'); 408b39c5158Smillert _failmsg "Found invalid nested tag: $1", pos $$textref; 409b39c5158Smillert goto failed; 410b39c5158Smillert } 411b39c5158Smillert elsif ($$textref =~ m/\G($ldel)/gc) 412b39c5158Smillert { 413b39c5158Smillert my $tag = $1; 414b39c5158Smillert pos($$textref) -= length($tag); # REWIND TO NESTED TAG 415b39c5158Smillert unless (_match_tagged(@_)) # MATCH NESTED TAG 416b39c5158Smillert { 417b39c5158Smillert goto short if $omode eq 'PARA' || $omode eq 'MAX'; 418b39c5158Smillert _failmsg "Found unbalanced nested tag: $tag", 419b39c5158Smillert pos $$textref; 420b39c5158Smillert goto failed; 421b39c5158Smillert } 422b39c5158Smillert } 423b39c5158Smillert else { $$textref =~ m/./gcs } 424b39c5158Smillert } 425b39c5158Smillert 426b39c5158Smillertshort: 427b39c5158Smillert $closetagpos = pos($$textref); 428b39c5158Smillert goto matched if $omode eq 'MAX'; 429b39c5158Smillert goto failed unless $omode eq 'PARA'; 430b39c5158Smillert 431b39c5158Smillert if (defined $parapos) { pos($$textref) = $parapos } 432b39c5158Smillert else { $parapos = pos($$textref) } 433b39c5158Smillert 434b39c5158Smillert return ( 435b39c5158Smillert $startpos, $opentagpos-$startpos, # PREFIX 436b39c5158Smillert $opentagpos, $textpos-$opentagpos, # OPENING TAG 437b39c5158Smillert $textpos, $parapos-$textpos, # TEXT 438b39c5158Smillert $parapos, 0, # NO CLOSING TAG 439b39c5158Smillert $parapos, length($$textref)-$parapos, # REMAINDER 440b39c5158Smillert ); 441b39c5158Smillert 442b39c5158Smillertmatched: 443b39c5158Smillert $endpos = pos($$textref); 444b39c5158Smillert return ( 445b39c5158Smillert $startpos, $opentagpos-$startpos, # PREFIX 446b39c5158Smillert $opentagpos, $textpos-$opentagpos, # OPENING TAG 447b39c5158Smillert $textpos, $closetagpos-$textpos, # TEXT 448b39c5158Smillert $closetagpos, $endpos-$closetagpos, # CLOSING TAG 449b39c5158Smillert $endpos, length($$textref)-$endpos, # REMAINDER 450b39c5158Smillert ); 451b39c5158Smillert 452b39c5158Smillertfailed: 453b39c5158Smillert _failmsg "Did not find closing tag", pos $$textref unless $@; 454b39c5158Smillert pos($$textref) = $startpos; 455b39c5158Smillert return; 456b39c5158Smillert} 457b39c5158Smillert 458b39c5158Smillertsub extract_variable (;$$) 459b39c5158Smillert{ 460b39c5158Smillert my $textref = defined $_[0] ? \$_[0] : \$_; 461b39c5158Smillert return ("","","") unless defined $$textref; 462*e0680481Safresh1 $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0 if !pos($$textref); # reset 463*e0680481Safresh1 my $pre = defined $_[1] ? qr/\G$_[1]/ : qr/\G\s*/; 464b39c5158Smillert 465b39c5158Smillert my @match = _match_variable($textref,$pre); 466b39c5158Smillert 467b39c5158Smillert return _fail wantarray, $textref unless @match; 468b39c5158Smillert 469b39c5158Smillert return _succeed wantarray, $textref, 470b39c5158Smillert @match[2..3,4..5,0..1]; # MATCH, REMAINDER, PREFIX 471b39c5158Smillert} 472b39c5158Smillert 473*e0680481Safresh1sub _match_variable 474b39c5158Smillert{ 475b39c5158Smillert# $# 476b39c5158Smillert# $^ 477b39c5158Smillert# $$ 478b39c5158Smillert my ($textref, $pre) = @_; 479b39c5158Smillert my $startpos = pos($$textref) = pos($$textref)||0; 480*e0680481Safresh1 unless ($$textref =~ m/$pre/gc) 481b39c5158Smillert { 482b39c5158Smillert _failmsg "Did not find prefix: /$pre/", pos $$textref; 483b39c5158Smillert return; 484b39c5158Smillert } 485b39c5158Smillert my $varpos = pos($$textref); 486b39c5158Smillert unless ($$textref =~ m{\G\$\s*(?!::)(\d+|[][&`'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci) 487b39c5158Smillert { 488b39c5158Smillert unless ($$textref =~ m/\G((\$#?|[*\@\%]|\\&)+)/gc) 489b39c5158Smillert { 490b39c5158Smillert _failmsg "Did not find leading dereferencer", pos $$textref; 491b39c5158Smillert pos $$textref = $startpos; 492b39c5158Smillert return; 493b39c5158Smillert } 494b39c5158Smillert my $deref = $1; 495b39c5158Smillert 496b39c5158Smillert unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci 497*e0680481Safresh1 or _match_codeblock($textref, qr/\G()/, '\{', qr/\G\s*(\})/, '\{', '\}', 0, 1) 498*e0680481Safresh1 or $deref eq '$#' or $deref eq '$$' 499*e0680481Safresh1 or pos($$textref) == length $$textref ) 500b39c5158Smillert { 501b39c5158Smillert _failmsg "Bad identifier after dereferencer", pos $$textref; 502b39c5158Smillert pos $$textref = $startpos; 503b39c5158Smillert return; 504b39c5158Smillert } 505b39c5158Smillert } 506b39c5158Smillert 507b39c5158Smillert while (1) 508b39c5158Smillert { 509b39c5158Smillert next if $$textref =~ m/\G\s*(?:->)?\s*[{]\w+[}]/gc; 510b39c5158Smillert next if _match_codeblock($textref, 511*e0680481Safresh1 qr/\G\s*->\s*(?:[_a-zA-Z]\w+\s*)?/, 512*e0680481Safresh1 qr/[({[]/, qr/\G\s*([)}\]])/, 513*e0680481Safresh1 qr/[({[]/, qr/[)}\]]/, 0, 1); 514b39c5158Smillert next if _match_codeblock($textref, 515*e0680481Safresh1 qr/\G\s*/, qr/[{[]/, qr/\G\s*([}\]])/, 516*e0680481Safresh1 qr/[{[]/, qr/[}\]]/, 0, 1); 517*e0680481Safresh1 next if _match_variable($textref,qr/\G\s*->\s*/); 518b39c5158Smillert next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc; 519b39c5158Smillert last; 520b39c5158Smillert } 521*e0680481Safresh1 $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 0; 522b39c5158Smillert 523b39c5158Smillert my $endpos = pos($$textref); 524b39c5158Smillert return ($startpos, $varpos-$startpos, 525b39c5158Smillert $varpos, $endpos-$varpos, 526b39c5158Smillert $endpos, length($$textref)-$endpos 527b39c5158Smillert ); 528b39c5158Smillert} 529b39c5158Smillert 530*e0680481Safresh1my %ec_delim_cache; 531*e0680481Safresh1sub _ec_delims { 532*e0680481Safresh1 my ($ldel_inner, $ldel_outer) = @_; 533*e0680481Safresh1 return @{ $ec_delim_cache{$ldel_outer}{$ldel_inner} } 534*e0680481Safresh1 if $ec_delim_cache{$ldel_outer}{$ldel_inner}; 535b39c5158Smillert my $rdel_inner = $ldel_inner; 536b39c5158Smillert my $rdel_outer = $ldel_outer; 537b39c5158Smillert my $posbug = pos; 538b39c5158Smillert for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds } 539b39c5158Smillert for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds } 540b39c5158Smillert for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer) 541b39c5158Smillert { 542b39c5158Smillert $_ = '('.join('|',map { quotemeta $_ } split('',$_)).')' 543b39c5158Smillert } 544b39c5158Smillert pos = $posbug; 545*e0680481Safresh1 @{ $ec_delim_cache{$ldel_outer}{$ldel_inner} = [ 546*e0680481Safresh1 $ldel_outer, qr/\G\s*($rdel_outer)/, $ldel_inner, $rdel_inner 547*e0680481Safresh1 ] }; 548*e0680481Safresh1} 549*e0680481Safresh1sub extract_codeblock (;$$$$$) 550*e0680481Safresh1{ 551*e0680481Safresh1 my $textref = defined $_[0] ? \$_[0] : \$_; 552*e0680481Safresh1 $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0 if !pos($$textref); # reset 553*e0680481Safresh1 my $wantarray = wantarray; 554*e0680481Safresh1 my $ldel_inner = defined $_[1] ? $_[1] : '{'; 555*e0680481Safresh1 my $pre = !defined $_[2] ? qr/\G\s*/ : qr/\G$_[2]/; 556*e0680481Safresh1 my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner; 557*e0680481Safresh1 my $rd = $_[4]; 558*e0680481Safresh1 my @delims = _ec_delims($ldel_inner, $ldel_outer); 559b39c5158Smillert 560*e0680481Safresh1 my @match = _match_codeblock($textref, $pre, @delims, $rd, 1); 561b39c5158Smillert return _fail($wantarray, $textref) unless @match; 562b39c5158Smillert return _succeed($wantarray, $textref, 563b39c5158Smillert @match[2..3,4..5,0..1] # MATCH, REMAINDER, PREFIX 564b39c5158Smillert ); 565b39c5158Smillert} 566b39c5158Smillert 567*e0680481Safresh1sub _match_codeblock 568b39c5158Smillert{ 569*e0680481Safresh1 my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd, $no_backcompat) = @_; 570*e0680481Safresh1 $rdel_outer = qr/\G\s*($rdel_outer)/ if !$no_backcompat; # Switch calls this func directly 571b39c5158Smillert my $startpos = pos($$textref) = pos($$textref) || 0; 572*e0680481Safresh1 unless ($$textref =~ m/$pre/gc) 573b39c5158Smillert { 574b39c5158Smillert _failmsg qq{Did not match prefix /$pre/ at"} . 575b39c5158Smillert substr($$textref,pos($$textref),20) . 576b39c5158Smillert q{..."}, 577b39c5158Smillert pos $$textref; 578b39c5158Smillert return; 579b39c5158Smillert } 580b39c5158Smillert my $codepos = pos($$textref); 581b39c5158Smillert unless ($$textref =~ m/\G($ldel_outer)/gc) # OUTERMOST DELIMITER 582b39c5158Smillert { 583b39c5158Smillert _failmsg qq{Did not find expected opening bracket at "} . 584b39c5158Smillert substr($$textref,pos($$textref),20) . 585b39c5158Smillert q{..."}, 586b39c5158Smillert pos $$textref; 587b39c5158Smillert pos $$textref = $startpos; 588b39c5158Smillert return; 589b39c5158Smillert } 590b39c5158Smillert my $closing = $1; 591b39c5158Smillert $closing =~ tr/([<{/)]>}/; 592b39c5158Smillert my $matched; 593*e0680481Safresh1 $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0 594*e0680481Safresh1 if !pos($$textref) or !defined $ref2slashvalid{$textref}; # default, or reset 595b39c5158Smillert while (pos($$textref) < length($$textref)) 596b39c5158Smillert { 597b39c5158Smillert if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc) 598b39c5158Smillert { 599*e0680481Safresh1 $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 0; 600b39c5158Smillert next; 601b39c5158Smillert } 602b39c5158Smillert 603b39c5158Smillert if ($$textref =~ m/\G\s*#.*/gc) 604b39c5158Smillert { 605b39c5158Smillert next; 606b39c5158Smillert } 607b39c5158Smillert 608*e0680481Safresh1 if ($$textref =~ m/$rdel_outer/gc) 609b39c5158Smillert { 610b39c5158Smillert unless ($matched = ($closing && $1 eq $closing) ) 611b39c5158Smillert { 612b39c5158Smillert next if $1 eq '>'; # MIGHT BE A "LESS THAN" 613b39c5158Smillert _failmsg q{Mismatched closing bracket at "} . 614b39c5158Smillert substr($$textref,pos($$textref),20) . 615b39c5158Smillert qq{...". Expected '$closing'}, 616b39c5158Smillert pos $$textref; 617b39c5158Smillert } 618b39c5158Smillert last; 619b39c5158Smillert } 620b39c5158Smillert 621*e0680481Safresh1 if (_match_variable($textref,qr/\G\s*/) || 622*e0680481Safresh1 _match_quotelike($textref,qr/\G\s*/,$ref2slashvalid{$textref},$ref2qmarkvalid{$textref}) ) 623b39c5158Smillert { 624*e0680481Safresh1 $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 0; 625b39c5158Smillert next; 626b39c5158Smillert } 627b39c5158Smillert 628*e0680481Safresh1 if ($$textref =~ m#\G\s*(?!$ldel_inner)(?:$RE_PREREGEX_PAT|$RE_EXPR_PAT)#gc) 629b39c5158Smillert { 630*e0680481Safresh1 $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 1; 631b39c5158Smillert next; 632b39c5158Smillert } 633b39c5158Smillert 634*e0680481Safresh1 if ( _match_codeblock($textref, qr/\G\s*/, $ldel_inner, qr/\G\s*($rdel_inner)/, $ldel_inner, $rdel_inner, $rd, 1) ) 635b39c5158Smillert { 636*e0680481Safresh1 $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 1; 637b39c5158Smillert next; 638b39c5158Smillert } 639b39c5158Smillert 640b39c5158Smillert if ($$textref =~ m/\G\s*$ldel_outer/gc) 641b39c5158Smillert { 642b39c5158Smillert _failmsg q{Improperly nested codeblock at "} . 643b39c5158Smillert substr($$textref,pos($$textref),20) . 644b39c5158Smillert q{..."}, 645b39c5158Smillert pos $$textref; 646b39c5158Smillert last; 647b39c5158Smillert } 648b39c5158Smillert 649*e0680481Safresh1 $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 0; 650b39c5158Smillert $$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc; 651b39c5158Smillert } 652b39c5158Smillert continue { $@ = undef } 653b39c5158Smillert 654b39c5158Smillert unless ($matched) 655b39c5158Smillert { 656b39c5158Smillert _failmsg 'No match found for opening bracket', pos $$textref 657b39c5158Smillert unless $@; 658b39c5158Smillert return; 659b39c5158Smillert } 660b39c5158Smillert 661*e0680481Safresh1 $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = undef; 662b39c5158Smillert my $endpos = pos($$textref); 663b39c5158Smillert return ( $startpos, $codepos-$startpos, 664b39c5158Smillert $codepos, $endpos-$codepos, 665b39c5158Smillert $endpos, length($$textref)-$endpos, 666b39c5158Smillert ); 667b39c5158Smillert} 668b39c5158Smillert 669b39c5158Smillert 670b39c5158Smillertmy %mods = ( 671b39c5158Smillert 'none' => '[cgimsox]*', 672b39c5158Smillert 'm' => '[cgimsox]*', 673b39c5158Smillert 's' => '[cegimsox]*', 674b39c5158Smillert 'tr' => '[cds]*', 675b39c5158Smillert 'y' => '[cds]*', 676b39c5158Smillert 'qq' => '', 677b39c5158Smillert 'qx' => '', 678b39c5158Smillert 'qw' => '', 679b39c5158Smillert 'qr' => '[imsx]*', 680b39c5158Smillert 'q' => '', 681b39c5158Smillert); 682b39c5158Smillert 683b39c5158Smillertsub extract_quotelike (;$$) 684b39c5158Smillert{ 685b39c5158Smillert my $textref = $_[0] ? \$_[0] : \$_; 686*e0680481Safresh1 $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0 if !pos($$textref); # reset 687b39c5158Smillert my $wantarray = wantarray; 688*e0680481Safresh1 my $pre = defined $_[1] ? qr/\G$_[1]/ : qr/\G\s*/; 689b39c5158Smillert 690*e0680481Safresh1 my @match = _match_quotelike($textref,$pre,$ref2slashvalid{$textref},$ref2qmarkvalid{$textref}); 691b39c5158Smillert return _fail($wantarray, $textref) unless @match; 692b39c5158Smillert return _succeed($wantarray, $textref, 693b39c5158Smillert $match[2], $match[18]-$match[2], # MATCH 694b39c5158Smillert @match[18,19], # REMAINDER 695b39c5158Smillert @match[0,1], # PREFIX 696b39c5158Smillert @match[2..17], # THE BITS 697b39c5158Smillert @match[20,21], # ANY FILLET? 698b39c5158Smillert ); 699b39c5158Smillert}; 700b39c5158Smillert 701*e0680481Safresh1my %maybe_quote = map +($_=>1), qw(" ' `); 702*e0680481Safresh1sub _match_quotelike 703b39c5158Smillert{ 704*e0680481Safresh1 my ($textref, $pre, $allow_slash_match, $allow_qmark_match) = @_; 705*e0680481Safresh1 $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0 706*e0680481Safresh1 if !pos($$textref) or !defined $ref2slashvalid{$textref}; # default, or reset 707b39c5158Smillert 708b39c5158Smillert my ($textlen,$startpos, 709b39c5158Smillert $preld1pos,$ld1pos,$str1pos,$rd1pos, 710b39c5158Smillert $preld2pos,$ld2pos,$str2pos,$rd2pos, 711b39c5158Smillert $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 ); 712b39c5158Smillert 713*e0680481Safresh1 unless ($$textref =~ m/$pre/gc) 714b39c5158Smillert { 715b39c5158Smillert _failmsg qq{Did not find prefix /$pre/ at "} . 716b39c5158Smillert substr($$textref, pos($$textref), 20) . 717b39c5158Smillert q{..."}, 718b39c5158Smillert pos $$textref; 719b39c5158Smillert return; 720b39c5158Smillert } 721*e0680481Safresh1 my $oppos = pos($$textref); 722b39c5158Smillert my $initial = substr($$textref,$oppos,1); 723*e0680481Safresh1 if ($initial && $maybe_quote{$initial} 724*e0680481Safresh1 || $allow_slash_match && $initial eq '/' 725*e0680481Safresh1 || $allow_qmark_match && $initial eq '?') 726b39c5158Smillert { 727*e0680481Safresh1 unless ($$textref =~ m/\G \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcsx) 728b39c5158Smillert { 729b39c5158Smillert _failmsg qq{Did not find closing delimiter to match '$initial' at "} . 730b39c5158Smillert substr($$textref, $oppos, 20) . 731b39c5158Smillert q{..."}, 732b39c5158Smillert pos $$textref; 733b39c5158Smillert pos $$textref = $startpos; 734b39c5158Smillert return; 735b39c5158Smillert } 736b39c5158Smillert $modpos= pos($$textref); 737b39c5158Smillert $rd1pos = $modpos-1; 738b39c5158Smillert 739b39c5158Smillert if ($initial eq '/' || $initial eq '?') 740b39c5158Smillert { 741b39c5158Smillert $$textref =~ m/\G$mods{none}/gc 742b39c5158Smillert } 743b39c5158Smillert 744b39c5158Smillert my $endpos = pos($$textref); 745*e0680481Safresh1 $ref2qmarkvalid{$textref} = $ref2slashvalid{$textref} = 0; 746b39c5158Smillert return ( 747b39c5158Smillert $startpos, $oppos-$startpos, # PREFIX 748b39c5158Smillert $oppos, 0, # NO OPERATOR 749b39c5158Smillert $oppos, 1, # LEFT DEL 750b39c5158Smillert $oppos+1, $rd1pos-$oppos-1, # STR/PAT 751b39c5158Smillert $rd1pos, 1, # RIGHT DEL 752b39c5158Smillert $modpos, 0, # NO 2ND LDEL 753b39c5158Smillert $modpos, 0, # NO 2ND STR 754b39c5158Smillert $modpos, 0, # NO 2ND RDEL 755b39c5158Smillert $modpos, $endpos-$modpos, # MODIFIERS 756b39c5158Smillert $endpos, $textlen-$endpos, # REMAINDER 757b39c5158Smillert ); 758b39c5158Smillert } 759b39c5158Smillert 760*e0680481Safresh1 unless ($$textref =~ m{\G(\b(?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<(?=[a-zA-Z]|\s*['"`;,]))}gc) 761b39c5158Smillert { 762b39c5158Smillert _failmsg q{No quotelike operator found after prefix at "} . 763b39c5158Smillert substr($$textref, pos($$textref), 20) . 764b39c5158Smillert q{..."}, 765b39c5158Smillert pos $$textref; 766b39c5158Smillert pos $$textref = $startpos; 767b39c5158Smillert return; 768b39c5158Smillert } 769b39c5158Smillert 770b39c5158Smillert my $op = $1; 771b39c5158Smillert $preld1pos = pos($$textref); 772b39c5158Smillert if ($op eq '<<') { 773b39c5158Smillert $ld1pos = pos($$textref); 774b39c5158Smillert my $label; 775b39c5158Smillert if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) { 776b39c5158Smillert $label = $1; 777b39c5158Smillert } 778b39c5158Smillert elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\]*)*) ' 779b39c5158Smillert | \G " ([^"\\]* (?:\\.[^"\\]*)*) " 780b39c5158Smillert | \G ` ([^`\\]* (?:\\.[^`\\]*)*) ` 781b39c5158Smillert }gcsx) { 782b39c5158Smillert $label = $+; 783b39c5158Smillert } 784b39c5158Smillert else { 785b39c5158Smillert $label = ""; 786b39c5158Smillert } 787b39c5158Smillert my $extrapos = pos($$textref); 788b39c5158Smillert $$textref =~ m{.*\n}gc; 789b39c5158Smillert $str1pos = pos($$textref)--; 790b39c5158Smillert unless ($$textref =~ m{.*?\n(?=\Q$label\E\n)}gc) { 791b39c5158Smillert _failmsg qq{Missing here doc terminator ('$label') after "} . 792b39c5158Smillert substr($$textref, $startpos, 20) . 793b39c5158Smillert q{..."}, 794b39c5158Smillert pos $$textref; 795b39c5158Smillert pos $$textref = $startpos; 796b39c5158Smillert return; 797b39c5158Smillert } 798b39c5158Smillert $rd1pos = pos($$textref); 799b39c5158Smillert $$textref =~ m{\Q$label\E\n}gc; 800b39c5158Smillert $ld2pos = pos($$textref); 801*e0680481Safresh1 $ref2qmarkvalid{$textref} = $ref2slashvalid{$textref} = 0; 802b39c5158Smillert return ( 803b39c5158Smillert $startpos, $oppos-$startpos, # PREFIX 804b39c5158Smillert $oppos, length($op), # OPERATOR 805b39c5158Smillert $ld1pos, $extrapos-$ld1pos, # LEFT DEL 806b39c5158Smillert $str1pos, $rd1pos-$str1pos, # STR/PAT 807b39c5158Smillert $rd1pos, $ld2pos-$rd1pos, # RIGHT DEL 808b39c5158Smillert $ld2pos, 0, # NO 2ND LDEL 809b39c5158Smillert $ld2pos, 0, # NO 2ND STR 810b39c5158Smillert $ld2pos, 0, # NO 2ND RDEL 811b39c5158Smillert $ld2pos, 0, # NO MODIFIERS 812b39c5158Smillert $ld2pos, $textlen-$ld2pos, # REMAINDER 813b39c5158Smillert $extrapos, $str1pos-$extrapos, # FILLETED BIT 814b39c5158Smillert ); 815b39c5158Smillert } 816b39c5158Smillert 817b39c5158Smillert $$textref =~ m/\G\s*/gc; 818b39c5158Smillert $ld1pos = pos($$textref); 819b39c5158Smillert $str1pos = $ld1pos+1; 820b39c5158Smillert 821*e0680481Safresh1 if ($$textref !~ m/\G(\S)/gc) # SHOULD USE LOOKAHEAD 822b39c5158Smillert { 823b39c5158Smillert _failmsg "No block delimiter found after quotelike $op", 824b39c5158Smillert pos $$textref; 825b39c5158Smillert pos $$textref = $startpos; 826b39c5158Smillert return; 827b39c5158Smillert } 828*e0680481Safresh1 elsif (substr($$textref, $ld1pos, 2) eq '=>') 829*e0680481Safresh1 { 830*e0680481Safresh1 _failmsg "quotelike $op was actually quoted by '=>'", 831*e0680481Safresh1 pos $$textref; 832*e0680481Safresh1 pos $$textref = $startpos; 833*e0680481Safresh1 return; 834*e0680481Safresh1 } 835b39c5158Smillert pos($$textref) = $ld1pos; # HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN 836b39c5158Smillert my ($ldel1, $rdel1) = ("\Q$1","\Q$1"); 837b39c5158Smillert if ($ldel1 =~ /[[(<{]/) 838b39c5158Smillert { 839b39c5158Smillert $rdel1 =~ tr/[({</])}>/; 840*e0680481Safresh1 defined(_match_bracketed($textref,qr/\G/,qr/\G($ldel1)/,"","",qr/\G($rdel1)/)) 841b39c5158Smillert || do { pos $$textref = $startpos; return }; 842b39c5158Smillert $ld2pos = pos($$textref); 843b39c5158Smillert $rd1pos = $ld2pos-1; 844b39c5158Smillert } 845b39c5158Smillert else 846b39c5158Smillert { 847b39c5158Smillert $$textref =~ /\G$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gcs 848b39c5158Smillert || do { pos $$textref = $startpos; return }; 849b39c5158Smillert $ld2pos = $rd1pos = pos($$textref)-1; 850b39c5158Smillert } 851b39c5158Smillert 852b39c5158Smillert my $second_arg = $op =~ /s|tr|y/ ? 1 : 0; 853b39c5158Smillert if ($second_arg) 854b39c5158Smillert { 855b39c5158Smillert my ($ldel2, $rdel2); 856b39c5158Smillert if ($ldel1 =~ /[[(<{]/) 857b39c5158Smillert { 858b39c5158Smillert unless ($$textref =~ /\G\s*(\S)/gc) # SHOULD USE LOOKAHEAD 859b39c5158Smillert { 860b39c5158Smillert _failmsg "Missing second block for quotelike $op", 861b39c5158Smillert pos $$textref; 862b39c5158Smillert pos $$textref = $startpos; 863b39c5158Smillert return; 864b39c5158Smillert } 865b39c5158Smillert $ldel2 = $rdel2 = "\Q$1"; 866b39c5158Smillert $rdel2 =~ tr/[({</])}>/; 867b39c5158Smillert } 868b39c5158Smillert else 869b39c5158Smillert { 870b39c5158Smillert $ldel2 = $rdel2 = $ldel1; 871b39c5158Smillert } 872b39c5158Smillert $str2pos = $ld2pos+1; 873b39c5158Smillert 874b39c5158Smillert if ($ldel2 =~ /[[(<{]/) 875b39c5158Smillert { 876b39c5158Smillert pos($$textref)--; # OVERCOME BROKEN LOOKAHEAD 877*e0680481Safresh1 defined(_match_bracketed($textref,qr/\G/,qr/\G($ldel2)/,"","",qr/\G($rdel2)/)) 878b39c5158Smillert || do { pos $$textref = $startpos; return }; 879b39c5158Smillert } 880b39c5158Smillert else 881b39c5158Smillert { 882b39c5158Smillert $$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gcs 883b39c5158Smillert || do { pos $$textref = $startpos; return }; 884b39c5158Smillert } 885b39c5158Smillert $rd2pos = pos($$textref)-1; 886b39c5158Smillert } 887b39c5158Smillert else 888b39c5158Smillert { 889b39c5158Smillert $ld2pos = $str2pos = $rd2pos = $rd1pos; 890b39c5158Smillert } 891b39c5158Smillert 892b39c5158Smillert $modpos = pos $$textref; 893b39c5158Smillert 894b39c5158Smillert $$textref =~ m/\G($mods{$op})/gc; 895b39c5158Smillert my $endpos = pos $$textref; 896*e0680481Safresh1 $ref2qmarkvalid{$textref} = $ref2slashvalid{$textref} = undef; 897b39c5158Smillert 898b39c5158Smillert return ( 899b39c5158Smillert $startpos, $oppos-$startpos, # PREFIX 900b39c5158Smillert $oppos, length($op), # OPERATOR 901b39c5158Smillert $ld1pos, 1, # LEFT DEL 902b39c5158Smillert $str1pos, $rd1pos-$str1pos, # STR/PAT 903b39c5158Smillert $rd1pos, 1, # RIGHT DEL 904b39c5158Smillert $ld2pos, $second_arg, # 2ND LDEL (MAYBE) 905b39c5158Smillert $str2pos, $rd2pos-$str2pos, # 2ND STR (MAYBE) 906b39c5158Smillert $rd2pos, $second_arg, # 2ND RDEL (MAYBE) 907b39c5158Smillert $modpos, $endpos-$modpos, # MODIFIERS 908b39c5158Smillert $endpos, $textlen-$endpos, # REMAINDER 909b39c5158Smillert ); 910b39c5158Smillert} 911b39c5158Smillert 912b39c5158Smillertmy $def_func = [ 913b39c5158Smillert sub { extract_variable($_[0], '') }, 914b39c5158Smillert sub { extract_quotelike($_[0],'') }, 915b39c5158Smillert sub { extract_codeblock($_[0],'{}','') }, 916b39c5158Smillert]; 917*e0680481Safresh1my %ref_not_regex = map +($_=>1), qw(CODE Text::Balanced::Extractor); 918b39c5158Smillert 919*e0680481Safresh1sub _update_patvalid { 920*e0680481Safresh1 my ($textref, $text) = @_; 921*e0680481Safresh1 if ($ref2slashvalid{$textref} && $text =~ m/(?:$RE_NUM|[\)\]])\s*$/) 922*e0680481Safresh1 { 923*e0680481Safresh1 $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 0; 924*e0680481Safresh1 } elsif (!$ref2slashvalid{$textref} && $text =~ m/$RE_PREREGEX_PAT\s*$/) 925*e0680481Safresh1 { 926*e0680481Safresh1 $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 1; 927*e0680481Safresh1 } elsif (!$ref2slashvalid{$textref} && $text =~ m/$RE_EXPR_PAT\s*$/) 928*e0680481Safresh1 { 929*e0680481Safresh1 $ref2slashvalid{$textref} = 1; 930*e0680481Safresh1 $ref2qmarkvalid{$textref} = 0; 931*e0680481Safresh1 } 932*e0680481Safresh1} 933b39c5158Smillertsub extract_multiple (;$$$$) # ($text, $functions_ref, $max_fields, $ignoreunknown) 934b39c5158Smillert{ 935b39c5158Smillert my $textref = defined($_[0]) ? \$_[0] : \$_; 936*e0680481Safresh1 $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0 if !pos($$textref); # reset 937b39c5158Smillert my $posbug = pos; 938b39c5158Smillert my ($lastpos, $firstpos); 939b39c5158Smillert my @fields = (); 940b39c5158Smillert 941b39c5158Smillert #for ($$textref) 942b39c5158Smillert { 943b39c5158Smillert my @func = defined $_[1] ? @{$_[1]} : @{$def_func}; 944b39c5158Smillert my $max = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000; 945b39c5158Smillert my $igunk = $_[3]; 946b39c5158Smillert 947b39c5158Smillert pos $$textref ||= 0; 948b39c5158Smillert 949b39c5158Smillert unless (wantarray) 950b39c5158Smillert { 951b39c5158Smillert use Carp; 952b39c5158Smillert carp "extract_multiple reset maximal count to 1 in scalar context" 953b39c5158Smillert if $^W && defined($_[2]) && $max > 1; 954b39c5158Smillert $max = 1 955b39c5158Smillert } 956b39c5158Smillert 957b39c5158Smillert my @class; 958eac174f2Safresh1 foreach my $func ( @func ) 959b39c5158Smillert { 960b39c5158Smillert push @class, undef; 961*e0680481Safresh1 ($class[-1], $func) = %$func if ref($func) eq 'HASH'; 962*e0680481Safresh1 $func = qr/\G$func/ if !$ref_not_regex{ref $func}; 963b39c5158Smillert } 964b39c5158Smillert 965*e0680481Safresh1 my $unkpos; 966b39c5158Smillert FIELD: while (pos($$textref) < length($$textref)) 967b39c5158Smillert { 968b39c5158Smillert foreach my $i ( 0..$#func ) 969b39c5158Smillert { 970*e0680481Safresh1 my ($field, $pref); 971*e0680481Safresh1 my ($class, $func) = ($class[$i], $func[$i]); 972b39c5158Smillert $lastpos = pos $$textref; 973b39c5158Smillert if (ref($func) eq 'CODE') 974*e0680481Safresh1 { ($field,undef,$pref) = $func->($$textref) } 975b39c5158Smillert elsif (ref($func) eq 'Text::Balanced::Extractor') 976*e0680481Safresh1 { $field = $func->extract($$textref) } 977*e0680481Safresh1 elsif( $$textref =~ m/$func[$i]/gc ) 978*e0680481Safresh1 { $field = defined($1) 979b39c5158Smillert ? $1 980b39c5158Smillert : substr($$textref, $-[0], $+[0] - $-[0]) 981b39c5158Smillert } 982b39c5158Smillert $pref ||= ""; 983b39c5158Smillert if (defined($field) && length($field)) 984b39c5158Smillert { 985b39c5158Smillert if (!$igunk) { 986b39c5158Smillert $unkpos = $lastpos 987b39c5158Smillert if length($pref) && !defined($unkpos); 988b39c5158Smillert if (defined $unkpos) 989b39c5158Smillert { 990b39c5158Smillert push @fields, substr($$textref, $unkpos, $lastpos-$unkpos).$pref; 991b39c5158Smillert $firstpos = $unkpos unless defined $firstpos; 992b39c5158Smillert undef $unkpos; 993b39c5158Smillert last FIELD if @fields == $max; 994b39c5158Smillert } 995b39c5158Smillert } 996*e0680481Safresh1 push @fields, $class ? bless(\$field, $class) : $field; 997*e0680481Safresh1 _update_patvalid($textref, $fields[-1]); 998b39c5158Smillert $firstpos = $lastpos unless defined $firstpos; 999b39c5158Smillert $lastpos = pos $$textref; 1000b39c5158Smillert last FIELD if @fields == $max; 1001b39c5158Smillert next FIELD; 1002b39c5158Smillert } 1003b39c5158Smillert } 1004b39c5158Smillert if ($$textref =~ /\G(.)/gcs) 1005b39c5158Smillert { 1006b39c5158Smillert $unkpos = pos($$textref)-1 1007b39c5158Smillert unless $igunk || defined $unkpos; 1008*e0680481Safresh1 _update_patvalid($textref, substr $$textref, $unkpos, pos($$textref)-$unkpos); 1009b39c5158Smillert } 1010b39c5158Smillert } 1011b39c5158Smillert 1012b39c5158Smillert if (defined $unkpos) 1013b39c5158Smillert { 1014b39c5158Smillert push @fields, substr($$textref, $unkpos); 1015b39c5158Smillert $firstpos = $unkpos unless defined $firstpos; 1016b39c5158Smillert $lastpos = length $$textref; 1017b39c5158Smillert } 1018b39c5158Smillert last; 1019b39c5158Smillert } 1020b39c5158Smillert 1021b39c5158Smillert pos $$textref = $lastpos; 1022b39c5158Smillert return @fields if wantarray; 1023b39c5158Smillert 1024b39c5158Smillert $firstpos ||= 0; 1025b39c5158Smillert eval { substr($$textref,$firstpos,$lastpos-$firstpos)=""; 1026b39c5158Smillert pos $$textref = $firstpos }; 1027b39c5158Smillert return $fields[0]; 1028b39c5158Smillert} 1029b39c5158Smillert 1030b39c5158Smillertsub gen_extract_tagged # ($opentag, $closetag, $pre, \%options) 1031b39c5158Smillert{ 1032b39c5158Smillert my $ldel = $_[0]; 1033b39c5158Smillert my $rdel = $_[1]; 1034*e0680481Safresh1 my $pre = defined $_[2] ? qr/\G$_[2]/ : qr/\G\s*/; 1035b39c5158Smillert my %options = defined $_[3] ? %{$_[3]} : (); 1036b39c5158Smillert my $omode = defined $options{fail} ? $options{fail} : ''; 1037b39c5158Smillert my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}}) 1038b39c5158Smillert : defined($options{reject}) ? $options{reject} 1039b39c5158Smillert : '' 1040b39c5158Smillert ; 1041b39c5158Smillert my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}}) 1042b39c5158Smillert : defined($options{ignore}) ? $options{ignore} 1043b39c5158Smillert : '' 1044b39c5158Smillert ; 1045b39c5158Smillert 1046*e0680481Safresh1 $ldel = $et_default_ldel if !defined $ldel; 1047b39c5158Smillert 1048b39c5158Smillert my $posbug = pos; 1049*e0680481Safresh1 for ($ldel, $bad, $ignore) { $_ = qr/$_/ if $_ } 1050b39c5158Smillert pos = $posbug; 1051b39c5158Smillert 1052b39c5158Smillert my $closure = sub 1053b39c5158Smillert { 1054b39c5158Smillert my $textref = defined $_[0] ? \$_[0] : \$_; 1055*e0680481Safresh1 my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore); 1056b39c5158Smillert 1057b39c5158Smillert return _fail(wantarray, $textref) unless @match; 1058b39c5158Smillert return _succeed wantarray, $textref, 1059b39c5158Smillert $match[2], $match[3]+$match[5]+$match[7], # MATCH 1060b39c5158Smillert @match[8..9,0..1,2..7]; # REM, PRE, BITS 1061b39c5158Smillert }; 1062b39c5158Smillert 1063b39c5158Smillert bless $closure, 'Text::Balanced::Extractor'; 1064b39c5158Smillert} 1065b39c5158Smillert 1066b39c5158Smillertpackage Text::Balanced::Extractor; 1067b39c5158Smillert 1068b39c5158Smillertsub extract($$) # ($self, $text) 1069b39c5158Smillert{ 1070b39c5158Smillert &{$_[0]}($_[1]); 1071b39c5158Smillert} 1072b39c5158Smillert 1073b39c5158Smillertpackage Text::Balanced::ErrorMsg; 1074b39c5158Smillert 1075*e0680481Safresh1use overload 1076*e0680481Safresh1 '""' => sub { "$_[0]->{error}, detected at offset $_[0]->{pos}" }, 1077*e0680481Safresh1 fallback => 1; 1078b39c5158Smillert 1079b39c5158Smillert1; 1080b39c5158Smillert 1081b39c5158Smillert__END__ 1082b39c5158Smillert 1083b39c5158Smillert=pod 1084b39c5158Smillert 1085b39c5158Smillert=head1 NAME 1086b39c5158Smillert 1087b39c5158SmillertText::Balanced - Extract delimited text sequences from strings. 1088b39c5158Smillert 1089b39c5158Smillert=head1 SYNOPSIS 1090b39c5158Smillert 1091b39c5158Smillert use Text::Balanced qw ( 1092b39c5158Smillert extract_delimited 1093b39c5158Smillert extract_bracketed 1094b39c5158Smillert extract_quotelike 1095b39c5158Smillert extract_codeblock 1096b39c5158Smillert extract_variable 1097b39c5158Smillert extract_tagged 1098b39c5158Smillert extract_multiple 1099b39c5158Smillert gen_delimited_pat 1100b39c5158Smillert gen_extract_tagged 1101b39c5158Smillert ); 1102b39c5158Smillert 1103b39c5158Smillert # Extract the initial substring of $text that is delimited by 1104b39c5158Smillert # two (unescaped) instances of the first character in $delim. 1105b39c5158Smillert 1106b39c5158Smillert ($extracted, $remainder) = extract_delimited($text,$delim); 1107b39c5158Smillert 1108b39c5158Smillert # Extract the initial substring of $text that is bracketed 1109b39c5158Smillert # with a delimiter(s) specified by $delim (where the string 1110b39c5158Smillert # in $delim contains one or more of '(){}[]<>'). 1111b39c5158Smillert 1112b39c5158Smillert ($extracted, $remainder) = extract_bracketed($text,$delim); 1113b39c5158Smillert 1114b39c5158Smillert # Extract the initial substring of $text that is bounded by 1115b39c5158Smillert # an XML tag. 1116b39c5158Smillert 1117b39c5158Smillert ($extracted, $remainder) = extract_tagged($text); 1118b39c5158Smillert 1119b39c5158Smillert # Extract the initial substring of $text that is bounded by 1120b39c5158Smillert # a C<BEGIN>...C<END> pair. Don't allow nested C<BEGIN> tags 1121b39c5158Smillert 1122b39c5158Smillert ($extracted, $remainder) = 1123b39c5158Smillert extract_tagged($text,"BEGIN","END",undef,{bad=>["BEGIN"]}); 1124b39c5158Smillert 1125b39c5158Smillert # Extract the initial substring of $text that represents a 1126b39c5158Smillert # Perl "quote or quote-like operation" 1127b39c5158Smillert 1128b39c5158Smillert ($extracted, $remainder) = extract_quotelike($text); 1129b39c5158Smillert 1130b39c5158Smillert # Extract the initial substring of $text that represents a block 1131b39c5158Smillert # of Perl code, bracketed by any of character(s) specified by $delim 1132b39c5158Smillert # (where the string $delim contains one or more of '(){}[]<>'). 1133b39c5158Smillert 1134b39c5158Smillert ($extracted, $remainder) = extract_codeblock($text,$delim); 1135b39c5158Smillert 1136b39c5158Smillert # Extract the initial substrings of $text that would be extracted by 1137b39c5158Smillert # one or more sequential applications of the specified functions 1138b39c5158Smillert # or regular expressions 1139b39c5158Smillert 1140b39c5158Smillert @extracted = extract_multiple($text, 1141b39c5158Smillert [ \&extract_bracketed, 1142b39c5158Smillert \&extract_quotelike, 1143b39c5158Smillert \&some_other_extractor_sub, 1144b39c5158Smillert qr/[xyz]*/, 1145b39c5158Smillert 'literal', 1146b39c5158Smillert ]); 1147b39c5158Smillert 1148b39c5158Smillert # Create a string representing an optimized pattern (a la Friedl) 1149b39c5158Smillert # that matches a substring delimited by any of the specified characters 1150b39c5158Smillert # (in this case: any type of quote or a slash) 1151b39c5158Smillert 1152b39c5158Smillert $patstring = gen_delimited_pat(q{'"`/}); 1153b39c5158Smillert 1154b39c5158Smillert # Generate a reference to an anonymous sub that is just like extract_tagged 1155eac174f2Safresh1 # but pre-compiled and optimized for a specific pair of tags, and 1156eac174f2Safresh1 # consequently much faster (i.e. 3 times faster). It uses qr// for better 1157eac174f2Safresh1 # performance on repeated calls. 1158b39c5158Smillert 1159b39c5158Smillert $extract_head = gen_extract_tagged('<HEAD>','</HEAD>'); 1160b39c5158Smillert ($extracted, $remainder) = $extract_head->($text); 1161b39c5158Smillert 1162b39c5158Smillert=head1 DESCRIPTION 1163b39c5158Smillert 1164b39c5158SmillertThe various C<extract_...> subroutines may be used to 1165b39c5158Smillertextract a delimited substring, possibly after skipping a 1166b39c5158Smillertspecified prefix string. By default, that prefix is 1167b39c5158Smillertoptional whitespace (C</\s*/>), but you can change it to whatever 1168b39c5158Smillertyou wish (see below). 1169b39c5158Smillert 1170b39c5158SmillertThe substring to be extracted must appear at the 1171b39c5158Smillertcurrent C<pos> location of the string's variable 1172b39c5158Smillert(or at index zero, if no C<pos> position is defined). 1173b39c5158SmillertIn other words, the C<extract_...> subroutines I<don't> 1174b39c5158Smillertextract the first occurrence of a substring anywhere 1175b39c5158Smillertin a string (like an unanchored regex would). Rather, 1176b39c5158Smillertthey extract an occurrence of the substring appearing 1177b39c5158Smillertimmediately at the current matching position in the 1178b39c5158Smillertstring (like a C<\G>-anchored regex would). 1179b39c5158Smillert 1180eac174f2Safresh1=head2 General Behaviour in List Contexts 1181b39c5158Smillert 1182b39c5158SmillertIn a list context, all the subroutines return a list, the first three 1183b39c5158Smillertelements of which are always: 1184b39c5158Smillert 1185b39c5158Smillert=over 4 1186b39c5158Smillert 1187b39c5158Smillert=item [0] 1188b39c5158Smillert 1189b39c5158SmillertThe extracted string, including the specified delimiters. 1190b39c5158SmillertIf the extraction fails C<undef> is returned. 1191b39c5158Smillert 1192b39c5158Smillert=item [1] 1193b39c5158Smillert 1194b39c5158SmillertThe remainder of the input string (i.e. the characters after the 1195b39c5158Smillertextracted string). On failure, the entire string is returned. 1196b39c5158Smillert 1197b39c5158Smillert=item [2] 1198b39c5158Smillert 1199b39c5158SmillertThe skipped prefix (i.e. the characters before the extracted string). 1200b39c5158SmillertOn failure, C<undef> is returned. 1201b39c5158Smillert 1202b39c5158Smillert=back 1203b39c5158Smillert 1204b39c5158SmillertNote that in a list context, the contents of the original input text (the first 1205b39c5158Smillertargument) are not modified in any way. 1206b39c5158Smillert 1207b39c5158SmillertHowever, if the input text was passed in a variable, that variable's 1208b39c5158SmillertC<pos> value is updated to point at the first character after the 1209b39c5158Smillertextracted text. That means that in a list context the various 1210b39c5158Smillertsubroutines can be used much like regular expressions. For example: 1211b39c5158Smillert 1212b39c5158Smillert while ( $next = (extract_quotelike($text))[0] ) 1213b39c5158Smillert { 1214b39c5158Smillert # process next quote-like (in $next) 1215b39c5158Smillert } 1216b39c5158Smillert 1217eac174f2Safresh1=head2 General Behaviour in Scalar and Void Contexts 1218b39c5158Smillert 1219b39c5158SmillertIn a scalar context, the extracted string is returned, having first been 1220b39c5158Smillertremoved from the input text. Thus, the following code also processes 1221b39c5158Smillerteach quote-like operation, but actually removes them from $text: 1222b39c5158Smillert 1223b39c5158Smillert while ( $next = extract_quotelike($text) ) 1224b39c5158Smillert { 1225b39c5158Smillert # process next quote-like (in $next) 1226b39c5158Smillert } 1227b39c5158Smillert 1228b39c5158SmillertNote that if the input text is a read-only string (i.e. a literal), 1229b39c5158Smillertno attempt is made to remove the extracted text. 1230b39c5158Smillert 1231b39c5158SmillertIn a void context the behaviour of the extraction subroutines is 1232b39c5158Smillertexactly the same as in a scalar context, except (of course) that the 1233b39c5158Smillertextracted substring is not returned. 1234b39c5158Smillert 1235eac174f2Safresh1=head2 A Note About Prefixes 1236b39c5158Smillert 1237b39c5158SmillertPrefix patterns are matched without any trailing modifiers (C</gimsox> etc.) 1238b39c5158SmillertThis can bite you if you're expecting a prefix specification like 1239b39c5158Smillert'.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix 1240b39c5158Smillertpattern will only succeed if the <H1> tag is on the current line, since 1241b39c5158Smillert. normally doesn't match newlines. 1242b39c5158Smillert 1243b39c5158SmillertTo overcome this limitation, you need to turn on /s matching within 1244b39c5158Smillertthe prefix pattern, using the C<(?s)> directive: '(?s).*?(?=<H1>)' 1245b39c5158Smillert 1246eac174f2Safresh1=head2 Functions 1247eac174f2Safresh1 1248eac174f2Safresh1=over 4 1249eac174f2Safresh1 1250eac174f2Safresh1=item C<extract_delimited> 1251b39c5158Smillert 1252b39c5158SmillertThe C<extract_delimited> function formalizes the common idiom 1253b39c5158Smillertof extracting a single-character-delimited substring from the start of 1254b39c5158Smillerta string. For example, to extract a single-quote delimited string, the 1255b39c5158Smillertfollowing code is typically used: 1256b39c5158Smillert 1257b39c5158Smillert ($remainder = $text) =~ s/\A('(\\.|[^'])*')//s; 1258b39c5158Smillert $extracted = $1; 1259b39c5158Smillert 1260b39c5158Smillertbut with C<extract_delimited> it can be simplified to: 1261b39c5158Smillert 1262b39c5158Smillert ($extracted,$remainder) = extract_delimited($text, "'"); 1263b39c5158Smillert 1264b39c5158SmillertC<extract_delimited> takes up to four scalars (the input text, the 1265b39c5158Smillertdelimiters, a prefix pattern to be skipped, and any escape characters) 1266b39c5158Smillertand extracts the initial substring of the text that 1267b39c5158Smillertis appropriately delimited. If the delimiter string has multiple 1268b39c5158Smillertcharacters, the first one encountered in the text is taken to delimit 1269b39c5158Smillertthe substring. 1270b39c5158SmillertThe third argument specifies a prefix pattern that is to be skipped 1271b39c5158Smillert(but must be present!) before the substring is extracted. 1272b39c5158SmillertThe final argument specifies the escape character to be used for each 1273b39c5158Smillertdelimiter. 1274b39c5158Smillert 1275b39c5158SmillertAll arguments are optional. If the escape characters are not specified, 1276b39c5158Smillertevery delimiter is escaped with a backslash (C<\>). 1277b39c5158SmillertIf the prefix is not specified, the 1278b39c5158Smillertpattern C<'\s*'> - optional whitespace - is used. If the delimiter set 1279b39c5158Smillertis also not specified, the set C</["'`]/> is used. If the text to be processed 1280b39c5158Smillertis not specified either, C<$_> is used. 1281b39c5158Smillert 1282b39c5158SmillertIn list context, C<extract_delimited> returns a array of three 1283b39c5158Smillertelements, the extracted substring (I<including the surrounding 1284b39c5158Smillertdelimiters>), the remainder of the text, and the skipped prefix (if 1285b39c5158Smillertany). If a suitable delimited substring is not found, the first 1286b39c5158Smillertelement of the array is the empty string, the second is the complete 1287b39c5158Smillertoriginal text, and the prefix returned in the third element is an 1288b39c5158Smillertempty string. 1289b39c5158Smillert 1290b39c5158SmillertIn a scalar context, just the extracted substring is returned. In 1291b39c5158Smillerta void context, the extracted substring (and any prefix) are simply 1292b39c5158Smillertremoved from the beginning of the first argument. 1293b39c5158Smillert 1294b39c5158SmillertExamples: 1295b39c5158Smillert 1296b39c5158Smillert # Remove a single-quoted substring from the very beginning of $text: 1297b39c5158Smillert 1298b39c5158Smillert $substring = extract_delimited($text, "'", ''); 1299b39c5158Smillert 1300b39c5158Smillert # Remove a single-quoted Pascalish substring (i.e. one in which 1301b39c5158Smillert # doubling the quote character escapes it) from the very 1302b39c5158Smillert # beginning of $text: 1303b39c5158Smillert 1304b39c5158Smillert $substring = extract_delimited($text, "'", '', "'"); 1305b39c5158Smillert 1306b39c5158Smillert # Extract a single- or double- quoted substring from the 1307b39c5158Smillert # beginning of $text, optionally after some whitespace 1308b39c5158Smillert # (note the list context to protect $text from modification): 1309b39c5158Smillert 1310b39c5158Smillert ($substring) = extract_delimited $text, q{"'}; 1311b39c5158Smillert 1312b39c5158Smillert # Delete the substring delimited by the first '/' in $text: 1313b39c5158Smillert 1314b39c5158Smillert $text = join '', (extract_delimited($text,'/','[^/]*')[2,1]; 1315b39c5158Smillert 1316b39c5158SmillertNote that this last example is I<not> the same as deleting the first 1317b39c5158Smillertquote-like pattern. For instance, if C<$text> contained the string: 1318b39c5158Smillert 1319b39c5158Smillert "if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }" 1320b39c5158Smillert 1321b39c5158Smillertthen after the deletion it would contain: 1322b39c5158Smillert 1323b39c5158Smillert "if ('.$UNIXCMD/s) { $cmd = $1; }" 1324b39c5158Smillert 1325b39c5158Smillertnot: 1326b39c5158Smillert 1327b39c5158Smillert "if ('./cmd' =~ ms) { $cmd = $1; }" 1328b39c5158Smillert 1329b39c5158SmillertSee L<"extract_quotelike"> for a (partial) solution to this problem. 1330b39c5158Smillert 1331eac174f2Safresh1=item C<extract_bracketed> 1332b39c5158Smillert 1333b39c5158SmillertLike C<"extract_delimited">, the C<extract_bracketed> function takes 1334b39c5158Smillertup to three optional scalar arguments: a string to extract from, a delimiter 1335b39c5158Smillertspecifier, and a prefix pattern. As before, a missing prefix defaults to 1336b39c5158Smillertoptional whitespace and a missing text defaults to C<$_>. However, a missing 1337b39c5158Smillertdelimiter specifier defaults to C<'{}()[]E<lt>E<gt>'> (see below). 1338b39c5158Smillert 1339b39c5158SmillertC<extract_bracketed> extracts a balanced-bracket-delimited 1340b39c5158Smillertsubstring (using any one (or more) of the user-specified delimiter 1341b39c5158Smillertbrackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also 1342b39c5158Smillertrespect quoted unbalanced brackets (see below). 1343b39c5158Smillert 1344b39c5158SmillertA "delimiter bracket" is a bracket in list of delimiters passed as 1345b39c5158SmillertC<extract_bracketed>'s second argument. Delimiter brackets are 1346b39c5158Smillertspecified by giving either the left or right (or both!) versions 1347b39c5158Smillertof the required bracket(s). Note that the order in which 1348b39c5158Smillerttwo or more delimiter brackets are specified is not significant. 1349b39c5158Smillert 1350b39c5158SmillertA "balanced-bracket-delimited substring" is a substring bounded by 1351b39c5158Smillertmatched brackets, such that any other (left or right) delimiter 1352b39c5158Smillertbracket I<within> the substring is also matched by an opposite 1353b39c5158Smillert(right or left) delimiter bracket I<at the same level of nesting>. Any 1354b39c5158Smillerttype of bracket not in the delimiter list is treated as an ordinary 1355b39c5158Smillertcharacter. 1356b39c5158Smillert 1357b39c5158SmillertIn other words, each type of bracket specified as a delimiter must be 1358b39c5158Smillertbalanced and correctly nested within the substring, and any other kind of 1359b39c5158Smillert("non-delimiter") bracket in the substring is ignored. 1360b39c5158Smillert 1361b39c5158SmillertFor example, given the string: 1362b39c5158Smillert 1363b39c5158Smillert $text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }"; 1364b39c5158Smillert 1365b39c5158Smillertthen a call to C<extract_bracketed> in a list context: 1366b39c5158Smillert 1367b39c5158Smillert @result = extract_bracketed( $text, '{}' ); 1368b39c5158Smillert 1369b39c5158Smillertwould return: 1370b39c5158Smillert 1371b39c5158Smillert ( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" ) 1372b39c5158Smillert 1373b39c5158Smillertsince both sets of C<'{..}'> brackets are properly nested and evenly balanced. 1374b39c5158Smillert(In a scalar context just the first element of the array would be returned. In 1375b39c5158Smillerta void context, C<$text> would be replaced by an empty string.) 1376b39c5158Smillert 1377b39c5158SmillertLikewise the call in: 1378b39c5158Smillert 1379b39c5158Smillert @result = extract_bracketed( $text, '{[' ); 1380b39c5158Smillert 1381b39c5158Smillertwould return the same result, since all sets of both types of specified 1382b39c5158Smillertdelimiter brackets are correctly nested and balanced. 1383b39c5158Smillert 1384b39c5158SmillertHowever, the call in: 1385b39c5158Smillert 1386b39c5158Smillert @result = extract_bracketed( $text, '{([<' ); 1387b39c5158Smillert 1388b39c5158Smillertwould fail, returning: 1389b39c5158Smillert 1390b39c5158Smillert ( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }" ); 1391b39c5158Smillert 1392b39c5158Smillertbecause the embedded pairs of C<'(..)'>s and C<'[..]'>s are "cross-nested" and 1393b39c5158Smillertthe embedded C<'E<gt>'> is unbalanced. (In a scalar context, this call would 1394b39c5158Smillertreturn an empty string. In a void context, C<$text> would be unchanged.) 1395b39c5158Smillert 1396b39c5158SmillertNote that the embedded single-quotes in the string don't help in this 1397b39c5158Smillertcase, since they have not been specified as acceptable delimiters and are 1398b39c5158Smillerttherefore treated as non-delimiter characters (and ignored). 1399b39c5158Smillert 1400b39c5158SmillertHowever, if a particular species of quote character is included in the 1401b39c5158Smillertdelimiter specification, then that type of quote will be correctly handled. 1402b39c5158Smillertfor example, if C<$text> is: 1403b39c5158Smillert 1404b39c5158Smillert $text = '<A HREF=">>>>">link</A>'; 1405b39c5158Smillert 1406b39c5158Smillertthen 1407b39c5158Smillert 1408b39c5158Smillert @result = extract_bracketed( $text, '<">' ); 1409b39c5158Smillert 1410b39c5158Smillertreturns: 1411b39c5158Smillert 1412b39c5158Smillert ( '<A HREF=">>>>">', 'link</A>', "" ) 1413b39c5158Smillert 1414b39c5158Smillertas expected. Without the specification of C<"> as an embedded quoter: 1415b39c5158Smillert 1416b39c5158Smillert @result = extract_bracketed( $text, '<>' ); 1417b39c5158Smillert 1418b39c5158Smillertthe result would be: 1419b39c5158Smillert 1420b39c5158Smillert ( '<A HREF=">', '>>>">link</A>', "" ) 1421b39c5158Smillert 1422b39c5158SmillertIn addition to the quote delimiters C<'>, C<">, and C<`>, full Perl quote-like 1423b39c5158Smillertquoting (i.e. q{string}, qq{string}, etc) can be specified by including the 1424b39c5158Smillertletter 'q' as a delimiter. Hence: 1425b39c5158Smillert 1426b39c5158Smillert @result = extract_bracketed( $text, '<q>' ); 1427b39c5158Smillert 1428b39c5158Smillertwould correctly match something like this: 1429b39c5158Smillert 1430b39c5158Smillert $text = '<leftop: conj /and/ conj>'; 1431b39c5158Smillert 1432b39c5158SmillertSee also: C<"extract_quotelike"> and C<"extract_codeblock">. 1433b39c5158Smillert 1434eac174f2Safresh1=item C<extract_variable> 1435b39c5158Smillert 1436b39c5158SmillertC<extract_variable> extracts any valid Perl variable or 1437b39c5158Smillertvariable-involved expression, including scalars, arrays, hashes, array 1438b39c5158Smillertaccesses, hash look-ups, method calls through objects, subroutine calls 1439b39c5158Smillertthrough subroutine references, etc. 1440b39c5158Smillert 1441b39c5158SmillertThe subroutine takes up to two optional arguments: 1442b39c5158Smillert 1443b39c5158Smillert=over 4 1444b39c5158Smillert 1445b39c5158Smillert=item 1. 1446b39c5158Smillert 1447b39c5158SmillertA string to be processed (C<$_> if the string is omitted or C<undef>) 1448b39c5158Smillert 1449b39c5158Smillert=item 2. 1450b39c5158Smillert 1451b39c5158SmillertA string specifying a pattern to be matched as a prefix (which is to be 1452b39c5158Smillertskipped). If omitted, optional whitespace is skipped. 1453b39c5158Smillert 1454b39c5158Smillert=back 1455b39c5158Smillert 1456b39c5158SmillertOn success in a list context, an array of 3 elements is returned. The 1457b39c5158Smillertelements are: 1458b39c5158Smillert 1459b39c5158Smillert=over 4 1460b39c5158Smillert 1461b39c5158Smillert=item [0] 1462b39c5158Smillert 1463b39c5158Smillertthe extracted variable, or variablish expression 1464b39c5158Smillert 1465b39c5158Smillert=item [1] 1466b39c5158Smillert 1467b39c5158Smillertthe remainder of the input text, 1468b39c5158Smillert 1469b39c5158Smillert=item [2] 1470b39c5158Smillert 1471b39c5158Smillertthe prefix substring (if any), 1472b39c5158Smillert 1473b39c5158Smillert=back 1474b39c5158Smillert 1475b39c5158SmillertOn failure, all of these values (except the remaining text) are C<undef>. 1476b39c5158Smillert 1477b39c5158SmillertIn a scalar context, C<extract_variable> returns just the complete 1478b39c5158Smillertsubstring that matched a variablish expression. C<undef> is returned on 1479b39c5158Smillertfailure. In addition, the original input text has the returned substring 1480b39c5158Smillert(and any prefix) removed from it. 1481b39c5158Smillert 1482b39c5158SmillertIn a void context, the input text just has the matched substring (and 1483b39c5158Smillertany specified prefix) removed. 1484b39c5158Smillert 1485eac174f2Safresh1=item C<extract_tagged> 1486b39c5158Smillert 1487b39c5158SmillertC<extract_tagged> extracts and segments text between (balanced) 1488b39c5158Smillertspecified tags. 1489b39c5158Smillert 1490b39c5158SmillertThe subroutine takes up to five optional arguments: 1491b39c5158Smillert 1492b39c5158Smillert=over 4 1493b39c5158Smillert 1494b39c5158Smillert=item 1. 1495b39c5158Smillert 1496b39c5158SmillertA string to be processed (C<$_> if the string is omitted or C<undef>) 1497b39c5158Smillert 1498b39c5158Smillert=item 2. 1499b39c5158Smillert 1500*e0680481Safresh1A string specifying a pattern (i.e. regex) to be matched as the opening tag. 1501b39c5158SmillertIf the pattern string is omitted (or C<undef>) then a pattern 1502b39c5158Smillertthat matches any standard XML tag is used. 1503b39c5158Smillert 1504b39c5158Smillert=item 3. 1505b39c5158Smillert 1506b39c5158SmillertA string specifying a pattern to be matched at the closing tag. 1507b39c5158SmillertIf the pattern string is omitted (or C<undef>) then the closing 1508b39c5158Smillerttag is constructed by inserting a C</> after any leading bracket 1509b39c5158Smillertcharacters in the actual opening tag that was matched (I<not> the pattern 1510b39c5158Smillertthat matched the tag). For example, if the opening tag pattern 1511b39c5158Smillertis specified as C<'{{\w+}}'> and actually matched the opening tag 1512b39c5158SmillertC<"{{DATA}}">, then the constructed closing tag would be C<"{{/DATA}}">. 1513b39c5158Smillert 1514b39c5158Smillert=item 4. 1515b39c5158Smillert 1516b39c5158SmillertA string specifying a pattern to be matched as a prefix (which is to be 1517b39c5158Smillertskipped). If omitted, optional whitespace is skipped. 1518b39c5158Smillert 1519b39c5158Smillert=item 5. 1520b39c5158Smillert 1521b39c5158SmillertA hash reference containing various parsing options (see below) 1522b39c5158Smillert 1523b39c5158Smillert=back 1524b39c5158Smillert 1525b39c5158SmillertThe various options that can be specified are: 1526b39c5158Smillert 1527b39c5158Smillert=over 4 1528b39c5158Smillert 1529b39c5158Smillert=item C<reject =E<gt> $listref> 1530b39c5158Smillert 1531b39c5158SmillertThe list reference contains one or more strings specifying patterns 1532b39c5158Smillertthat must I<not> appear within the tagged text. 1533b39c5158Smillert 1534b39c5158SmillertFor example, to extract 1535b39c5158Smillertan HTML link (which should not contain nested links) use: 1536b39c5158Smillert 1537b39c5158Smillert extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} ); 1538b39c5158Smillert 1539b39c5158Smillert=item C<ignore =E<gt> $listref> 1540b39c5158Smillert 1541b39c5158SmillertThe list reference contains one or more strings specifying patterns 1542eac174f2Safresh1that are I<not> to be treated as nested tags within the tagged text 1543b39c5158Smillert(even if they would match the start tag pattern). 1544b39c5158Smillert 1545b39c5158SmillertFor example, to extract an arbitrary XML tag, but ignore "empty" elements: 1546b39c5158Smillert 1547b39c5158Smillert extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} ); 1548b39c5158Smillert 1549b39c5158Smillert(also see L<"gen_delimited_pat"> below). 1550b39c5158Smillert 1551b39c5158Smillert=item C<fail =E<gt> $str> 1552b39c5158Smillert 1553b39c5158SmillertThe C<fail> option indicates the action to be taken if a matching end 1554b39c5158Smillerttag is not encountered (i.e. before the end of the string or some 1555b39c5158SmillertC<reject> pattern matches). By default, a failure to match a closing 1556b39c5158Smillerttag causes C<extract_tagged> to immediately fail. 1557b39c5158Smillert 1558b39c5158SmillertHowever, if the string value associated with <reject> is "MAX", then 1559b39c5158SmillertC<extract_tagged> returns the complete text up to the point of failure. 1560b39c5158SmillertIf the string is "PARA", C<extract_tagged> returns only the first paragraph 1561b39c5158Smillertafter the tag (up to the first line that is either empty or contains 1562b39c5158Smillertonly whitespace characters). 15639dc91179Safresh1If the string is "", the default behaviour (i.e. failure) is reinstated. 1564b39c5158Smillert 1565b39c5158SmillertFor example, suppose the start tag "/para" introduces a paragraph, which then 1566b39c5158Smillertcontinues until the next "/endpara" tag or until another "/para" tag is 1567b39c5158Smillertencountered: 1568b39c5158Smillert 1569b39c5158Smillert $text = "/para line 1\n\nline 3\n/para line 4"; 1570b39c5158Smillert 1571b39c5158Smillert extract_tagged($text, '/para', '/endpara', undef, 1572b39c5158Smillert {reject => '/para', fail => MAX ); 1573b39c5158Smillert 1574b39c5158Smillert # EXTRACTED: "/para line 1\n\nline 3\n" 1575b39c5158Smillert 1576b39c5158SmillertSuppose instead, that if no matching "/endpara" tag is found, the "/para" 1577b39c5158Smillerttag refers only to the immediately following paragraph: 1578b39c5158Smillert 1579b39c5158Smillert $text = "/para line 1\n\nline 3\n/para line 4"; 1580b39c5158Smillert 1581b39c5158Smillert extract_tagged($text, '/para', '/endpara', undef, 1582b39c5158Smillert {reject => '/para', fail => MAX ); 1583b39c5158Smillert 1584b39c5158Smillert # EXTRACTED: "/para line 1\n" 1585b39c5158Smillert 1586b39c5158SmillertNote that the specified C<fail> behaviour applies to nested tags as well. 1587b39c5158Smillert 1588b39c5158Smillert=back 1589b39c5158Smillert 1590b39c5158SmillertOn success in a list context, an array of 6 elements is returned. The elements are: 1591b39c5158Smillert 1592b39c5158Smillert=over 4 1593b39c5158Smillert 1594b39c5158Smillert=item [0] 1595b39c5158Smillert 1596b39c5158Smillertthe extracted tagged substring (including the outermost tags), 1597b39c5158Smillert 1598b39c5158Smillert=item [1] 1599b39c5158Smillert 1600b39c5158Smillertthe remainder of the input text, 1601b39c5158Smillert 1602b39c5158Smillert=item [2] 1603b39c5158Smillert 1604b39c5158Smillertthe prefix substring (if any), 1605b39c5158Smillert 1606b39c5158Smillert=item [3] 1607b39c5158Smillert 1608b39c5158Smillertthe opening tag 1609b39c5158Smillert 1610b39c5158Smillert=item [4] 1611b39c5158Smillert 1612b39c5158Smillertthe text between the opening and closing tags 1613b39c5158Smillert 1614b39c5158Smillert=item [5] 1615b39c5158Smillert 1616b39c5158Smillertthe closing tag (or "" if no closing tag was found) 1617b39c5158Smillert 1618b39c5158Smillert=back 1619b39c5158Smillert 1620b39c5158SmillertOn failure, all of these values (except the remaining text) are C<undef>. 1621b39c5158Smillert 1622b39c5158SmillertIn a scalar context, C<extract_tagged> returns just the complete 1623b39c5158Smillertsubstring that matched a tagged text (including the start and end 1624b39c5158Smillerttags). C<undef> is returned on failure. In addition, the original input 1625b39c5158Smillerttext has the returned substring (and any prefix) removed from it. 1626b39c5158Smillert 1627b39c5158SmillertIn a void context, the input text just has the matched substring (and 1628b39c5158Smillertany specified prefix) removed. 1629b39c5158Smillert 1630eac174f2Safresh1=item C<gen_extract_tagged> 1631b39c5158Smillert 1632b39c5158SmillertC<gen_extract_tagged> generates a new anonymous subroutine which 1633b39c5158Smillertextracts text between (balanced) specified tags. In other words, 1634b39c5158Smillertit generates a function identical in function to C<extract_tagged>. 1635b39c5158Smillert 1636b39c5158SmillertThe difference between C<extract_tagged> and the anonymous 1637b39c5158Smillertsubroutines generated by 1638b39c5158SmillertC<gen_extract_tagged>, is that those generated subroutines: 1639b39c5158Smillert 1640b39c5158Smillert=over 4 1641b39c5158Smillert 1642b39c5158Smillert=item * 1643b39c5158Smillert 1644b39c5158Smillertdo not have to reparse tag specification or parsing options every time 1645b39c5158Smillertthey are called (whereas C<extract_tagged> has to effectively rebuild 1646b39c5158Smillertits tag parser on every call); 1647b39c5158Smillert 1648b39c5158Smillert=item * 1649b39c5158Smillert 1650b39c5158Smillertmake use of the new qr// construct to pre-compile the regexes they use 1651b39c5158Smillert(whereas C<extract_tagged> uses standard string variable interpolation 1652b39c5158Smillertto create tag-matching patterns). 1653b39c5158Smillert 1654b39c5158Smillert=back 1655b39c5158Smillert 1656b39c5158SmillertThe subroutine takes up to four optional arguments (the same set as 1657b39c5158SmillertC<extract_tagged> except for the string to be processed). It returns 1658b39c5158Smillerta reference to a subroutine which in turn takes a single argument (the text to 1659b39c5158Smillertbe extracted from). 1660b39c5158Smillert 1661b39c5158SmillertIn other words, the implementation of C<extract_tagged> is exactly 1662b39c5158Smillertequivalent to: 1663b39c5158Smillert 1664b39c5158Smillert sub extract_tagged 1665b39c5158Smillert { 1666b39c5158Smillert my $text = shift; 1667b39c5158Smillert $extractor = gen_extract_tagged(@_); 1668b39c5158Smillert return $extractor->($text); 1669b39c5158Smillert } 1670b39c5158Smillert 1671eac174f2Safresh1(although C<extract_tagged> is not currently implemented that way). 1672b39c5158Smillert 1673b39c5158SmillertUsing C<gen_extract_tagged> to create extraction functions for specific tags 1674b39c5158Smillertis a good idea if those functions are going to be called more than once, since 1675b39c5158Smillerttheir performance is typically twice as good as the more general-purpose 1676b39c5158SmillertC<extract_tagged>. 1677b39c5158Smillert 1678eac174f2Safresh1=item C<extract_quotelike> 1679b39c5158Smillert 1680b39c5158SmillertC<extract_quotelike> attempts to recognize, extract, and segment any 1681b39c5158Smillertone of the various Perl quotes and quotelike operators (see 1682b39c5158SmillertL<perlop(3)>) Nested backslashed delimiters, embedded balanced bracket 1683b39c5158Smillertdelimiters (for the quotelike operators), and trailing modifiers are 1684b39c5158Smillertall caught. For example, in: 1685b39c5158Smillert 1686b39c5158Smillert extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #' 1687b39c5158Smillert 1688b39c5158Smillert extract_quotelike ' "You said, \"Use sed\"." ' 1689b39c5158Smillert 1690b39c5158Smillert extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; ' 1691b39c5158Smillert 1692b39c5158Smillert extract_quotelike ' tr/\\\/\\\\/\\\//ds; ' 1693b39c5158Smillert 1694b39c5158Smillertthe full Perl quotelike operations are all extracted correctly. 1695b39c5158Smillert 1696b39c5158SmillertNote too that, when using the /x modifier on a regex, any comment 1697b39c5158Smillertcontaining the current pattern delimiter will cause the regex to be 1698b39c5158Smillertimmediately terminated. In other words: 1699b39c5158Smillert 1700b39c5158Smillert 'm / 1701b39c5158Smillert (?i) # CASE INSENSITIVE 1702b39c5158Smillert [a-z_] # LEADING ALPHABETIC/UNDERSCORE 1703b39c5158Smillert [a-z0-9]* # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS 1704b39c5158Smillert /x' 1705b39c5158Smillert 1706b39c5158Smillertwill be extracted as if it were: 1707b39c5158Smillert 1708b39c5158Smillert 'm / 1709b39c5158Smillert (?i) # CASE INSENSITIVE 1710b39c5158Smillert [a-z_] # LEADING ALPHABETIC/' 1711b39c5158Smillert 1712b39c5158SmillertThis behaviour is identical to that of the actual compiler. 1713b39c5158Smillert 1714b39c5158SmillertC<extract_quotelike> takes two arguments: the text to be processed and 1715b39c5158Smillerta prefix to be matched at the very beginning of the text. If no prefix 1716b39c5158Smillertis specified, optional whitespace is the default. If no text is given, 1717b39c5158SmillertC<$_> is used. 1718b39c5158Smillert 1719b39c5158SmillertIn a list context, an array of 11 elements is returned. The elements are: 1720b39c5158Smillert 1721b39c5158Smillert=over 4 1722b39c5158Smillert 1723b39c5158Smillert=item [0] 1724b39c5158Smillert 1725b39c5158Smillertthe extracted quotelike substring (including trailing modifiers), 1726b39c5158Smillert 1727b39c5158Smillert=item [1] 1728b39c5158Smillert 1729b39c5158Smillertthe remainder of the input text, 1730b39c5158Smillert 1731b39c5158Smillert=item [2] 1732b39c5158Smillert 1733b39c5158Smillertthe prefix substring (if any), 1734b39c5158Smillert 1735b39c5158Smillert=item [3] 1736b39c5158Smillert 1737b39c5158Smillertthe name of the quotelike operator (if any), 1738b39c5158Smillert 1739b39c5158Smillert=item [4] 1740b39c5158Smillert 1741b39c5158Smillertthe left delimiter of the first block of the operation, 1742b39c5158Smillert 1743b39c5158Smillert=item [5] 1744b39c5158Smillert 1745b39c5158Smillertthe text of the first block of the operation 1746b39c5158Smillert(that is, the contents of 1747b39c5158Smillerta quote, the regex of a match or substitution or the target list of a 1748b39c5158Smillerttranslation), 1749b39c5158Smillert 1750b39c5158Smillert=item [6] 1751b39c5158Smillert 1752b39c5158Smillertthe right delimiter of the first block of the operation, 1753b39c5158Smillert 1754b39c5158Smillert=item [7] 1755b39c5158Smillert 1756b39c5158Smillertthe left delimiter of the second block of the operation 1757b39c5158Smillert(that is, if it is a C<s>, C<tr>, or C<y>), 1758b39c5158Smillert 1759b39c5158Smillert=item [8] 1760b39c5158Smillert 1761b39c5158Smillertthe text of the second block of the operation 1762b39c5158Smillert(that is, the replacement of a substitution or the translation list 1763b39c5158Smillertof a translation), 1764b39c5158Smillert 1765b39c5158Smillert=item [9] 1766b39c5158Smillert 1767b39c5158Smillertthe right delimiter of the second block of the operation (if any), 1768b39c5158Smillert 1769b39c5158Smillert=item [10] 1770b39c5158Smillert 1771b39c5158Smillertthe trailing modifiers on the operation (if any). 1772b39c5158Smillert 1773b39c5158Smillert=back 1774b39c5158Smillert 1775b39c5158SmillertFor each of the fields marked "(if any)" the default value on success is 1776b39c5158Smillertan empty string. 1777b39c5158SmillertOn failure, all of these values (except the remaining text) are C<undef>. 1778b39c5158Smillert 1779b39c5158SmillertIn a scalar context, C<extract_quotelike> returns just the complete substring 1780b39c5158Smillertthat matched a quotelike operation (or C<undef> on failure). In a scalar or 1781b39c5158Smillertvoid context, the input text has the same substring (and any specified 1782b39c5158Smillertprefix) removed. 1783b39c5158Smillert 1784b39c5158SmillertExamples: 1785b39c5158Smillert 1786b39c5158Smillert # Remove the first quotelike literal that appears in text 1787b39c5158Smillert 1788b39c5158Smillert $quotelike = extract_quotelike($text,'.*?'); 1789b39c5158Smillert 1790b39c5158Smillert # Replace one or more leading whitespace-separated quotelike 1791b39c5158Smillert # literals in $_ with "<QLL>" 1792b39c5158Smillert 1793b39c5158Smillert do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@; 1794b39c5158Smillert 1795b39c5158Smillert 1796b39c5158Smillert # Isolate the search pattern in a quotelike operation from $text 1797b39c5158Smillert 1798b39c5158Smillert ($op,$pat) = (extract_quotelike $text)[3,5]; 1799b39c5158Smillert if ($op =~ /[ms]/) 1800b39c5158Smillert { 1801b39c5158Smillert print "search pattern: $pat\n"; 1802b39c5158Smillert } 1803b39c5158Smillert else 1804b39c5158Smillert { 1805b39c5158Smillert print "$op is not a pattern matching operation\n"; 1806b39c5158Smillert } 1807b39c5158Smillert 1808eac174f2Safresh1=item C<extract_quotelike> 1809b39c5158Smillert 1810b39c5158SmillertC<extract_quotelike> can successfully extract "here documents" from an input 1811b39c5158Smillertstring, but with an important caveat in list contexts. 1812b39c5158Smillert 1813b39c5158SmillertUnlike other types of quote-like literals, a here document is rarely 1814b39c5158Smillerta contiguous substring. For example, a typical piece of code using 1815b39c5158Smillerthere document might look like this: 1816b39c5158Smillert 1817b39c5158Smillert <<'EOMSG' || die; 1818b39c5158Smillert This is the message. 1819b39c5158Smillert EOMSG 1820b39c5158Smillert exit; 1821b39c5158Smillert 1822b39c5158SmillertGiven this as an input string in a scalar context, C<extract_quotelike> 1823b39c5158Smillertwould correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG", 1824b39c5158Smillertleaving the string " || die;\nexit;" in the original variable. In other words, 1825b39c5158Smillertthe two separate pieces of the here document are successfully extracted and 1826b39c5158Smillertconcatenated. 1827b39c5158Smillert 1828b39c5158SmillertIn a list context, C<extract_quotelike> would return the list 1829b39c5158Smillert 1830b39c5158Smillert=over 4 1831b39c5158Smillert 1832b39c5158Smillert=item [0] 1833b39c5158Smillert 1834b39c5158Smillert"<<'EOMSG'\nThis is the message.\nEOMSG\n" (i.e. the full extracted here document, 1835b39c5158Smillertincluding fore and aft delimiters), 1836b39c5158Smillert 1837b39c5158Smillert=item [1] 1838b39c5158Smillert 1839b39c5158Smillert" || die;\nexit;" (i.e. the remainder of the input text, concatenated), 1840b39c5158Smillert 1841b39c5158Smillert=item [2] 1842b39c5158Smillert 1843b39c5158Smillert"" (i.e. the prefix substring -- trivial in this case), 1844b39c5158Smillert 1845b39c5158Smillert=item [3] 1846b39c5158Smillert 1847b39c5158Smillert"<<" (i.e. the "name" of the quotelike operator) 1848b39c5158Smillert 1849b39c5158Smillert=item [4] 1850b39c5158Smillert 1851b39c5158Smillert"'EOMSG'" (i.e. the left delimiter of the here document, including any quotes), 1852b39c5158Smillert 1853b39c5158Smillert=item [5] 1854b39c5158Smillert 1855b39c5158Smillert"This is the message.\n" (i.e. the text of the here document), 1856b39c5158Smillert 1857b39c5158Smillert=item [6] 1858b39c5158Smillert 1859b39c5158Smillert"EOMSG" (i.e. the right delimiter of the here document), 1860b39c5158Smillert 1861b39c5158Smillert=item [7..10] 1862b39c5158Smillert 1863b39c5158Smillert"" (a here document has no second left delimiter, second text, second right 1864b39c5158Smillertdelimiter, or trailing modifiers). 1865b39c5158Smillert 1866b39c5158Smillert=back 1867b39c5158Smillert 1868b39c5158SmillertHowever, the matching position of the input variable would be set to 1869b39c5158Smillert"exit;" (i.e. I<after> the closing delimiter of the here document), 1870b39c5158Smillertwhich would cause the earlier " || die;\nexit;" to be skipped in any 1871b39c5158Smillertsequence of code fragment extractions. 1872b39c5158Smillert 1873b39c5158SmillertTo avoid this problem, when it encounters a here document whilst 1874b39c5158Smillertextracting from a modifiable string, C<extract_quotelike> silently 1875b39c5158Smillertrearranges the string to an equivalent piece of Perl: 1876b39c5158Smillert 1877b39c5158Smillert <<'EOMSG' 1878b39c5158Smillert This is the message. 1879b39c5158Smillert EOMSG 1880b39c5158Smillert || die; 1881b39c5158Smillert exit; 1882b39c5158Smillert 1883b39c5158Smillertin which the here document I<is> contiguous. It still leaves the 1884b39c5158Smillertmatching position after the here document, but now the rest of the line 1885b39c5158Smillerton which the here document starts is not skipped. 1886b39c5158Smillert 1887b39c5158SmillertTo prevent <extract_quotelike> from mucking about with the input in this way 1888b39c5158Smillert(this is the only case where a list-context C<extract_quotelike> does so), 1889b39c5158Smillertyou can pass the input variable as an interpolated literal: 1890b39c5158Smillert 1891b39c5158Smillert $quotelike = extract_quotelike("$var"); 1892b39c5158Smillert 1893eac174f2Safresh1=item C<extract_codeblock> 1894b39c5158Smillert 1895b39c5158SmillertC<extract_codeblock> attempts to recognize and extract a balanced 1896b39c5158Smillertbracket delimited substring that may contain unbalanced brackets 1897b39c5158Smillertinside Perl quotes or quotelike operations. That is, C<extract_codeblock> 1898b39c5158Smillertis like a combination of C<"extract_bracketed"> and 1899b39c5158SmillertC<"extract_quotelike">. 1900b39c5158Smillert 1901b39c5158SmillertC<extract_codeblock> takes the same initial three parameters as C<extract_bracketed>: 1902b39c5158Smillerta text to process, a set of delimiter brackets to look for, and a prefix to 1903b39c5158Smillertmatch first. It also takes an optional fourth parameter, which allows the 1904*e0680481Safresh1outermost delimiter brackets to be specified separately (see below), 1905*e0680481Safresh1and a fifth parameter used only by L<Parse::RecDescent>. 1906b39c5158Smillert 1907b39c5158SmillertOmitting the first argument (input text) means process C<$_> instead. 1908b39c5158SmillertOmitting the second argument (delimiter brackets) indicates that only C<'{'> is to be used. 1909b39c5158SmillertOmitting the third argument (prefix argument) implies optional whitespace at the start. 1910b39c5158SmillertOmitting the fourth argument (outermost delimiter brackets) indicates that the 1911b39c5158Smillertvalue of the second argument is to be used for the outermost delimiters. 1912b39c5158Smillert 1913b39c5158SmillertOnce the prefix and the outermost opening delimiter bracket have been 1914b39c5158Smillertrecognized, code blocks are extracted by stepping through the input text and 1915b39c5158Smillerttrying the following alternatives in sequence: 1916b39c5158Smillert 1917b39c5158Smillert=over 4 1918b39c5158Smillert 1919b39c5158Smillert=item 1. 1920b39c5158Smillert 1921b39c5158SmillertTry and match a closing delimiter bracket. If the bracket was the same 1922b39c5158Smillertspecies as the last opening bracket, return the substring to that 1923b39c5158Smillertpoint. If the bracket was mismatched, return an error. 1924b39c5158Smillert 1925b39c5158Smillert=item 2. 1926b39c5158Smillert 1927b39c5158SmillertTry to match a quote or quotelike operator. If found, call 1928b39c5158SmillertC<extract_quotelike> to eat it. If C<extract_quotelike> fails, return 1929b39c5158Smillertthe error it returned. Otherwise go back to step 1. 1930b39c5158Smillert 1931b39c5158Smillert=item 3. 1932b39c5158Smillert 1933b39c5158SmillertTry to match an opening delimiter bracket. If found, call 1934b39c5158SmillertC<extract_codeblock> recursively to eat the embedded block. If the 1935b39c5158Smillertrecursive call fails, return an error. Otherwise, go back to step 1. 1936b39c5158Smillert 1937b39c5158Smillert=item 4. 1938b39c5158Smillert 1939b39c5158SmillertUnconditionally match a bareword or any other single character, and 1940b39c5158Smillertthen go back to step 1. 1941b39c5158Smillert 1942b39c5158Smillert=back 1943b39c5158Smillert 1944b39c5158SmillertExamples: 1945b39c5158Smillert 1946b39c5158Smillert # Find a while loop in the text 1947b39c5158Smillert 1948b39c5158Smillert if ($text =~ s/.*?while\s*\{/{/) 1949b39c5158Smillert { 1950b39c5158Smillert $loop = "while " . extract_codeblock($text); 1951b39c5158Smillert } 1952b39c5158Smillert 1953b39c5158Smillert # Remove the first round-bracketed list (which may include 1954b39c5158Smillert # round- or curly-bracketed code blocks or quotelike operators) 1955b39c5158Smillert 1956b39c5158Smillert extract_codeblock $text, "(){}", '[^(]*'; 1957b39c5158Smillert 1958b39c5158Smillert 1959b39c5158SmillertThe ability to specify a different outermost delimiter bracket is useful 1960b39c5158Smillertin some circumstances. For example, in the Parse::RecDescent module, 1961b39c5158Smillertparser actions which are to be performed only on a successful parse 1962b39c5158Smillertare specified using a C<E<lt>defer:...E<gt>> directive. For example: 1963b39c5158Smillert 1964b39c5158Smillert sentence: subject verb object 1965b39c5158Smillert <defer: {$::theVerb = $item{verb}} > 1966b39c5158Smillert 1967b39c5158SmillertParse::RecDescent uses C<extract_codeblock($text, '{}E<lt>E<gt>')> to extract the code 1968b39c5158Smillertwithin the C<E<lt>defer:...E<gt>> directive, but there's a problem. 1969b39c5158Smillert 1970b39c5158SmillertA deferred action like this: 1971b39c5158Smillert 1972b39c5158Smillert <defer: {if ($count>10) {$count--}} > 1973b39c5158Smillert 1974b39c5158Smillertwill be incorrectly parsed as: 1975b39c5158Smillert 1976b39c5158Smillert <defer: {if ($count> 1977b39c5158Smillert 1978b39c5158Smillertbecause the "less than" operator is interpreted as a closing delimiter. 1979b39c5158Smillert 1980b39c5158SmillertBut, by extracting the directive using 1981b39c5158SmillertS<C<extract_codeblock($text, '{}', undef, 'E<lt>E<gt>')>> 1982b39c5158Smillertthe '>' character is only treated as a delimited at the outermost 1983b39c5158Smillertlevel of the code block, so the directive is parsed correctly. 1984b39c5158Smillert 1985eac174f2Safresh1=item C<extract_multiple> 1986b39c5158Smillert 1987b39c5158SmillertThe C<extract_multiple> subroutine takes a string to be processed and a 1988b39c5158Smillertlist of extractors (subroutines or regular expressions) to apply to that string. 1989b39c5158Smillert 1990b39c5158SmillertIn an array context C<extract_multiple> returns an array of substrings 1991b39c5158Smillertof the original string, as extracted by the specified extractors. 1992b39c5158SmillertIn a scalar context, C<extract_multiple> returns the first 1993b39c5158Smillertsubstring successfully extracted from the original string. In both 1994b39c5158Smillertscalar and void contexts the original string has the first successfully 1995b39c5158Smillertextracted substring removed from it. In all contexts 1996b39c5158SmillertC<extract_multiple> starts at the current C<pos> of the string, and 1997b39c5158Smillertsets that C<pos> appropriately after it matches. 1998b39c5158Smillert 1999eac174f2Safresh1Hence, the aim of a call to C<extract_multiple> in a list context 2000b39c5158Smillertis to split the processed string into as many non-overlapping fields as 2001b39c5158Smillertpossible, by repeatedly applying each of the specified extractors 2002b39c5158Smillertto the remainder of the string. Thus C<extract_multiple> is 2003b39c5158Smillerta generalized form of Perl's C<split> subroutine. 2004b39c5158Smillert 2005b39c5158SmillertThe subroutine takes up to four optional arguments: 2006b39c5158Smillert 2007b39c5158Smillert=over 4 2008b39c5158Smillert 2009b39c5158Smillert=item 1. 2010b39c5158Smillert 2011b39c5158SmillertA string to be processed (C<$_> if the string is omitted or C<undef>) 2012b39c5158Smillert 2013b39c5158Smillert=item 2. 2014b39c5158Smillert 2015b39c5158SmillertA reference to a list of subroutine references and/or qr// objects and/or 2016b39c5158Smillertliteral strings and/or hash references, specifying the extractors 2017b39c5158Smillertto be used to split the string. If this argument is omitted (or 2018b39c5158SmillertC<undef>) the list: 2019b39c5158Smillert 2020b39c5158Smillert [ 2021b39c5158Smillert sub { extract_variable($_[0], '') }, 2022b39c5158Smillert sub { extract_quotelike($_[0],'') }, 2023b39c5158Smillert sub { extract_codeblock($_[0],'{}','') }, 2024b39c5158Smillert ] 2025b39c5158Smillert 2026b39c5158Smillertis used. 2027b39c5158Smillert 2028b39c5158Smillert=item 3. 2029b39c5158Smillert 2030b39c5158SmillertAn number specifying the maximum number of fields to return. If this 2031b39c5158Smillertargument is omitted (or C<undef>), split continues as long as possible. 2032b39c5158Smillert 2033b39c5158SmillertIf the third argument is I<N>, then extraction continues until I<N> fields 2034b39c5158Smillerthave been successfully extracted, or until the string has been completely 2035b39c5158Smillertprocessed. 2036b39c5158Smillert 2037b39c5158SmillertNote that in scalar and void contexts the value of this argument is 2038b39c5158Smillertautomatically reset to 1 (under C<-w>, a warning is issued if the argument 2039b39c5158Smillerthas to be reset). 2040b39c5158Smillert 2041b39c5158Smillert=item 4. 2042b39c5158Smillert 2043b39c5158SmillertA value indicating whether unmatched substrings (see below) within the 2044b39c5158Smillerttext should be skipped or returned as fields. If the value is true, 2045b39c5158Smillertsuch substrings are skipped. Otherwise, they are returned. 2046b39c5158Smillert 2047b39c5158Smillert=back 2048b39c5158Smillert 2049b39c5158SmillertThe extraction process works by applying each extractor in 2050b39c5158Smillertsequence to the text string. 2051b39c5158Smillert 2052b39c5158SmillertIf the extractor is a subroutine it is called in a list context and is 2053b39c5158Smillertexpected to return a list of a single element, namely the extracted 2054b39c5158Smillerttext. It may optionally also return two further arguments: a string 2055b39c5158Smillertrepresenting the text left after extraction (like $' for a pattern 2056b39c5158Smillertmatch), and a string representing any prefix skipped before the 2057b39c5158Smillertextraction (like $` in a pattern match). Note that this is designed 2058b39c5158Smillertto facilitate the use of other Text::Balanced subroutines with 2059b39c5158SmillertC<extract_multiple>. Note too that the value returned by an extractor 2060b39c5158Smillertsubroutine need not bear any relationship to the corresponding substring 2061b39c5158Smillertof the original text (see examples below). 2062b39c5158Smillert 2063b39c5158SmillertIf the extractor is a precompiled regular expression or a string, 2064b39c5158Smillertit is matched against the text in a scalar context with a leading 2065b39c5158Smillert'\G' and the gc modifiers enabled. The extracted value is either 2066b39c5158Smillert$1 if that variable is defined after the match, or else the 2067b39c5158Smillertcomplete match (i.e. $&). 2068b39c5158Smillert 2069b39c5158SmillertIf the extractor is a hash reference, it must contain exactly one element. 2070b39c5158SmillertThe value of that element is one of the 2071b39c5158Smillertabove extractor types (subroutine reference, regular expression, or string). 2072b39c5158SmillertThe key of that element is the name of a class into which the successful 2073b39c5158Smillertreturn value of the extractor will be blessed. 2074b39c5158Smillert 2075b39c5158SmillertIf an extractor returns a defined value, that value is immediately 2076b39c5158Smillerttreated as the next extracted field and pushed onto the list of fields. 2077b39c5158SmillertIf the extractor was specified in a hash reference, the field is also 2078b39c5158Smillertblessed into the appropriate class, 2079b39c5158Smillert 2080b39c5158SmillertIf the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is 2081b39c5158Smillertassumed to have failed to extract. 2082b39c5158SmillertIf none of the extractor subroutines succeeds, then one 2083b39c5158Smillertcharacter is extracted from the start of the text and the extraction 2084b39c5158Smillertsubroutines reapplied. Characters which are thus removed are accumulated and 2085b39c5158Smillerteventually become the next field (unless the fourth argument is true, in which 2086b39c5158Smillertcase they are discarded). 2087b39c5158Smillert 2088b39c5158SmillertFor example, the following extracts substrings that are valid Perl variables: 2089b39c5158Smillert 2090b39c5158Smillert @fields = extract_multiple($text, 2091b39c5158Smillert [ sub { extract_variable($_[0]) } ], 2092b39c5158Smillert undef, 1); 2093b39c5158Smillert 2094b39c5158SmillertThis example separates a text into fields which are quote delimited, 2095b39c5158Smillertcurly bracketed, and anything else. The delimited and bracketed 2096b39c5158Smillertparts are also blessed to identify them (the "anything else" is unblessed): 2097b39c5158Smillert 2098b39c5158Smillert @fields = extract_multiple($text, 2099b39c5158Smillert [ 2100b39c5158Smillert { Delim => sub { extract_delimited($_[0],q{'"}) } }, 2101b39c5158Smillert { Brack => sub { extract_bracketed($_[0],'{}') } }, 2102b39c5158Smillert ]); 2103b39c5158Smillert 2104b39c5158SmillertThis call extracts the next single substring that is a valid Perl quotelike 2105b39c5158Smillertoperator (and removes it from $text): 2106b39c5158Smillert 2107b39c5158Smillert $quotelike = extract_multiple($text, 2108b39c5158Smillert [ 2109b39c5158Smillert sub { extract_quotelike($_[0]) }, 2110b39c5158Smillert ], undef, 1); 2111b39c5158Smillert 2112b39c5158SmillertFinally, here is yet another way to do comma-separated value parsing: 2113b39c5158Smillert 2114*e0680481Safresh1 $csv_text = "a,'x b',c"; 2115b39c5158Smillert @fields = extract_multiple($csv_text, 2116b39c5158Smillert [ 2117b39c5158Smillert sub { extract_delimited($_[0],q{'"}) }, 2118*e0680481Safresh1 qr/([^,]+)/, 2119b39c5158Smillert ], 2120b39c5158Smillert undef,1); 2121*e0680481Safresh1 # @fields is now ('a', "'x b'", 'c') 2122b39c5158Smillert 2123b39c5158SmillertThe list in the second argument means: 2124b39c5158SmillertI<"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...">. 2125b39c5158SmillertThe undef third argument means: 2126b39c5158SmillertI<"...as many times as possible...">, 2127b39c5158Smillertand the true value in the fourth argument means 2128b39c5158SmillertI<"...discarding anything else that appears (i.e. the commas)">. 2129b39c5158Smillert 2130b39c5158SmillertIf you wanted the commas preserved as separate fields (i.e. like split 2131b39c5158Smillertdoes if your split pattern has capturing parentheses), you would 2132b39c5158Smillertjust make the last parameter undefined (or remove it). 2133b39c5158Smillert 2134eac174f2Safresh1=item C<gen_delimited_pat> 2135b39c5158Smillert 2136b39c5158SmillertThe C<gen_delimited_pat> subroutine takes a single (string) argument and 2137*e0680481Safresh1builds a Friedl-style optimized regex that matches a string delimited 2138b39c5158Smillertby any one of the characters in the single argument. For example: 2139b39c5158Smillert 2140b39c5158Smillert gen_delimited_pat(q{'"}) 2141b39c5158Smillert 2142b39c5158Smillertreturns the regex: 2143b39c5158Smillert 2144b39c5158Smillert (?:\"(?:\\\"|(?!\").)*\"|\'(?:\\\'|(?!\').)*\') 2145b39c5158Smillert 2146b39c5158SmillertNote that the specified delimiters are automatically quotemeta'd. 2147b39c5158Smillert 2148b39c5158SmillertA typical use of C<gen_delimited_pat> would be to build special purpose tags 2149b39c5158Smillertfor C<extract_tagged>. For example, to properly ignore "empty" XML elements 2150b39c5158Smillert(which might contain quoted strings): 2151b39c5158Smillert 2152b39c5158Smillert my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>'; 2153b39c5158Smillert 2154b39c5158Smillert extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} ); 2155b39c5158Smillert 2156b39c5158SmillertC<gen_delimited_pat> may also be called with an optional second argument, 2157b39c5158Smillertwhich specifies the "escape" character(s) to be used for each delimiter. 2158b39c5158SmillertFor example to match a Pascal-style string (where ' is the delimiter 2159b39c5158Smillertand '' is a literal ' within the string): 2160b39c5158Smillert 2161b39c5158Smillert gen_delimited_pat(q{'},q{'}); 2162b39c5158Smillert 2163b39c5158SmillertDifferent escape characters can be specified for different delimiters. 2164b39c5158SmillertFor example, to specify that '/' is the escape for single quotes 2165b39c5158Smillertand '%' is the escape for double quotes: 2166b39c5158Smillert 2167b39c5158Smillert gen_delimited_pat(q{'"},q{/%}); 2168b39c5158Smillert 2169b39c5158SmillertIf more delimiters than escape chars are specified, the last escape char 2170b39c5158Smillertis used for the remaining delimiters. 2171b39c5158SmillertIf no escape char is specified for a given specified delimiter, '\' is used. 2172b39c5158Smillert 2173eac174f2Safresh1=item C<delimited_pat> 2174b39c5158Smillert 2175b39c5158SmillertNote that C<gen_delimited_pat> was previously called C<delimited_pat>. 2176b39c5158SmillertThat name may still be used, but is now deprecated. 2177b39c5158Smillert 2178eac174f2Safresh1=back 2179b39c5158Smillert 2180b39c5158Smillert=head1 DIAGNOSTICS 2181b39c5158Smillert 2182b39c5158SmillertIn a list context, all the functions return C<(undef,$original_text)> 2183b39c5158Smillerton failure. In a scalar context, failure is indicated by returning C<undef> 2184b39c5158Smillert(in this case the input text is not modified in any way). 2185b39c5158Smillert 2186b39c5158SmillertIn addition, on failure in I<any> context, the C<$@> variable is set. 2187b39c5158SmillertAccessing C<$@-E<gt>{error}> returns one of the error diagnostics listed 2188b39c5158Smillertbelow. 2189b39c5158SmillertAccessing C<$@-E<gt>{pos}> returns the offset into the original string at 2190b39c5158Smillertwhich the error was detected (although not necessarily where it occurred!) 2191b39c5158SmillertPrinting C<$@> directly produces the error message, with the offset appended. 2192b39c5158SmillertOn success, the C<$@> variable is guaranteed to be C<undef>. 2193b39c5158Smillert 2194b39c5158SmillertThe available diagnostics are: 2195b39c5158Smillert 2196b39c5158Smillert=over 4 2197b39c5158Smillert 2198b39c5158Smillert=item C<Did not find a suitable bracket: "%s"> 2199b39c5158Smillert 2200b39c5158SmillertThe delimiter provided to C<extract_bracketed> was not one of 2201b39c5158SmillertC<'()[]E<lt>E<gt>{}'>. 2202b39c5158Smillert 2203b39c5158Smillert=item C<Did not find prefix: /%s/> 2204b39c5158Smillert 2205b39c5158SmillertA non-optional prefix was specified but wasn't found at the start of the text. 2206b39c5158Smillert 2207b39c5158Smillert=item C<Did not find opening bracket after prefix: "%s"> 2208b39c5158Smillert 2209b39c5158SmillertC<extract_bracketed> or C<extract_codeblock> was expecting a 2210b39c5158Smillertparticular kind of bracket at the start of the text, and didn't find it. 2211b39c5158Smillert 2212b39c5158Smillert=item C<No quotelike operator found after prefix: "%s"> 2213b39c5158Smillert 2214b39c5158SmillertC<extract_quotelike> didn't find one of the quotelike operators C<q>, 2215b39c5158SmillertC<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> at the start of the substring 2216b39c5158Smillertit was extracting. 2217b39c5158Smillert 2218b39c5158Smillert=item C<Unmatched closing bracket: "%c"> 2219b39c5158Smillert 2220b39c5158SmillertC<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> encountered 2221b39c5158Smillerta closing bracket where none was expected. 2222b39c5158Smillert 2223b39c5158Smillert=item C<Unmatched opening bracket(s): "%s"> 2224b39c5158Smillert 2225b39c5158SmillertC<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> ran 2226b39c5158Smillertout of characters in the text before closing one or more levels of nested 2227b39c5158Smillertbrackets. 2228b39c5158Smillert 2229b39c5158Smillert=item C<Unmatched embedded quote (%s)> 2230b39c5158Smillert 2231b39c5158SmillertC<extract_bracketed> attempted to match an embedded quoted substring, but 2232b39c5158Smillertfailed to find a closing quote to match it. 2233b39c5158Smillert 2234b39c5158Smillert=item C<Did not find closing delimiter to match '%s'> 2235b39c5158Smillert 2236b39c5158SmillertC<extract_quotelike> was unable to find a closing delimiter to match the 2237b39c5158Smillertone that opened the quote-like operation. 2238b39c5158Smillert 2239b39c5158Smillert=item C<Mismatched closing bracket: expected "%c" but found "%s"> 2240b39c5158Smillert 2241b39c5158SmillertC<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> found 2242b39c5158Smillerta valid bracket delimiter, but it was the wrong species. This usually 2243b39c5158Smillertindicates a nesting error, but may indicate incorrect quoting or escaping. 2244b39c5158Smillert 2245b39c5158Smillert=item C<No block delimiter found after quotelike "%s"> 2246b39c5158Smillert 2247b39c5158SmillertC<extract_quotelike> or C<extract_codeblock> found one of the 2248b39c5158Smillertquotelike operators C<q>, C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> 2249b39c5158Smillertwithout a suitable block after it. 2250b39c5158Smillert 2251b39c5158Smillert=item C<Did not find leading dereferencer> 2252b39c5158Smillert 2253b39c5158SmillertC<extract_variable> was expecting one of '$', '@', or '%' at the start of 2254b39c5158Smillerta variable, but didn't find any of them. 2255b39c5158Smillert 2256b39c5158Smillert=item C<Bad identifier after dereferencer> 2257b39c5158Smillert 2258b39c5158SmillertC<extract_variable> found a '$', '@', or '%' indicating a variable, but that 2259b39c5158Smillertcharacter was not followed by a legal Perl identifier. 2260b39c5158Smillert 2261b39c5158Smillert=item C<Did not find expected opening bracket at %s> 2262b39c5158Smillert 2263b39c5158SmillertC<extract_codeblock> failed to find any of the outermost opening brackets 2264b39c5158Smillertthat were specified. 2265b39c5158Smillert 2266b39c5158Smillert=item C<Improperly nested codeblock at %s> 2267b39c5158Smillert 2268b39c5158SmillertA nested code block was found that started with a delimiter that was specified 2269b39c5158Smillertas being only to be used as an outermost bracket. 2270b39c5158Smillert 2271b39c5158Smillert=item C<Missing second block for quotelike "%s"> 2272b39c5158Smillert 2273b39c5158SmillertC<extract_codeblock> or C<extract_quotelike> found one of the 2274b39c5158Smillertquotelike operators C<s>, C<tr> or C<y> followed by only one block. 2275b39c5158Smillert 2276b39c5158Smillert=item C<No match found for opening bracket> 2277b39c5158Smillert 2278b39c5158SmillertC<extract_codeblock> failed to find a closing bracket to match the outermost 2279b39c5158Smillertopening bracket. 2280b39c5158Smillert 2281b39c5158Smillert=item C<Did not find opening tag: /%s/> 2282b39c5158Smillert 2283b39c5158SmillertC<extract_tagged> did not find a suitable opening tag (after any specified 2284b39c5158Smillertprefix was removed). 2285b39c5158Smillert 2286b39c5158Smillert=item C<Unable to construct closing tag to match: /%s/> 2287b39c5158Smillert 2288b39c5158SmillertC<extract_tagged> matched the specified opening tag and tried to 2289b39c5158Smillertmodify the matched text to produce a matching closing tag (because 2290b39c5158Smillertnone was specified). It failed to generate the closing tag, almost 2291b39c5158Smillertcertainly because the opening tag did not start with a 2292b39c5158Smillertbracket of some kind. 2293b39c5158Smillert 2294b39c5158Smillert=item C<Found invalid nested tag: %s> 2295b39c5158Smillert 2296b39c5158SmillertC<extract_tagged> found a nested tag that appeared in the "reject" list 2297b39c5158Smillert(and the failure mode was not "MAX" or "PARA"). 2298b39c5158Smillert 2299b39c5158Smillert=item C<Found unbalanced nested tag: %s> 2300b39c5158Smillert 2301b39c5158SmillertC<extract_tagged> found a nested opening tag that was not matched by a 2302b39c5158Smillertcorresponding nested closing tag (and the failure mode was not "MAX" or "PARA"). 2303b39c5158Smillert 2304b39c5158Smillert=item C<Did not find closing tag> 2305b39c5158Smillert 2306b39c5158SmillertC<extract_tagged> reached the end of the text without finding a closing tag 2307b39c5158Smillertto match the original opening tag (and the failure mode was not 2308b39c5158Smillert"MAX" or "PARA"). 2309b39c5158Smillert 2310b39c5158Smillert=back 2311b39c5158Smillert 2312eac174f2Safresh1=head1 EXPORTS 2313eac174f2Safresh1 2314eac174f2Safresh1The following symbols are, or can be, exported by this module: 2315eac174f2Safresh1 2316eac174f2Safresh1=over 4 2317eac174f2Safresh1 2318eac174f2Safresh1=item Default Exports 2319eac174f2Safresh1 2320eac174f2Safresh1I<None>. 2321eac174f2Safresh1 2322eac174f2Safresh1=item Optional Exports 2323eac174f2Safresh1 2324eac174f2Safresh1C<extract_delimited>, 2325eac174f2Safresh1C<extract_bracketed>, 2326eac174f2Safresh1C<extract_quotelike>, 2327eac174f2Safresh1C<extract_codeblock>, 2328eac174f2Safresh1C<extract_variable>, 2329eac174f2Safresh1C<extract_tagged>, 2330eac174f2Safresh1C<extract_multiple>, 2331eac174f2Safresh1C<gen_delimited_pat>, 2332eac174f2Safresh1C<gen_extract_tagged>, 2333eac174f2Safresh1C<delimited_pat>. 2334eac174f2Safresh1 2335eac174f2Safresh1=item Export Tags 2336eac174f2Safresh1 2337eac174f2Safresh1=over 4 2338eac174f2Safresh1 2339eac174f2Safresh1=item C<:ALL> 2340eac174f2Safresh1 2341eac174f2Safresh1C<extract_delimited>, 2342eac174f2Safresh1C<extract_bracketed>, 2343eac174f2Safresh1C<extract_quotelike>, 2344eac174f2Safresh1C<extract_codeblock>, 2345eac174f2Safresh1C<extract_variable>, 2346eac174f2Safresh1C<extract_tagged>, 2347eac174f2Safresh1C<extract_multiple>, 2348eac174f2Safresh1C<gen_delimited_pat>, 2349eac174f2Safresh1C<gen_extract_tagged>, 2350eac174f2Safresh1C<delimited_pat>. 2351eac174f2Safresh1 2352eac174f2Safresh1=back 2353eac174f2Safresh1 2354eac174f2Safresh1=back 2355eac174f2Safresh1 2356eac174f2Safresh1=head1 KNOWN BUGS 2357eac174f2Safresh1 2358eac174f2Safresh1See L<https://rt.cpan.org/Dist/Display.html?Status=Active&Queue=Text-Balanced>. 2359eac174f2Safresh1 2360eac174f2Safresh1=head1 FEEDBACK 2361eac174f2Safresh1 2362eac174f2Safresh1Patches, bug reports, suggestions or any other feedback is welcome. 2363eac174f2Safresh1 2364eac174f2Safresh1Patches can be sent as GitHub pull requests at 2365eac174f2Safresh1L<https://github.com/steve-m-hay/Text-Balanced/pulls>. 2366eac174f2Safresh1 2367eac174f2Safresh1Bug reports and suggestions can be made on the CPAN Request Tracker at 2368eac174f2Safresh1L<https://rt.cpan.org/Public/Bug/Report.html?Queue=Text-Balanced>. 2369eac174f2Safresh1 2370eac174f2Safresh1Currently active requests on the CPAN Request Tracker can be viewed at 2371eac174f2Safresh1L<https://rt.cpan.org/Public/Dist/Display.html?Status=Active;Queue=Text-Balanced>. 2372eac174f2Safresh1 2373eac174f2Safresh1Please test this distribution. See CPAN Testers Reports at 2374eac174f2Safresh1L<https://www.cpantesters.org/> for details of how to get involved. 2375eac174f2Safresh1 2376eac174f2Safresh1Previous test results on CPAN Testers Reports can be viewed at 2377eac174f2Safresh1L<https://www.cpantesters.org/distro/T/Text-Balanced.html>. 2378eac174f2Safresh1 2379eac174f2Safresh1Please rate this distribution on CPAN Ratings at 2380eac174f2Safresh1L<https://cpanratings.perl.org/rate/?distribution=Text-Balanced>. 2381eac174f2Safresh1 2382eac174f2Safresh1=head1 AVAILABILITY 2383eac174f2Safresh1 2384eac174f2Safresh1The latest version of this module is available from CPAN (see 2385eac174f2Safresh1L<perlmodlib/"CPAN"> for details) at 2386eac174f2Safresh1 2387eac174f2Safresh1L<https://metacpan.org/release/Text-Balanced> or 2388eac174f2Safresh1 2389eac174f2Safresh1L<https://www.cpan.org/authors/id/S/SH/SHAY/> or 2390eac174f2Safresh1 2391eac174f2Safresh1L<https://www.cpan.org/modules/by-module/Text/>. 2392eac174f2Safresh1 2393eac174f2Safresh1The latest source code is available from GitHub at 2394eac174f2Safresh1L<https://github.com/steve-m-hay/Text-Balanced>. 2395eac174f2Safresh1 2396eac174f2Safresh1=head1 INSTALLATION 2397eac174f2Safresh1 2398eac174f2Safresh1See the F<INSTALL> file. 2399eac174f2Safresh1 2400b39c5158Smillert=head1 AUTHOR 2401b39c5158Smillert 2402eac174f2Safresh1Damian Conway E<lt>L<damian@conway.org|mailto:damian@conway.org>E<gt>. 2403b39c5158Smillert 2404eac174f2Safresh1Steve Hay E<lt>L<shay@cpan.org|mailto:shay@cpan.org>E<gt> is now maintaining 2405eac174f2Safresh1Text::Balanced as of version 2.03. 2406b39c5158Smillert 2407b39c5158Smillert=head1 COPYRIGHT 2408b39c5158Smillert 2409eac174f2Safresh1Copyright (C) 1997-2001 Damian Conway. All rights reserved. 2410b39c5158Smillert 2411eac174f2Safresh1Copyright (C) 2009 Adam Kennedy. 2412b39c5158Smillert 2413*e0680481Safresh1Copyright (C) 2015, 2020, 2022 Steve Hay and other contributors. All rights 2414*e0680481Safresh1reserved. 2415eac174f2Safresh1 2416eac174f2Safresh1=head1 LICENCE 2417eac174f2Safresh1 2418eac174f2Safresh1This module is free software; you can redistribute it and/or modify it under the 2419eac174f2Safresh1same terms as Perl itself, i.e. under the terms of either the GNU General Public 2420eac174f2Safresh1License or the Artistic License, as specified in the F<LICENCE> file. 2421eac174f2Safresh1 2422eac174f2Safresh1=head1 VERSION 2423eac174f2Safresh1 2424*e0680481Safresh1Version 2.06 2425eac174f2Safresh1 2426eac174f2Safresh1=head1 DATE 2427eac174f2Safresh1 2428*e0680481Safresh105 Jun 2022 2429eac174f2Safresh1 2430eac174f2Safresh1=head1 HISTORY 2431eac174f2Safresh1 2432eac174f2Safresh1See the F<Changes> file. 2433b39c5158Smillert 2434b39c5158Smillert=cut 2435