1# EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS. 2# FOR FULL DOCUMENTATION SEE Balanced.pod 3 4use 5.005; 5use strict; 6 7package Text::Balanced; 8 9use Exporter; 10use SelfLoader; 11use vars qw { $VERSION @ISA %EXPORT_TAGS }; 12 13$VERSION = '1.95'; 14@ISA = qw ( Exporter ); 15 16%EXPORT_TAGS = ( ALL => [ qw( 17 &extract_delimited 18 &extract_bracketed 19 &extract_quotelike 20 &extract_codeblock 21 &extract_variable 22 &extract_tagged 23 &extract_multiple 24 25 &gen_delimited_pat 26 &gen_extract_tagged 27 28 &delimited_pat 29 ) ] ); 30 31Exporter::export_ok_tags('ALL'); 32 33# PROTOTYPES 34 35sub _match_bracketed($$$$$$); 36sub _match_variable($$); 37sub _match_codeblock($$$$$$$); 38sub _match_quotelike($$$$); 39 40# HANDLE RETURN VALUES IN VARIOUS CONTEXTS 41 42sub _failmsg { 43 my ($message, $pos) = @_; 44 $@ = bless { error=>$message, pos=>$pos }, "Text::Balanced::ErrorMsg"; 45} 46 47sub _fail 48{ 49 my ($wantarray, $textref, $message, $pos) = @_; 50 _failmsg $message, $pos if $message; 51 return ("",$$textref,"") if $wantarray; 52 return undef; 53} 54 55sub _succeed 56{ 57 $@ = undef; 58 my ($wantarray,$textref) = splice @_, 0, 2; 59 my ($extrapos, $extralen) = @_>18 ? splice(@_, -2, 2) : (0,0); 60 my ($startlen) = $_[5]; 61 my $remainderpos = $_[2]; 62 if ($wantarray) 63 { 64 my @res; 65 while (my ($from, $len) = splice @_, 0, 2) 66 { 67 push @res, substr($$textref,$from,$len); 68 } 69 if ($extralen) { # CORRECT FILLET 70 my $extra = substr($res[0], $extrapos-$startlen, $extralen, "\n"); 71 $res[1] = "$extra$res[1]"; 72 eval { substr($$textref,$remainderpos,0) = $extra; 73 substr($$textref,$extrapos,$extralen,"\n")} ; 74 #REARRANGE HERE DOC AND FILLET IF POSSIBLE 75 pos($$textref) = $remainderpos-$extralen+1; # RESET \G 76 } 77 else { 78 pos($$textref) = $remainderpos; # RESET \G 79 } 80 return @res; 81 } 82 else 83 { 84 my $match = substr($$textref,$_[0],$_[1]); 85 substr($match,$extrapos-$_[0]-$startlen,$extralen,"") if $extralen; 86 my $extra = $extralen 87 ? substr($$textref, $extrapos, $extralen)."\n" : ""; 88 eval {substr($$textref,$_[4],$_[1]+$_[5])=$extra} ; #CHOP OUT PREFIX & MATCH, IF POSSIBLE 89 pos($$textref) = $_[4]; # RESET \G 90 return $match; 91 } 92} 93 94# BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING 95 96sub gen_delimited_pat($;$) # ($delimiters;$escapes) 97{ 98 my ($dels, $escs) = @_; 99 return "" unless $dels =~ /\S/; 100 $escs = '\\' unless $escs; 101 $escs .= substr($escs,-1) x (length($dels)-length($escs)); 102 my @pat = (); 103 my $i; 104 for ($i=0; $i<length $dels; $i++) 105 { 106 my $del = quotemeta substr($dels,$i,1); 107 my $esc = quotemeta substr($escs,$i,1); 108 if ($del eq $esc) 109 { 110 push @pat, "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del"; 111 } 112 else 113 { 114 push @pat, "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del"; 115 } 116 } 117 my $pat = join '|', @pat; 118 return "(?:$pat)"; 119} 120 121*delimited_pat = \&gen_delimited_pat; 122 123 124# THE EXTRACTION FUNCTIONS 125 126sub extract_delimited (;$$$$) 127{ 128 my $textref = defined $_[0] ? \$_[0] : \$_; 129 my $wantarray = wantarray; 130 my $del = defined $_[1] ? $_[1] : qq{\'\"\`}; 131 my $pre = defined $_[2] ? $_[2] : '\s*'; 132 my $esc = defined $_[3] ? $_[3] : qq{\\}; 133 my $pat = gen_delimited_pat($del, $esc); 134 my $startpos = pos $$textref || 0; 135 return _fail($wantarray, $textref, "Not a delimited pattern", 0) 136 unless $$textref =~ m/\G($pre)($pat)/gc; 137 my $prelen = length($1); 138 my $matchpos = $startpos+$prelen; 139 my $endpos = pos $$textref; 140 return _succeed $wantarray, $textref, 141 $matchpos, $endpos-$matchpos, # MATCH 142 $endpos, length($$textref)-$endpos, # REMAINDER 143 $startpos, $prelen; # PREFIX 144} 145 146sub extract_bracketed (;$$$) 147{ 148 my $textref = defined $_[0] ? \$_[0] : \$_; 149 my $ldel = defined $_[1] ? $_[1] : '{([<'; 150 my $pre = defined $_[2] ? $_[2] : '\s*'; 151 my $wantarray = wantarray; 152 my $qdel = ""; 153 my $quotelike; 154 $ldel =~ s/'//g and $qdel .= q{'}; 155 $ldel =~ s/"//g and $qdel .= q{"}; 156 $ldel =~ s/`//g and $qdel .= q{`}; 157 $ldel =~ s/q//g and $quotelike = 1; 158 $ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds; 159 my $rdel = $ldel; 160 unless ($rdel =~ tr/[({</])}>/) 161 { 162 return _fail $wantarray, $textref, 163 "Did not find a suitable bracket in delimiter: \"$_[1]\"", 164 0; 165 } 166 my $posbug = pos; 167 $ldel = join('|', map { quotemeta $_ } split('', $ldel)); 168 $rdel = join('|', map { quotemeta $_ } split('', $rdel)); 169 pos = $posbug; 170 171 my $startpos = pos $$textref || 0; 172 my @match = _match_bracketed($textref,$pre, $ldel, $qdel, $quotelike, $rdel); 173 174 return _fail ($wantarray, $textref) unless @match; 175 176 return _succeed ( $wantarray, $textref, 177 $match[2], $match[5]+2, # MATCH 178 @match[8,9], # REMAINDER 179 @match[0,1], # PREFIX 180 ); 181} 182 183sub _match_bracketed($$$$$$) # $textref, $pre, $ldel, $qdel, $quotelike, $rdel 184{ 185 my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_; 186 my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0); 187 unless ($$textref =~ m/\G$pre/gc) 188 { 189 _failmsg "Did not find prefix: /$pre/", $startpos; 190 return; 191 } 192 193 $ldelpos = pos $$textref; 194 195 unless ($$textref =~ m/\G($ldel)/gc) 196 { 197 _failmsg "Did not find opening bracket after prefix: \"$pre\"", 198 pos $$textref; 199 pos $$textref = $startpos; 200 return; 201 } 202 203 my @nesting = ( $1 ); 204 my $textlen = length $$textref; 205 while (pos $$textref < $textlen) 206 { 207 next if $$textref =~ m/\G\\./gcs; 208 209 if ($$textref =~ m/\G($ldel)/gc) 210 { 211 push @nesting, $1; 212 } 213 elsif ($$textref =~ m/\G($rdel)/gc) 214 { 215 my ($found, $brackettype) = ($1, $1); 216 if ($#nesting < 0) 217 { 218 _failmsg "Unmatched closing bracket: \"$found\"", 219 pos $$textref; 220 pos $$textref = $startpos; 221 return; 222 } 223 my $expected = pop(@nesting); 224 $expected =~ tr/({[</)}]>/; 225 if ($expected ne $brackettype) 226 { 227 _failmsg qq{Mismatched closing bracket: expected "$expected" but found "$found"}, 228 pos $$textref; 229 pos $$textref = $startpos; 230 return; 231 } 232 last if $#nesting < 0; 233 } 234 elsif ($qdel && $$textref =~ m/\G([$qdel])/gc) 235 { 236 $$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gsc and next; 237 _failmsg "Unmatched embedded quote ($1)", 238 pos $$textref; 239 pos $$textref = $startpos; 240 return; 241 } 242 elsif ($quotelike && _match_quotelike($textref,"",1,0)) 243 { 244 next; 245 } 246 247 else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs } 248 } 249 if ($#nesting>=0) 250 { 251 _failmsg "Unmatched opening bracket(s): " 252 . join("..",@nesting)."..", 253 pos $$textref; 254 pos $$textref = $startpos; 255 return; 256 } 257 258 $endpos = pos $$textref; 259 260 return ( 261 $startpos, $ldelpos-$startpos, # PREFIX 262 $ldelpos, 1, # OPENING BRACKET 263 $ldelpos+1, $endpos-$ldelpos-2, # CONTENTS 264 $endpos-1, 1, # CLOSING BRACKET 265 $endpos, length($$textref)-$endpos, # REMAINDER 266 ); 267} 268 269sub revbracket($) 270{ 271 my $brack = reverse $_[0]; 272 $brack =~ tr/[({</])}>/; 273 return $brack; 274} 275 276my $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*}; 277 278sub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options) 279{ 280 my $textref = defined $_[0] ? \$_[0] : \$_; 281 my $ldel = $_[1]; 282 my $rdel = $_[2]; 283 my $pre = defined $_[3] ? $_[3] : '\s*'; 284 my %options = defined $_[4] ? %{$_[4]} : (); 285 my $omode = defined $options{fail} ? $options{fail} : ''; 286 my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}}) 287 : defined($options{reject}) ? $options{reject} 288 : '' 289 ; 290 my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}}) 291 : defined($options{ignore}) ? $options{ignore} 292 : '' 293 ; 294 295 if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; } 296 $@ = undef; 297 298 my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore); 299 300 return _fail(wantarray, $textref) unless @match; 301 return _succeed wantarray, $textref, 302 $match[2], $match[3]+$match[5]+$match[7], # MATCH 303 @match[8..9,0..1,2..7]; # REM, PRE, BITS 304} 305 306sub _match_tagged # ($$$$$$$) 307{ 308 my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_; 309 my $rdelspec; 310 311 my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 ); 312 313 unless ($$textref =~ m/\G($pre)/gc) 314 { 315 _failmsg "Did not find prefix: /$pre/", pos $$textref; 316 goto failed; 317 } 318 319 $opentagpos = pos($$textref); 320 321 unless ($$textref =~ m/\G$ldel/gc) 322 { 323 _failmsg "Did not find opening tag: /$ldel/", pos $$textref; 324 goto failed; 325 } 326 327 $textpos = pos($$textref); 328 329 if (!defined $rdel) 330 { 331 $rdelspec = $&; 332 unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". revbracket($1) /oes) 333 { 334 _failmsg "Unable to construct closing tag to match: $rdel", 335 pos $$textref; 336 goto failed; 337 } 338 } 339 else 340 { 341 $rdelspec = eval "qq{$rdel}" || do { 342 my $del; 343 for (qw,~ ! ^ & * ) _ + - = } ] : " ; ' > . ? / | ',) 344 { next if $rdel =~ /\Q$_/; $del = $_; last } 345 unless ($del) { 346 use Carp; 347 croak "Can't interpolate right delimiter $rdel" 348 } 349 eval "qq$del$rdel$del"; 350 }; 351 } 352 353 while (pos($$textref) < length($$textref)) 354 { 355 next if $$textref =~ m/\G\\./gc; 356 357 if ($$textref =~ m/\G(\n[ \t]*\n)/gc ) 358 { 359 $parapos = pos($$textref) - length($1) 360 unless defined $parapos; 361 } 362 elsif ($$textref =~ m/\G($rdelspec)/gc ) 363 { 364 $closetagpos = pos($$textref)-length($1); 365 goto matched; 366 } 367 elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc) 368 { 369 next; 370 } 371 elsif ($bad && $$textref =~ m/\G($bad)/gcs) 372 { 373 pos($$textref) -= length($1); # CUT OFF WHATEVER CAUSED THE SHORTNESS 374 goto short if ($omode eq 'PARA' || $omode eq 'MAX'); 375 _failmsg "Found invalid nested tag: $1", pos $$textref; 376 goto failed; 377 } 378 elsif ($$textref =~ m/\G($ldel)/gc) 379 { 380 my $tag = $1; 381 pos($$textref) -= length($tag); # REWIND TO NESTED TAG 382 unless (_match_tagged(@_)) # MATCH NESTED TAG 383 { 384 goto short if $omode eq 'PARA' || $omode eq 'MAX'; 385 _failmsg "Found unbalanced nested tag: $tag", 386 pos $$textref; 387 goto failed; 388 } 389 } 390 else { $$textref =~ m/./gcs } 391 } 392 393short: 394 $closetagpos = pos($$textref); 395 goto matched if $omode eq 'MAX'; 396 goto failed unless $omode eq 'PARA'; 397 398 if (defined $parapos) { pos($$textref) = $parapos } 399 else { $parapos = pos($$textref) } 400 401 return ( 402 $startpos, $opentagpos-$startpos, # PREFIX 403 $opentagpos, $textpos-$opentagpos, # OPENING TAG 404 $textpos, $parapos-$textpos, # TEXT 405 $parapos, 0, # NO CLOSING TAG 406 $parapos, length($$textref)-$parapos, # REMAINDER 407 ); 408 409matched: 410 $endpos = pos($$textref); 411 return ( 412 $startpos, $opentagpos-$startpos, # PREFIX 413 $opentagpos, $textpos-$opentagpos, # OPENING TAG 414 $textpos, $closetagpos-$textpos, # TEXT 415 $closetagpos, $endpos-$closetagpos, # CLOSING TAG 416 $endpos, length($$textref)-$endpos, # REMAINDER 417 ); 418 419failed: 420 _failmsg "Did not find closing tag", pos $$textref unless $@; 421 pos($$textref) = $startpos; 422 return; 423} 424 425sub extract_variable (;$$) 426{ 427 my $textref = defined $_[0] ? \$_[0] : \$_; 428 return ("","","") unless defined $$textref; 429 my $pre = defined $_[1] ? $_[1] : '\s*'; 430 431 my @match = _match_variable($textref,$pre); 432 433 return _fail wantarray, $textref unless @match; 434 435 return _succeed wantarray, $textref, 436 @match[2..3,4..5,0..1]; # MATCH, REMAINDER, PREFIX 437} 438 439sub _match_variable($$) 440{ 441# $# 442# $^ 443# $$ 444 my ($textref, $pre) = @_; 445 my $startpos = pos($$textref) = pos($$textref)||0; 446 unless ($$textref =~ m/\G($pre)/gc) 447 { 448 _failmsg "Did not find prefix: /$pre/", pos $$textref; 449 return; 450 } 451 my $varpos = pos($$textref); 452 unless ($$textref =~ m{\G\$\s*(?!::)(\d+|[][&`'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci) 453 { 454 unless ($$textref =~ m/\G((\$#?|[*\@\%]|\\&)+)/gc) 455 { 456 _failmsg "Did not find leading dereferencer", pos $$textref; 457 pos $$textref = $startpos; 458 return; 459 } 460 my $deref = $1; 461 462 unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci 463 or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0) 464 or $deref eq '$#' or $deref eq '$$' ) 465 { 466 _failmsg "Bad identifier after dereferencer", pos $$textref; 467 pos $$textref = $startpos; 468 return; 469 } 470 } 471 472 while (1) 473 { 474 next if $$textref =~ m/\G\s*(?:->)?\s*[{]\w+[}]/gc; 475 next if _match_codeblock($textref, 476 qr/\s*->\s*(?:[_a-zA-Z]\w+\s*)?/, 477 qr/[({[]/, qr/[)}\]]/, 478 qr/[({[]/, qr/[)}\]]/, 0); 479 next if _match_codeblock($textref, 480 qr/\s*/, qr/[{[]/, qr/[}\]]/, 481 qr/[{[]/, qr/[}\]]/, 0); 482 next if _match_variable($textref,'\s*->\s*'); 483 next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc; 484 last; 485 } 486 487 my $endpos = pos($$textref); 488 return ($startpos, $varpos-$startpos, 489 $varpos, $endpos-$varpos, 490 $endpos, length($$textref)-$endpos 491 ); 492} 493 494sub extract_codeblock (;$$$$$) 495{ 496 my $textref = defined $_[0] ? \$_[0] : \$_; 497 my $wantarray = wantarray; 498 my $ldel_inner = defined $_[1] ? $_[1] : '{'; 499 my $pre = defined $_[2] ? $_[2] : '\s*'; 500 my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner; 501 my $rd = $_[4]; 502 my $rdel_inner = $ldel_inner; 503 my $rdel_outer = $ldel_outer; 504 my $posbug = pos; 505 for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds } 506 for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds } 507 for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer) 508 { 509 $_ = '('.join('|',map { quotemeta $_ } split('',$_)).')' 510 } 511 pos = $posbug; 512 513 my @match = _match_codeblock($textref, $pre, 514 $ldel_outer, $rdel_outer, 515 $ldel_inner, $rdel_inner, 516 $rd); 517 return _fail($wantarray, $textref) unless @match; 518 return _succeed($wantarray, $textref, 519 @match[2..3,4..5,0..1] # MATCH, REMAINDER, PREFIX 520 ); 521 522} 523 524sub _match_codeblock($$$$$$$) 525{ 526 my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd) = @_; 527 my $startpos = pos($$textref) = pos($$textref) || 0; 528 unless ($$textref =~ m/\G($pre)/gc) 529 { 530 _failmsg qq{Did not match prefix /$pre/ at"} . 531 substr($$textref,pos($$textref),20) . 532 q{..."}, 533 pos $$textref; 534 return; 535 } 536 my $codepos = pos($$textref); 537 unless ($$textref =~ m/\G($ldel_outer)/gc) # OUTERMOST DELIMITER 538 { 539 _failmsg qq{Did not find expected opening bracket at "} . 540 substr($$textref,pos($$textref),20) . 541 q{..."}, 542 pos $$textref; 543 pos $$textref = $startpos; 544 return; 545 } 546 my $closing = $1; 547 $closing =~ tr/([<{/)]>}/; 548 my $matched; 549 my $patvalid = 1; 550 while (pos($$textref) < length($$textref)) 551 { 552 $matched = ''; 553 if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc) 554 { 555 $patvalid = 0; 556 next; 557 } 558 559 if ($$textref =~ m/\G\s*#.*/gc) 560 { 561 next; 562 } 563 564 if ($$textref =~ m/\G\s*($rdel_outer)/gc) 565 { 566 unless ($matched = ($closing && $1 eq $closing) ) 567 { 568 next if $1 eq '>'; # MIGHT BE A "LESS THAN" 569 _failmsg q{Mismatched closing bracket at "} . 570 substr($$textref,pos($$textref),20) . 571 qq{...". Expected '$closing'}, 572 pos $$textref; 573 } 574 last; 575 } 576 577 if (_match_variable($textref,'\s*') || 578 _match_quotelike($textref,'\s*',$patvalid,$patvalid) ) 579 { 580 $patvalid = 0; 581 next; 582 } 583 584 585 # NEED TO COVER MANY MORE CASES HERE!!! 586 if ($$textref =~ m#\G\s*(?!$ldel_inner) 587 ( [-+*x/%^&|.]=? 588 | [!=]~ 589 | =(?!>) 590 | (\*\*|&&|\|\||<<|>>)=? 591 | split|grep|map|return 592 | [([] 593 )#gcx) 594 { 595 $patvalid = 1; 596 next; 597 } 598 599 if ( _match_codeblock($textref, '\s*', $ldel_inner, $rdel_inner, $ldel_inner, $rdel_inner, $rd) ) 600 { 601 $patvalid = 1; 602 next; 603 } 604 605 if ($$textref =~ m/\G\s*$ldel_outer/gc) 606 { 607 _failmsg q{Improperly nested codeblock at "} . 608 substr($$textref,pos($$textref),20) . 609 q{..."}, 610 pos $$textref; 611 last; 612 } 613 614 $patvalid = 0; 615 $$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc; 616 } 617 continue { $@ = undef } 618 619 unless ($matched) 620 { 621 _failmsg 'No match found for opening bracket', pos $$textref 622 unless $@; 623 return; 624 } 625 626 my $endpos = pos($$textref); 627 return ( $startpos, $codepos-$startpos, 628 $codepos, $endpos-$codepos, 629 $endpos, length($$textref)-$endpos, 630 ); 631} 632 633 634my %mods = ( 635 'none' => '[cgimsox]*', 636 'm' => '[cgimsox]*', 637 's' => '[cegimsox]*', 638 'tr' => '[cds]*', 639 'y' => '[cds]*', 640 'qq' => '', 641 'qx' => '', 642 'qw' => '', 643 'qr' => '[imsx]*', 644 'q' => '', 645 ); 646 647sub extract_quotelike (;$$) 648{ 649 my $textref = $_[0] ? \$_[0] : \$_; 650 my $wantarray = wantarray; 651 my $pre = defined $_[1] ? $_[1] : '\s*'; 652 653 my @match = _match_quotelike($textref,$pre,1,0); 654 return _fail($wantarray, $textref) unless @match; 655 return _succeed($wantarray, $textref, 656 $match[2], $match[18]-$match[2], # MATCH 657 @match[18,19], # REMAINDER 658 @match[0,1], # PREFIX 659 @match[2..17], # THE BITS 660 @match[20,21], # ANY FILLET? 661 ); 662}; 663 664sub _match_quotelike($$$$) # ($textref, $prepat, $allow_raw_match) 665{ 666 my ($textref, $pre, $rawmatch, $qmark) = @_; 667 668 my ($textlen,$startpos, 669 $oppos, 670 $preld1pos,$ld1pos,$str1pos,$rd1pos, 671 $preld2pos,$ld2pos,$str2pos,$rd2pos, 672 $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 ); 673 674 unless ($$textref =~ m/\G($pre)/gc) 675 { 676 _failmsg qq{Did not find prefix /$pre/ at "} . 677 substr($$textref, pos($$textref), 20) . 678 q{..."}, 679 pos $$textref; 680 return; 681 } 682 $oppos = pos($$textref); 683 684 my $initial = substr($$textref,$oppos,1); 685 686 if ($initial && $initial =~ m|^[\"\'\`]| 687 || $rawmatch && $initial =~ m|^/| 688 || $qmark && $initial =~ m|^\?|) 689 { 690 unless ($$textref =~ m/ \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcsx) 691 { 692 _failmsg qq{Did not find closing delimiter to match '$initial' at "} . 693 substr($$textref, $oppos, 20) . 694 q{..."}, 695 pos $$textref; 696 pos $$textref = $startpos; 697 return; 698 } 699 $modpos= pos($$textref); 700 $rd1pos = $modpos-1; 701 702 if ($initial eq '/' || $initial eq '?') 703 { 704 $$textref =~ m/\G$mods{none}/gc 705 } 706 707 my $endpos = pos($$textref); 708 return ( 709 $startpos, $oppos-$startpos, # PREFIX 710 $oppos, 0, # NO OPERATOR 711 $oppos, 1, # LEFT DEL 712 $oppos+1, $rd1pos-$oppos-1, # STR/PAT 713 $rd1pos, 1, # RIGHT DEL 714 $modpos, 0, # NO 2ND LDEL 715 $modpos, 0, # NO 2ND STR 716 $modpos, 0, # NO 2ND RDEL 717 $modpos, $endpos-$modpos, # MODIFIERS 718 $endpos, $textlen-$endpos, # REMAINDER 719 ); 720 } 721 722 unless ($$textref =~ m{\G(\b(?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<)}gc) 723 { 724 _failmsg q{No quotelike operator found after prefix at "} . 725 substr($$textref, pos($$textref), 20) . 726 q{..."}, 727 pos $$textref; 728 pos $$textref = $startpos; 729 return; 730 } 731 732 my $op = $1; 733 $preld1pos = pos($$textref); 734 if ($op eq '<<') { 735 $ld1pos = pos($$textref); 736 my $label; 737 if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) { 738 $label = $1; 739 } 740 elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\]*)*) ' 741 | \G " ([^"\\]* (?:\\.[^"\\]*)*) " 742 | \G ` ([^`\\]* (?:\\.[^`\\]*)*) ` 743 }gcsx) { 744 $label = $+; 745 } 746 else { 747 $label = ""; 748 } 749 my $extrapos = pos($$textref); 750 $$textref =~ m{.*\n}gc; 751 $str1pos = pos($$textref); 752 unless ($$textref =~ m{.*?\n(?=$label\n)}gc) { 753 _failmsg qq{Missing here doc terminator ('$label') after "} . 754 substr($$textref, $startpos, 20) . 755 q{..."}, 756 pos $$textref; 757 pos $$textref = $startpos; 758 return; 759 } 760 $rd1pos = pos($$textref); 761 $$textref =~ m{$label\n}gc; 762 $ld2pos = pos($$textref); 763 return ( 764 $startpos, $oppos-$startpos, # PREFIX 765 $oppos, length($op), # OPERATOR 766 $ld1pos, $extrapos-$ld1pos, # LEFT DEL 767 $str1pos, $rd1pos-$str1pos, # STR/PAT 768 $rd1pos, $ld2pos-$rd1pos, # RIGHT DEL 769 $ld2pos, 0, # NO 2ND LDEL 770 $ld2pos, 0, # NO 2ND STR 771 $ld2pos, 0, # NO 2ND RDEL 772 $ld2pos, 0, # NO MODIFIERS 773 $ld2pos, $textlen-$ld2pos, # REMAINDER 774 $extrapos, $str1pos-$extrapos, # FILLETED BIT 775 ); 776 } 777 778 $$textref =~ m/\G\s*/gc; 779 $ld1pos = pos($$textref); 780 $str1pos = $ld1pos+1; 781 782 unless ($$textref =~ m/\G(\S)/gc) # SHOULD USE LOOKAHEAD 783 { 784 _failmsg "No block delimiter found after quotelike $op", 785 pos $$textref; 786 pos $$textref = $startpos; 787 return; 788 } 789 pos($$textref) = $ld1pos; # HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN 790 my ($ldel1, $rdel1) = ("\Q$1","\Q$1"); 791 if ($ldel1 =~ /[[(<{]/) 792 { 793 $rdel1 =~ tr/[({</])}>/; 794 _match_bracketed($textref,"",$ldel1,"","",$rdel1) 795 || do { pos $$textref = $startpos; return }; 796 } 797 else 798 { 799 $$textref =~ /$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gcs 800 || do { pos $$textref = $startpos; return }; 801 } 802 $ld2pos = $rd1pos = pos($$textref)-1; 803 804 my $second_arg = $op =~ /s|tr|y/ ? 1 : 0; 805 if ($second_arg) 806 { 807 my ($ldel2, $rdel2); 808 if ($ldel1 =~ /[[(<{]/) 809 { 810 unless ($$textref =~ /\G\s*(\S)/gc) # SHOULD USE LOOKAHEAD 811 { 812 _failmsg "Missing second block for quotelike $op", 813 pos $$textref; 814 pos $$textref = $startpos; 815 return; 816 } 817 $ldel2 = $rdel2 = "\Q$1"; 818 $rdel2 =~ tr/[({</])}>/; 819 } 820 else 821 { 822 $ldel2 = $rdel2 = $ldel1; 823 } 824 $str2pos = $ld2pos+1; 825 826 if ($ldel2 =~ /[[(<{]/) 827 { 828 pos($$textref)--; # OVERCOME BROKEN LOOKAHEAD 829 _match_bracketed($textref,"",$ldel2,"","",$rdel2) 830 || do { pos $$textref = $startpos; return }; 831 } 832 else 833 { 834 $$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gcs 835 || do { pos $$textref = $startpos; return }; 836 } 837 $rd2pos = pos($$textref)-1; 838 } 839 else 840 { 841 $ld2pos = $str2pos = $rd2pos = $rd1pos; 842 } 843 844 $modpos = pos $$textref; 845 846 $$textref =~ m/\G($mods{$op})/gc; 847 my $endpos = pos $$textref; 848 849 return ( 850 $startpos, $oppos-$startpos, # PREFIX 851 $oppos, length($op), # OPERATOR 852 $ld1pos, 1, # LEFT DEL 853 $str1pos, $rd1pos-$str1pos, # STR/PAT 854 $rd1pos, 1, # RIGHT DEL 855 $ld2pos, $second_arg, # 2ND LDEL (MAYBE) 856 $str2pos, $rd2pos-$str2pos, # 2ND STR (MAYBE) 857 $rd2pos, $second_arg, # 2ND RDEL (MAYBE) 858 $modpos, $endpos-$modpos, # MODIFIERS 859 $endpos, $textlen-$endpos, # REMAINDER 860 ); 861} 862 863my $def_func = 864[ 865 sub { extract_variable($_[0], '') }, 866 sub { extract_quotelike($_[0],'') }, 867 sub { extract_codeblock($_[0],'{}','') }, 868]; 869 870sub extract_multiple (;$$$$) # ($text, $functions_ref, $max_fields, $ignoreunknown) 871{ 872 my $textref = defined($_[0]) ? \$_[0] : \$_; 873 my $posbug = pos; 874 my ($lastpos, $firstpos); 875 my @fields = (); 876 877 #for ($$textref) 878 { 879 my @func = defined $_[1] ? @{$_[1]} : @{$def_func}; 880 my $max = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000; 881 my $igunk = $_[3]; 882 883 pos $$textref ||= 0; 884 885 unless (wantarray) 886 { 887 use Carp; 888 carp "extract_multiple reset maximal count to 1 in scalar context" 889 if $^W && defined($_[2]) && $max > 1; 890 $max = 1 891 } 892 893 my $unkpos; 894 my $func; 895 my $class; 896 897 my @class; 898 foreach $func ( @func ) 899 { 900 if (ref($func) eq 'HASH') 901 { 902 push @class, (keys %$func)[0]; 903 $func = (values %$func)[0]; 904 } 905 else 906 { 907 push @class, undef; 908 } 909 } 910 911 FIELD: while (pos($$textref) < length($$textref)) 912 { 913 my ($field, $rem); 914 my @bits; 915 foreach my $i ( 0..$#func ) 916 { 917 my $pref; 918 $func = $func[$i]; 919 $class = $class[$i]; 920 $lastpos = pos $$textref; 921 if (ref($func) eq 'CODE') 922 { ($field,$rem,$pref) = @bits = $func->($$textref); 923 # print "[$field|$rem]" if $field; 924 } 925 elsif (ref($func) eq 'Text::Balanced::Extractor') 926 { @bits = $field = $func->extract($$textref) } 927 elsif( $$textref =~ m/\G$func/gc ) 928 { @bits = $field = defined($1) ? $1 : $& } 929 $pref ||= ""; 930 if (defined($field) && length($field)) 931 { 932 if (!$igunk) { 933 $unkpos = pos $$textref 934 if length($pref) && !defined($unkpos); 935 if (defined $unkpos) 936 { 937 push @fields, substr($$textref, $unkpos, $lastpos-$unkpos).$pref; 938 $firstpos = $unkpos unless defined $firstpos; 939 undef $unkpos; 940 last FIELD if @fields == $max; 941 } 942 } 943 push @fields, $class 944 ? bless (\$field, $class) 945 : $field; 946 $firstpos = $lastpos unless defined $firstpos; 947 $lastpos = pos $$textref; 948 last FIELD if @fields == $max; 949 next FIELD; 950 } 951 } 952 if ($$textref =~ /\G(.)/gcs) 953 { 954 $unkpos = pos($$textref)-1 955 unless $igunk || defined $unkpos; 956 } 957 } 958 959 if (defined $unkpos) 960 { 961 push @fields, substr($$textref, $unkpos); 962 $firstpos = $unkpos unless defined $firstpos; 963 $lastpos = length $$textref; 964 } 965 last; 966 } 967 968 pos $$textref = $lastpos; 969 return @fields if wantarray; 970 971 $firstpos ||= 0; 972 eval { substr($$textref,$firstpos,$lastpos-$firstpos)=""; 973 pos $$textref = $firstpos }; 974 return $fields[0]; 975} 976 977 978sub gen_extract_tagged # ($opentag, $closetag, $pre, \%options) 979{ 980 my $ldel = $_[0]; 981 my $rdel = $_[1]; 982 my $pre = defined $_[2] ? $_[2] : '\s*'; 983 my %options = defined $_[3] ? %{$_[3]} : (); 984 my $omode = defined $options{fail} ? $options{fail} : ''; 985 my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}}) 986 : defined($options{reject}) ? $options{reject} 987 : '' 988 ; 989 my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}}) 990 : defined($options{ignore}) ? $options{ignore} 991 : '' 992 ; 993 994 if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; } 995 996 my $posbug = pos; 997 for ($ldel, $pre, $bad, $ignore) { $_ = qr/$_/ if $_ } 998 pos = $posbug; 999 1000 my $closure = sub 1001 { 1002 my $textref = defined $_[0] ? \$_[0] : \$_; 1003 my @match = Text::Balanced::_match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore); 1004 1005 return _fail(wantarray, $textref) unless @match; 1006 return _succeed wantarray, $textref, 1007 $match[2], $match[3]+$match[5]+$match[7], # MATCH 1008 @match[8..9,0..1,2..7]; # REM, PRE, BITS 1009 }; 1010 1011 bless $closure, 'Text::Balanced::Extractor'; 1012} 1013 1014package Text::Balanced::Extractor; 1015 1016sub extract($$) # ($self, $text) 1017{ 1018 &{$_[0]}($_[1]); 1019} 1020 1021package Text::Balanced::ErrorMsg; 1022 1023use overload '""' => sub { "$_[0]->{error}, detected at offset $_[0]->{pos}" }; 1024 10251; 1026 1027__END__ 1028 1029=head1 NAME 1030 1031Text::Balanced - Extract delimited text sequences from strings. 1032 1033 1034=head1 SYNOPSIS 1035 1036 use Text::Balanced qw ( 1037 extract_delimited 1038 extract_bracketed 1039 extract_quotelike 1040 extract_codeblock 1041 extract_variable 1042 extract_tagged 1043 extract_multiple 1044 1045 gen_delimited_pat 1046 gen_extract_tagged 1047 ); 1048 1049 # Extract the initial substring of $text that is delimited by 1050 # two (unescaped) instances of the first character in $delim. 1051 1052 ($extracted, $remainder) = extract_delimited($text,$delim); 1053 1054 1055 # Extract the initial substring of $text that is bracketed 1056 # with a delimiter(s) specified by $delim (where the string 1057 # in $delim contains one or more of '(){}[]<>'). 1058 1059 ($extracted, $remainder) = extract_bracketed($text,$delim); 1060 1061 1062 # Extract the initial substring of $text that is bounded by 1063 # an XML tag. 1064 1065 ($extracted, $remainder) = extract_tagged($text); 1066 1067 1068 # Extract the initial substring of $text that is bounded by 1069 # a C<BEGIN>...C<END> pair. Don't allow nested C<BEGIN> tags 1070 1071 ($extracted, $remainder) = 1072 extract_tagged($text,"BEGIN","END",undef,{bad=>["BEGIN"]}); 1073 1074 1075 # Extract the initial substring of $text that represents a 1076 # Perl "quote or quote-like operation" 1077 1078 ($extracted, $remainder) = extract_quotelike($text); 1079 1080 1081 # Extract the initial substring of $text that represents a block 1082 # of Perl code, bracketed by any of character(s) specified by $delim 1083 # (where the string $delim contains one or more of '(){}[]<>'). 1084 1085 ($extracted, $remainder) = extract_codeblock($text,$delim); 1086 1087 1088 # Extract the initial substrings of $text that would be extracted by 1089 # one or more sequential applications of the specified functions 1090 # or regular expressions 1091 1092 @extracted = extract_multiple($text, 1093 [ \&extract_bracketed, 1094 \&extract_quotelike, 1095 \&some_other_extractor_sub, 1096 qr/[xyz]*/, 1097 'literal', 1098 ]); 1099 1100# Create a string representing an optimized pattern (a la Friedl) 1101# that matches a substring delimited by any of the specified characters 1102# (in this case: any type of quote or a slash) 1103 1104 $patstring = gen_delimited_pat(q{'"`/}); 1105 1106 1107# Generate a reference to an anonymous sub that is just like extract_tagged 1108# but pre-compiled and optimized for a specific pair of tags, and consequently 1109# much faster (i.e. 3 times faster). It uses qr// for better performance on 1110# repeated calls, so it only works under Perl 5.005 or later. 1111 1112 $extract_head = gen_extract_tagged('<HEAD>','</HEAD>'); 1113 1114 ($extracted, $remainder) = $extract_head->($text); 1115 1116 1117=head1 DESCRIPTION 1118 1119The various C<extract_...> subroutines may be used to 1120extract a delimited substring, possibly after skipping a 1121specified prefix string. By default, that prefix is 1122optional whitespace (C</\s*/>), but you can change it to whatever 1123you wish (see below). 1124 1125The substring to be extracted must appear at the 1126current C<pos> location of the string's variable 1127(or at index zero, if no C<pos> position is defined). 1128In other words, the C<extract_...> subroutines I<don't> 1129extract the first occurance of a substring anywhere 1130in a string (like an unanchored regex would). Rather, 1131they extract an occurance of the substring appearing 1132immediately at the current matching position in the 1133string (like a C<\G>-anchored regex would). 1134 1135 1136 1137=head2 General behaviour in list contexts 1138 1139In a list context, all the subroutines return a list, the first three 1140elements of which are always: 1141 1142=over 4 1143 1144=item [0] 1145 1146The extracted string, including the specified delimiters. 1147If the extraction fails an empty string is returned. 1148 1149=item [1] 1150 1151The remainder of the input string (i.e. the characters after the 1152extracted string). On failure, the entire string is returned. 1153 1154=item [2] 1155 1156The skipped prefix (i.e. the characters before the extracted string). 1157On failure, the empty string is returned. 1158 1159=back 1160 1161Note that in a list context, the contents of the original input text (the first 1162argument) are not modified in any way. 1163 1164However, if the input text was passed in a variable, that variable's 1165C<pos> value is updated to point at the first character after the 1166extracted text. That means that in a list context the various 1167subroutines can be used much like regular expressions. For example: 1168 1169 while ( $next = (extract_quotelike($text))[0] ) 1170 { 1171 # process next quote-like (in $next) 1172 } 1173 1174 1175=head2 General behaviour in scalar and void contexts 1176 1177In a scalar context, the extracted string is returned, having first been 1178removed from the input text. Thus, the following code also processes 1179each quote-like operation, but actually removes them from $text: 1180 1181 while ( $next = extract_quotelike($text) ) 1182 { 1183 # process next quote-like (in $next) 1184 } 1185 1186Note that if the input text is a read-only string (i.e. a literal), 1187no attempt is made to remove the extracted text. 1188 1189In a void context the behaviour of the extraction subroutines is 1190exactly the same as in a scalar context, except (of course) that the 1191extracted substring is not returned. 1192 1193=head2 A note about prefixes 1194 1195Prefix patterns are matched without any trailing modifiers (C</gimsox> etc.) 1196This can bite you if you're expecting a prefix specification like 1197'.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix 1198pattern will only succeed if the <H1> tag is on the current line, since 1199. normally doesn't match newlines. 1200 1201To overcome this limitation, you need to turn on /s matching within 1202the prefix pattern, using the C<(?s)> directive: '(?s).*?(?=<H1>)' 1203 1204 1205=head2 C<extract_delimited> 1206 1207The C<extract_delimited> function formalizes the common idiom 1208of extracting a single-character-delimited substring from the start of 1209a string. For example, to extract a single-quote delimited string, the 1210following code is typically used: 1211 1212 ($remainder = $text) =~ s/\A('(\\.|[^'])*')//s; 1213 $extracted = $1; 1214 1215but with C<extract_delimited> it can be simplified to: 1216 1217 ($extracted,$remainder) = extract_delimited($text, "'"); 1218 1219C<extract_delimited> takes up to four scalars (the input text, the 1220delimiters, a prefix pattern to be skipped, and any escape characters) 1221and extracts the initial substring of the text that 1222is appropriately delimited. If the delimiter string has multiple 1223characters, the first one encountered in the text is taken to delimit 1224the substring. 1225The third argument specifies a prefix pattern that is to be skipped 1226(but must be present!) before the substring is extracted. 1227The final argument specifies the escape character to be used for each 1228delimiter. 1229 1230All arguments are optional. If the escape characters are not specified, 1231every delimiter is escaped with a backslash (C<\>). 1232If the prefix is not specified, the 1233pattern C<'\s*'> - optional whitespace - is used. If the delimiter set 1234is also not specified, the set C</["'`]/> is used. If the text to be processed 1235is not specified either, C<$_> is used. 1236 1237In list context, C<extract_delimited> returns a array of three 1238elements, the extracted substring (I<including the surrounding 1239delimiters>), the remainder of the text, and the skipped prefix (if 1240any). If a suitable delimited substring is not found, the first 1241element of the array is the empty string, the second is the complete 1242original text, and the prefix returned in the third element is an 1243empty string. 1244 1245In a scalar context, just the extracted substring is returned. In 1246a void context, the extracted substring (and any prefix) are simply 1247removed from the beginning of the first argument. 1248 1249Examples: 1250 1251 # Remove a single-quoted substring from the very beginning of $text: 1252 1253 $substring = extract_delimited($text, "'", ''); 1254 1255 # Remove a single-quoted Pascalish substring (i.e. one in which 1256 # doubling the quote character escapes it) from the very 1257 # beginning of $text: 1258 1259 $substring = extract_delimited($text, "'", '', "'"); 1260 1261 # Extract a single- or double- quoted substring from the 1262 # beginning of $text, optionally after some whitespace 1263 # (note the list context to protect $text from modification): 1264 1265 ($substring) = extract_delimited $text, q{"'}; 1266 1267 1268 # Delete the substring delimited by the first '/' in $text: 1269 1270 $text = join '', (extract_delimited($text,'/','[^/]*')[2,1]; 1271 1272Note that this last example is I<not> the same as deleting the first 1273quote-like pattern. For instance, if C<$text> contained the string: 1274 1275 "if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }" 1276 1277then after the deletion it would contain: 1278 1279 "if ('.$UNIXCMD/s) { $cmd = $1; }" 1280 1281not: 1282 1283 "if ('./cmd' =~ ms) { $cmd = $1; }" 1284 1285 1286See L<"extract_quotelike"> for a (partial) solution to this problem. 1287 1288 1289=head2 C<extract_bracketed> 1290 1291Like C<"extract_delimited">, the C<extract_bracketed> function takes 1292up to three optional scalar arguments: a string to extract from, a delimiter 1293specifier, and a prefix pattern. As before, a missing prefix defaults to 1294optional whitespace and a missing text defaults to C<$_>. However, a missing 1295delimiter specifier defaults to C<'{}()[]E<lt>E<gt>'> (see below). 1296 1297C<extract_bracketed> extracts a balanced-bracket-delimited 1298substring (using any one (or more) of the user-specified delimiter 1299brackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also 1300respect quoted unbalanced brackets (see below). 1301 1302A "delimiter bracket" is a bracket in list of delimiters passed as 1303C<extract_bracketed>'s second argument. Delimiter brackets are 1304specified by giving either the left or right (or both!) versions 1305of the required bracket(s). Note that the order in which 1306two or more delimiter brackets are specified is not significant. 1307 1308A "balanced-bracket-delimited substring" is a substring bounded by 1309matched brackets, such that any other (left or right) delimiter 1310bracket I<within> the substring is also matched by an opposite 1311(right or left) delimiter bracket I<at the same level of nesting>. Any 1312type of bracket not in the delimiter list is treated as an ordinary 1313character. 1314 1315In other words, each type of bracket specified as a delimiter must be 1316balanced and correctly nested within the substring, and any other kind of 1317("non-delimiter") bracket in the substring is ignored. 1318 1319For example, given the string: 1320 1321 $text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }"; 1322 1323then a call to C<extract_bracketed> in a list context: 1324 1325 @result = extract_bracketed( $text, '{}' ); 1326 1327would return: 1328 1329 ( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" ) 1330 1331since both sets of C<'{..}'> brackets are properly nested and evenly balanced. 1332(In a scalar context just the first element of the array would be returned. In 1333a void context, C<$text> would be replaced by an empty string.) 1334 1335Likewise the call in: 1336 1337 @result = extract_bracketed( $text, '{[' ); 1338 1339would return the same result, since all sets of both types of specified 1340delimiter brackets are correctly nested and balanced. 1341 1342However, the call in: 1343 1344 @result = extract_bracketed( $text, '{([<' ); 1345 1346would fail, returning: 1347 1348 ( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }" ); 1349 1350because the embedded pairs of C<'(..)'>s and C<'[..]'>s are "cross-nested" and 1351the embedded C<'E<gt>'> is unbalanced. (In a scalar context, this call would 1352return an empty string. In a void context, C<$text> would be unchanged.) 1353 1354Note that the embedded single-quotes in the string don't help in this 1355case, since they have not been specified as acceptable delimiters and are 1356therefore treated as non-delimiter characters (and ignored). 1357 1358However, if a particular species of quote character is included in the 1359delimiter specification, then that type of quote will be correctly handled. 1360for example, if C<$text> is: 1361 1362 $text = '<A HREF=">>>>">link</A>'; 1363 1364then 1365 1366 @result = extract_bracketed( $text, '<">' ); 1367 1368returns: 1369 1370 ( '<A HREF=">>>>">', 'link</A>', "" ) 1371 1372as expected. Without the specification of C<"> as an embedded quoter: 1373 1374 @result = extract_bracketed( $text, '<>' ); 1375 1376the result would be: 1377 1378 ( '<A HREF=">', '>>>">link</A>', "" ) 1379 1380In addition to the quote delimiters C<'>, C<">, and C<`>, full Perl quote-like 1381quoting (i.e. q{string}, qq{string}, etc) can be specified by including the 1382letter 'q' as a delimiter. Hence: 1383 1384 @result = extract_bracketed( $text, '<q>' ); 1385 1386would correctly match something like this: 1387 1388 $text = '<leftop: conj /and/ conj>'; 1389 1390See also: C<"extract_quotelike"> and C<"extract_codeblock">. 1391 1392 1393=head2 C<extract_variable> 1394 1395C<extract_variable> extracts any valid Perl variable or 1396variable-involved expression, including scalars, arrays, hashes, array 1397accesses, hash look-ups, method calls through objects, subroutine calles 1398through subroutine references, etc. 1399 1400The subroutine takes up to two optional arguments: 1401 1402=over 4 1403 1404=item 1. 1405 1406A string to be processed (C<$_> if the string is omitted or C<undef>) 1407 1408=item 2. 1409 1410A string specifying a pattern to be matched as a prefix (which is to be 1411skipped). If omitted, optional whitespace is skipped. 1412 1413=back 1414 1415On success in a list context, an array of 3 elements is returned. The 1416elements are: 1417 1418=over 4 1419 1420=item [0] 1421 1422the extracted variable, or variablish expression 1423 1424=item [1] 1425 1426the remainder of the input text, 1427 1428=item [2] 1429 1430the prefix substring (if any), 1431 1432=back 1433 1434On failure, all of these values (except the remaining text) are C<undef>. 1435 1436In a scalar context, C<extract_variable> returns just the complete 1437substring that matched a variablish expression. C<undef> is returned on 1438failure. In addition, the original input text has the returned substring 1439(and any prefix) removed from it. 1440 1441In a void context, the input text just has the matched substring (and 1442any specified prefix) removed. 1443 1444 1445=head2 C<extract_tagged> 1446 1447C<extract_tagged> extracts and segments text between (balanced) 1448specified tags. 1449 1450The subroutine takes up to five optional arguments: 1451 1452=over 4 1453 1454=item 1. 1455 1456A string to be processed (C<$_> if the string is omitted or C<undef>) 1457 1458=item 2. 1459 1460A string specifying a pattern to be matched as the opening tag. 1461If the pattern string is omitted (or C<undef>) then a pattern 1462that matches any standard XML tag is used. 1463 1464=item 3. 1465 1466A string specifying a pattern to be matched at the closing tag. 1467If the pattern string is omitted (or C<undef>) then the closing 1468tag is constructed by inserting a C</> after any leading bracket 1469characters in the actual opening tag that was matched (I<not> the pattern 1470that matched the tag). For example, if the opening tag pattern 1471is specified as C<'{{\w+}}'> and actually matched the opening tag 1472C<"{{DATA}}">, then the constructed closing tag would be C<"{{/DATA}}">. 1473 1474=item 4. 1475 1476A string specifying a pattern to be matched as a prefix (which is to be 1477skipped). If omitted, optional whitespace is skipped. 1478 1479=item 5. 1480 1481A hash reference containing various parsing options (see below) 1482 1483=back 1484 1485The various options that can be specified are: 1486 1487=over 4 1488 1489=item C<reject =E<gt> $listref> 1490 1491The list reference contains one or more strings specifying patterns 1492that must I<not> appear within the tagged text. 1493 1494For example, to extract 1495an HTML link (which should not contain nested links) use: 1496 1497 extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} ); 1498 1499=item C<ignore =E<gt> $listref> 1500 1501The list reference contains one or more strings specifying patterns 1502that are I<not> be be treated as nested tags within the tagged text 1503(even if they would match the start tag pattern). 1504 1505For example, to extract an arbitrary XML tag, but ignore "empty" elements: 1506 1507 extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} ); 1508 1509(also see L<"gen_delimited_pat"> below). 1510 1511 1512=item C<fail =E<gt> $str> 1513 1514The C<fail> option indicates the action to be taken if a matching end 1515tag is not encountered (i.e. before the end of the string or some 1516C<reject> pattern matches). By default, a failure to match a closing 1517tag causes C<extract_tagged> to immediately fail. 1518 1519However, if the string value associated with <reject> is "MAX", then 1520C<extract_tagged> returns the complete text up to the point of failure. 1521If the string is "PARA", C<extract_tagged> returns only the first paragraph 1522after the tag (up to the first line that is either empty or contains 1523only whitespace characters). 1524If the string is "", the the default behaviour (i.e. failure) is reinstated. 1525 1526For example, suppose the start tag "/para" introduces a paragraph, which then 1527continues until the next "/endpara" tag or until another "/para" tag is 1528encountered: 1529 1530 $text = "/para line 1\n\nline 3\n/para line 4"; 1531 1532 extract_tagged($text, '/para', '/endpara', undef, 1533 {reject => '/para', fail => MAX ); 1534 1535 # EXTRACTED: "/para line 1\n\nline 3\n" 1536 1537Suppose instead, that if no matching "/endpara" tag is found, the "/para" 1538tag refers only to the immediately following paragraph: 1539 1540 $text = "/para line 1\n\nline 3\n/para line 4"; 1541 1542 extract_tagged($text, '/para', '/endpara', undef, 1543 {reject => '/para', fail => MAX ); 1544 1545 # EXTRACTED: "/para line 1\n" 1546 1547Note that the specified C<fail> behaviour applies to nested tags as well. 1548 1549=back 1550 1551On success in a list context, an array of 6 elements is returned. The elements are: 1552 1553=over 4 1554 1555=item [0] 1556 1557the extracted tagged substring (including the outermost tags), 1558 1559=item [1] 1560 1561the remainder of the input text, 1562 1563=item [2] 1564 1565the prefix substring (if any), 1566 1567=item [3] 1568 1569the opening tag 1570 1571=item [4] 1572 1573the text between the opening and closing tags 1574 1575=item [5] 1576 1577the closing tag (or "" if no closing tag was found) 1578 1579=back 1580 1581On failure, all of these values (except the remaining text) are C<undef>. 1582 1583In a scalar context, C<extract_tagged> returns just the complete 1584substring that matched a tagged text (including the start and end 1585tags). C<undef> is returned on failure. In addition, the original input 1586text has the returned substring (and any prefix) removed from it. 1587 1588In a void context, the input text just has the matched substring (and 1589any specified prefix) removed. 1590 1591 1592=head2 C<gen_extract_tagged> 1593 1594(Note: This subroutine is only available under Perl5.005) 1595 1596C<gen_extract_tagged> generates a new anonymous subroutine which 1597extracts text between (balanced) specified tags. In other words, 1598it generates a function identical in function to C<extract_tagged>. 1599 1600The difference between C<extract_tagged> and the anonymous 1601subroutines generated by 1602C<gen_extract_tagged>, is that those generated subroutines: 1603 1604=over 4 1605 1606=item * 1607 1608do not have to reparse tag specification or parsing options every time 1609they are called (whereas C<extract_tagged> has to effectively rebuild 1610its tag parser on every call); 1611 1612=item * 1613 1614make use of the new qr// construct to pre-compile the regexes they use 1615(whereas C<extract_tagged> uses standard string variable interpolation 1616to create tag-matching patterns). 1617 1618=back 1619 1620The subroutine takes up to four optional arguments (the same set as 1621C<extract_tagged> except for the string to be processed). It returns 1622a reference to a subroutine which in turn takes a single argument (the text to 1623be extracted from). 1624 1625In other words, the implementation of C<extract_tagged> is exactly 1626equivalent to: 1627 1628 sub extract_tagged 1629 { 1630 my $text = shift; 1631 $extractor = gen_extract_tagged(@_); 1632 return $extractor->($text); 1633 } 1634 1635(although C<extract_tagged> is not currently implemented that way, in order 1636to preserve pre-5.005 compatibility). 1637 1638Using C<gen_extract_tagged> to create extraction functions for specific tags 1639is a good idea if those functions are going to be called more than once, since 1640their performance is typically twice as good as the more general-purpose 1641C<extract_tagged>. 1642 1643 1644=head2 C<extract_quotelike> 1645 1646C<extract_quotelike> attempts to recognize, extract, and segment any 1647one of the various Perl quotes and quotelike operators (see 1648L<perlop(3)>) Nested backslashed delimiters, embedded balanced bracket 1649delimiters (for the quotelike operators), and trailing modifiers are 1650all caught. For example, in: 1651 1652 extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #' 1653 1654 extract_quotelike ' "You said, \"Use sed\"." ' 1655 1656 extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; ' 1657 1658 extract_quotelike ' tr/\\\/\\\\/\\\//ds; ' 1659 1660the full Perl quotelike operations are all extracted correctly. 1661 1662Note too that, when using the /x modifier on a regex, any comment 1663containing the current pattern delimiter will cause the regex to be 1664immediately terminated. In other words: 1665 1666 'm / 1667 (?i) # CASE INSENSITIVE 1668 [a-z_] # LEADING ALPHABETIC/UNDERSCORE 1669 [a-z0-9]* # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS 1670 /x' 1671 1672will be extracted as if it were: 1673 1674 'm / 1675 (?i) # CASE INSENSITIVE 1676 [a-z_] # LEADING ALPHABETIC/' 1677 1678This behaviour is identical to that of the actual compiler. 1679 1680C<extract_quotelike> takes two arguments: the text to be processed and 1681a prefix to be matched at the very beginning of the text. If no prefix 1682is specified, optional whitespace is the default. If no text is given, 1683C<$_> is used. 1684 1685In a list context, an array of 11 elements is returned. The elements are: 1686 1687=over 4 1688 1689=item [0] 1690 1691the extracted quotelike substring (including trailing modifiers), 1692 1693=item [1] 1694 1695the remainder of the input text, 1696 1697=item [2] 1698 1699the prefix substring (if any), 1700 1701=item [3] 1702 1703the name of the quotelike operator (if any), 1704 1705=item [4] 1706 1707the left delimiter of the first block of the operation, 1708 1709=item [5] 1710 1711the text of the first block of the operation 1712(that is, the contents of 1713a quote, the regex of a match or substitution or the target list of a 1714translation), 1715 1716=item [6] 1717 1718the right delimiter of the first block of the operation, 1719 1720=item [7] 1721 1722the left delimiter of the second block of the operation 1723(that is, if it is a C<s>, C<tr>, or C<y>), 1724 1725=item [8] 1726 1727the text of the second block of the operation 1728(that is, the replacement of a substitution or the translation list 1729of a translation), 1730 1731=item [9] 1732 1733the right delimiter of the second block of the operation (if any), 1734 1735=item [10] 1736 1737the trailing modifiers on the operation (if any). 1738 1739=back 1740 1741For each of the fields marked "(if any)" the default value on success is 1742an empty string. 1743On failure, all of these values (except the remaining text) are C<undef>. 1744 1745 1746In a scalar context, C<extract_quotelike> returns just the complete substring 1747that matched a quotelike operation (or C<undef> on failure). In a scalar or 1748void context, the input text has the same substring (and any specified 1749prefix) removed. 1750 1751Examples: 1752 1753 # Remove the first quotelike literal that appears in text 1754 1755 $quotelike = extract_quotelike($text,'.*?'); 1756 1757 # Replace one or more leading whitespace-separated quotelike 1758 # literals in $_ with "<QLL>" 1759 1760 do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@; 1761 1762 1763 # Isolate the search pattern in a quotelike operation from $text 1764 1765 ($op,$pat) = (extract_quotelike $text)[3,5]; 1766 if ($op =~ /[ms]/) 1767 { 1768 print "search pattern: $pat\n"; 1769 } 1770 else 1771 { 1772 print "$op is not a pattern matching operation\n"; 1773 } 1774 1775 1776=head2 C<extract_quotelike> and "here documents" 1777 1778C<extract_quotelike> can successfully extract "here documents" from an input 1779string, but with an important caveat in list contexts. 1780 1781Unlike other types of quote-like literals, a here document is rarely 1782a contiguous substring. For example, a typical piece of code using 1783here document might look like this: 1784 1785 <<'EOMSG' || die; 1786 This is the message. 1787 EOMSG 1788 exit; 1789 1790Given this as an input string in a scalar context, C<extract_quotelike> 1791would correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG", 1792leaving the string " || die;\nexit;" in the original variable. In other words, 1793the two separate pieces of the here document are successfully extracted and 1794concatenated. 1795 1796In a list context, C<extract_quotelike> would return the list 1797 1798=over 4 1799 1800=item [0] 1801 1802"<<'EOMSG'\nThis is the message.\nEOMSG\n" (i.e. the full extracted here document, 1803including fore and aft delimiters), 1804 1805=item [1] 1806 1807" || die;\nexit;" (i.e. the remainder of the input text, concatenated), 1808 1809=item [2] 1810 1811"" (i.e. the prefix substring -- trivial in this case), 1812 1813=item [3] 1814 1815"<<" (i.e. the "name" of the quotelike operator) 1816 1817=item [4] 1818 1819"'EOMSG'" (i.e. the left delimiter of the here document, including any quotes), 1820 1821=item [5] 1822 1823"This is the message.\n" (i.e. the text of the here document), 1824 1825=item [6] 1826 1827"EOMSG" (i.e. the right delimiter of the here document), 1828 1829=item [7..10] 1830 1831"" (a here document has no second left delimiter, second text, second right 1832delimiter, or trailing modifiers). 1833 1834=back 1835 1836However, the matching position of the input variable would be set to 1837"exit;" (i.e. I<after> the closing delimiter of the here document), 1838which would cause the earlier " || die;\nexit;" to be skipped in any 1839sequence of code fragment extractions. 1840 1841To avoid this problem, when it encounters a here document whilst 1842extracting from a modifiable string, C<extract_quotelike> silently 1843rearranges the string to an equivalent piece of Perl: 1844 1845 <<'EOMSG' 1846 This is the message. 1847 EOMSG 1848 || die; 1849 exit; 1850 1851in which the here document I<is> contiguous. It still leaves the 1852matching position after the here document, but now the rest of the line 1853on which the here document starts is not skipped. 1854 1855To prevent <extract_quotelike> from mucking about with the input in this way 1856(this is the only case where a list-context C<extract_quotelike> does so), 1857you can pass the input variable as an interpolated literal: 1858 1859 $quotelike = extract_quotelike("$var"); 1860 1861 1862=head2 C<extract_codeblock> 1863 1864C<extract_codeblock> attempts to recognize and extract a balanced 1865bracket delimited substring that may contain unbalanced brackets 1866inside Perl quotes or quotelike operations. That is, C<extract_codeblock> 1867is like a combination of C<"extract_bracketed"> and 1868C<"extract_quotelike">. 1869 1870C<extract_codeblock> takes the same initial three parameters as C<extract_bracketed>: 1871a text to process, a set of delimiter brackets to look for, and a prefix to 1872match first. It also takes an optional fourth parameter, which allows the 1873outermost delimiter brackets to be specified separately (see below). 1874 1875Omitting the first argument (input text) means process C<$_> instead. 1876Omitting the second argument (delimiter brackets) indicates that only C<'{'> is to be used. 1877Omitting the third argument (prefix argument) implies optional whitespace at the start. 1878Omitting the fourth argument (outermost delimiter brackets) indicates that the 1879value of the second argument is to be used for the outermost delimiters. 1880 1881Once the prefix an dthe outermost opening delimiter bracket have been 1882recognized, code blocks are extracted by stepping through the input text and 1883trying the following alternatives in sequence: 1884 1885=over 4 1886 1887=item 1. 1888 1889Try and match a closing delimiter bracket. If the bracket was the same 1890species as the last opening bracket, return the substring to that 1891point. If the bracket was mismatched, return an error. 1892 1893=item 2. 1894 1895Try to match a quote or quotelike operator. If found, call 1896C<extract_quotelike> to eat it. If C<extract_quotelike> fails, return 1897the error it returned. Otherwise go back to step 1. 1898 1899=item 3. 1900 1901Try to match an opening delimiter bracket. If found, call 1902C<extract_codeblock> recursively to eat the embedded block. If the 1903recursive call fails, return an error. Otherwise, go back to step 1. 1904 1905=item 4. 1906 1907Unconditionally match a bareword or any other single character, and 1908then go back to step 1. 1909 1910=back 1911 1912 1913Examples: 1914 1915 # Find a while loop in the text 1916 1917 if ($text =~ s/.*?while\s*\{/{/) 1918 { 1919 $loop = "while " . extract_codeblock($text); 1920 } 1921 1922 # Remove the first round-bracketed list (which may include 1923 # round- or curly-bracketed code blocks or quotelike operators) 1924 1925 extract_codeblock $text, "(){}", '[^(]*'; 1926 1927 1928The ability to specify a different outermost delimiter bracket is useful 1929in some circumstances. For example, in the Parse::RecDescent module, 1930parser actions which are to be performed only on a successful parse 1931are specified using a C<E<lt>defer:...E<gt>> directive. For example: 1932 1933 sentence: subject verb object 1934 <defer: {$::theVerb = $item{verb}} > 1935 1936Parse::RecDescent uses C<extract_codeblock($text, '{}E<lt>E<gt>')> to extract the code 1937within the C<E<lt>defer:...E<gt>> directive, but there's a problem. 1938 1939A deferred action like this: 1940 1941 <defer: {if ($count>10) {$count--}} > 1942 1943will be incorrectly parsed as: 1944 1945 <defer: {if ($count> 1946 1947because the "less than" operator is interpreted as a closing delimiter. 1948 1949But, by extracting the directive using 1950S<C<extract_codeblock($text, '{}', undef, 'E<lt>E<gt>')>> 1951the '>' character is only treated as a delimited at the outermost 1952level of the code block, so the directive is parsed correctly. 1953 1954=head2 C<extract_multiple> 1955 1956The C<extract_multiple> subroutine takes a string to be processed and a 1957list of extractors (subroutines or regular expressions) to apply to that string. 1958 1959In an array context C<extract_multiple> returns an array of substrings 1960of the original string, as extracted by the specified extractors. 1961In a scalar context, C<extract_multiple> returns the first 1962substring successfully extracted from the original string. In both 1963scalar and void contexts the original string has the first successfully 1964extracted substring removed from it. In all contexts 1965C<extract_multiple> starts at the current C<pos> of the string, and 1966sets that C<pos> appropriately after it matches. 1967 1968Hence, the aim of of a call to C<extract_multiple> in a list context 1969is to split the processed string into as many non-overlapping fields as 1970possible, by repeatedly applying each of the specified extractors 1971to the remainder of the string. Thus C<extract_multiple> is 1972a generalized form of Perl's C<split> subroutine. 1973 1974The subroutine takes up to four optional arguments: 1975 1976=over 4 1977 1978=item 1. 1979 1980A string to be processed (C<$_> if the string is omitted or C<undef>) 1981 1982=item 2. 1983 1984A reference to a list of subroutine references and/or qr// objects and/or 1985literal strings and/or hash references, specifying the extractors 1986to be used to split the string. If this argument is omitted (or 1987C<undef>) the list: 1988 1989 [ 1990 sub { extract_variable($_[0], '') }, 1991 sub { extract_quotelike($_[0],'') }, 1992 sub { extract_codeblock($_[0],'{}','') }, 1993 ] 1994 1995is used. 1996 1997 1998=item 3. 1999 2000An number specifying the maximum number of fields to return. If this 2001argument is omitted (or C<undef>), split continues as long as possible. 2002 2003If the third argument is I<N>, then extraction continues until I<N> fields 2004have been successfully extracted, or until the string has been completely 2005processed. 2006 2007Note that in scalar and void contexts the value of this argument is 2008automatically reset to 1 (under C<-w>, a warning is issued if the argument 2009has to be reset). 2010 2011=item 4. 2012 2013A value indicating whether unmatched substrings (see below) within the 2014text should be skipped or returned as fields. If the value is true, 2015such substrings are skipped. Otherwise, they are returned. 2016 2017=back 2018 2019The extraction process works by applying each extractor in 2020sequence to the text string. 2021 2022If the extractor is a subroutine it is called in a list context and is 2023expected to return a list of a single element, namely the extracted 2024text. It may optionally also return two further arguments: a string 2025representing the text left after extraction (like $' for a pattern 2026match), and a string representing any prefix skipped before the 2027extraction (like $` in a pattern match). Note that this is designed 2028to facilitate the use of other Text::Balanced subroutines with 2029C<extract_multiple>. Note too that the value returned by an extractor 2030subroutine need not bear any relationship to the corresponding substring 2031of the original text (see examples below). 2032 2033If the extractor is a precompiled regular expression or a string, 2034it is matched against the text in a scalar context with a leading 2035'\G' and the gc modifiers enabled. The extracted value is either 2036$1 if that variable is defined after the match, or else the 2037complete match (i.e. $&). 2038 2039If the extractor is a hash reference, it must contain exactly one element. 2040The value of that element is one of the 2041above extractor types (subroutine reference, regular expression, or string). 2042The key of that element is the name of a class into which the successful 2043return value of the extractor will be blessed. 2044 2045If an extractor returns a defined value, that value is immediately 2046treated as the next extracted field and pushed onto the list of fields. 2047If the extractor was specified in a hash reference, the field is also 2048blessed into the appropriate class, 2049 2050If the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is 2051assumed to have failed to extract. 2052If none of the extractor subroutines succeeds, then one 2053character is extracted from the start of the text and the extraction 2054subroutines reapplied. Characters which are thus removed are accumulated and 2055eventually become the next field (unless the fourth argument is true, in which 2056case they are disgarded). 2057 2058For example, the following extracts substrings that are valid Perl variables: 2059 2060 @fields = extract_multiple($text, 2061 [ sub { extract_variable($_[0]) } ], 2062 undef, 1); 2063 2064This example separates a text into fields which are quote delimited, 2065curly bracketed, and anything else. The delimited and bracketed 2066parts are also blessed to identify them (the "anything else" is unblessed): 2067 2068 @fields = extract_multiple($text, 2069 [ 2070 { Delim => sub { extract_delimited($_[0],q{'"}) } }, 2071 { Brack => sub { extract_bracketed($_[0],'{}') } }, 2072 ]); 2073 2074This call extracts the next single substring that is a valid Perl quotelike 2075operator (and removes it from $text): 2076 2077 $quotelike = extract_multiple($text, 2078 [ 2079 sub { extract_quotelike($_[0]) }, 2080 ], undef, 1); 2081 2082Finally, here is yet another way to do comma-separated value parsing: 2083 2084 @fields = extract_multiple($csv_text, 2085 [ 2086 sub { extract_delimited($_[0],q{'"}) }, 2087 qr/([^,]+)(.*)/, 2088 ], 2089 undef,1); 2090 2091The list in the second argument means: 2092I<"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...">. 2093The undef third argument means: 2094I<"...as many times as possible...">, 2095and the true value in the fourth argument means 2096I<"...discarding anything else that appears (i.e. the commas)">. 2097 2098If you wanted the commas preserved as separate fields (i.e. like split 2099does if your split pattern has capturing parentheses), you would 2100just make the last parameter undefined (or remove it). 2101 2102 2103=head2 C<gen_delimited_pat> 2104 2105The C<gen_delimited_pat> subroutine takes a single (string) argument and 2106 > builds a Friedl-style optimized regex that matches a string delimited 2107by any one of the characters in the single argument. For example: 2108 2109 gen_delimited_pat(q{'"}) 2110 2111returns the regex: 2112 2113 (?:\"(?:\\\"|(?!\").)*\"|\'(?:\\\'|(?!\').)*\') 2114 2115Note that the specified delimiters are automatically quotemeta'd. 2116 2117A typical use of C<gen_delimited_pat> would be to build special purpose tags 2118for C<extract_tagged>. For example, to properly ignore "empty" XML elements 2119(which might contain quoted strings): 2120 2121 my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>'; 2122 2123 extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} ); 2124 2125 2126C<gen_delimited_pat> may also be called with an optional second argument, 2127which specifies the "escape" character(s) to be used for each delimiter. 2128For example to match a Pascal-style string (where ' is the delimiter 2129and '' is a literal ' within the string): 2130 2131 gen_delimited_pat(q{'},q{'}); 2132 2133Different escape characters can be specified for different delimiters. 2134For example, to specify that '/' is the escape for single quotes 2135and '%' is the escape for double quotes: 2136 2137 gen_delimited_pat(q{'"},q{/%}); 2138 2139If more delimiters than escape chars are specified, the last escape char 2140is used for the remaining delimiters. 2141If no escape char is specified for a given specified delimiter, '\' is used. 2142 2143Note that 2144C<gen_delimited_pat> was previously called 2145C<delimited_pat>. That name may still be used, but is now deprecated. 2146 2147 2148=head1 DIAGNOSTICS 2149 2150In a list context, all the functions return C<(undef,$original_text)> 2151on failure. In a scalar context, failure is indicated by returning C<undef> 2152(in this case the input text is not modified in any way). 2153 2154In addition, on failure in I<any> context, the C<$@> variable is set. 2155Accessing C<$@-E<gt>{error}> returns one of the error diagnostics listed 2156below. 2157Accessing C<$@-E<gt>{pos}> returns the offset into the original string at 2158which the error was detected (although not necessarily where it occurred!) 2159Printing C<$@> directly produces the error message, with the offset appended. 2160On success, the C<$@> variable is guaranteed to be C<undef>. 2161 2162The available diagnostics are: 2163 2164=over 4 2165 2166=item C<Did not find a suitable bracket: "%s"> 2167 2168The delimiter provided to C<extract_bracketed> was not one of 2169C<'()[]E<lt>E<gt>{}'>. 2170 2171=item C<Did not find prefix: /%s/> 2172 2173A non-optional prefix was specified but wasn't found at the start of the text. 2174 2175=item C<Did not find opening bracket after prefix: "%s"> 2176 2177C<extract_bracketed> or C<extract_codeblock> was expecting a 2178particular kind of bracket at the start of the text, and didn't find it. 2179 2180=item C<No quotelike operator found after prefix: "%s"> 2181 2182C<extract_quotelike> didn't find one of the quotelike operators C<q>, 2183C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> at the start of the substring 2184it was extracting. 2185 2186=item C<Unmatched closing bracket: "%c"> 2187 2188C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> encountered 2189a closing bracket where none was expected. 2190 2191=item C<Unmatched opening bracket(s): "%s"> 2192 2193C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> ran 2194out of characters in the text before closing one or more levels of nested 2195brackets. 2196 2197=item C<Unmatched embedded quote (%s)> 2198 2199C<extract_bracketed> attempted to match an embedded quoted substring, but 2200failed to find a closing quote to match it. 2201 2202=item C<Did not find closing delimiter to match '%s'> 2203 2204C<extract_quotelike> was unable to find a closing delimiter to match the 2205one that opened the quote-like operation. 2206 2207=item C<Mismatched closing bracket: expected "%c" but found "%s"> 2208 2209C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> found 2210a valid bracket delimiter, but it was the wrong species. This usually 2211indicates a nesting error, but may indicate incorrect quoting or escaping. 2212 2213=item C<No block delimiter found after quotelike "%s"> 2214 2215C<extract_quotelike> or C<extract_codeblock> found one of the 2216quotelike operators C<q>, C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> 2217without a suitable block after it. 2218 2219=item C<Did not find leading dereferencer> 2220 2221C<extract_variable> was expecting one of '$', '@', or '%' at the start of 2222a variable, but didn't find any of them. 2223 2224=item C<Bad identifier after dereferencer> 2225 2226C<extract_variable> found a '$', '@', or '%' indicating a variable, but that 2227character was not followed by a legal Perl identifier. 2228 2229=item C<Did not find expected opening bracket at %s> 2230 2231C<extract_codeblock> failed to find any of the outermost opening brackets 2232that were specified. 2233 2234=item C<Improperly nested codeblock at %s> 2235 2236A nested code block was found that started with a delimiter that was specified 2237as being only to be used as an outermost bracket. 2238 2239=item C<Missing second block for quotelike "%s"> 2240 2241C<extract_codeblock> or C<extract_quotelike> found one of the 2242quotelike operators C<s>, C<tr> or C<y> followed by only one block. 2243 2244=item C<No match found for opening bracket> 2245 2246C<extract_codeblock> failed to find a closing bracket to match the outermost 2247opening bracket. 2248 2249=item C<Did not find opening tag: /%s/> 2250 2251C<extract_tagged> did not find a suitable opening tag (after any specified 2252prefix was removed). 2253 2254=item C<Unable to construct closing tag to match: /%s/> 2255 2256C<extract_tagged> matched the specified opening tag and tried to 2257modify the matched text to produce a matching closing tag (because 2258none was specified). It failed to generate the closing tag, almost 2259certainly because the opening tag did not start with a 2260bracket of some kind. 2261 2262=item C<Found invalid nested tag: %s> 2263 2264C<extract_tagged> found a nested tag that appeared in the "reject" list 2265(and the failure mode was not "MAX" or "PARA"). 2266 2267=item C<Found unbalanced nested tag: %s> 2268 2269C<extract_tagged> found a nested opening tag that was not matched by a 2270corresponding nested closing tag (and the failure mode was not "MAX" or "PARA"). 2271 2272=item C<Did not find closing tag> 2273 2274C<extract_tagged> reached the end of the text without finding a closing tag 2275to match the original opening tag (and the failure mode was not 2276"MAX" or "PARA"). 2277 2278 2279 2280 2281=back 2282 2283 2284=head1 AUTHOR 2285 2286Damian Conway (damian@conway.org) 2287 2288 2289=head1 BUGS AND IRRITATIONS 2290 2291There are undoubtedly serious bugs lurking somewhere in this code, if 2292only because parts of it give the impression of understanding a great deal 2293more about Perl than they really do. 2294 2295Bug reports and other feedback are most welcome. 2296 2297 2298=head1 COPYRIGHT 2299 2300 Copyright (c) 1997-2001, Damian Conway. All Rights Reserved. 2301 This module is free software. It may be used, redistributed 2302 and/or modified under the same terms as Perl itself. 2303