1package Pod::Simple::BlackBox; 2# 3# "What's in the box?" "Pain." 4# 5########################################################################### 6# 7# This is where all the scary things happen: parsing lines into 8# paragraphs; and then into directives, verbatims, and then also 9# turning formatting sequences into treelets. 10# 11# Are you really sure you want to read this code? 12# 13#----------------------------------------------------------------------------- 14# 15# The basic work of this module Pod::Simple::BlackBox is doing the dirty work 16# of parsing Pod into treelets (generally one per non-verbatim paragraph), and 17# to call the proper callbacks on the treelets. 18# 19# Every node in a treelet is a ['name', {attrhash}, ...children...] 20 21use integer; # vroom! 22use strict; 23use Carp (); 24use vars qw($VERSION ); 25$VERSION = '3.32'; 26#use constant DEBUG => 7; 27BEGIN { 28 require Pod::Simple; 29 *DEBUG = \&Pod::Simple::DEBUG unless defined &DEBUG 30} 31 32# Matches a character iff the character will have a different meaning 33# if we choose CP1252 vs UTF-8 if there is no =encoding line. 34# This is broken for early Perls on non-ASCII platforms. 35my $non_ascii_re = eval "qr/[[:^ascii:]]/"; 36$non_ascii_re = qr/[\x80-\xFF]/ if ! defined $non_ascii_re; 37 38my $utf8_bom; 39if (($] ge 5.007_003)) { 40 $utf8_bom = "\x{FEFF}"; 41 utf8::encode($utf8_bom); 42} else { 43 $utf8_bom = "\xEF\xBB\xBF"; # No EBCDIC BOM detection for early Perls. 44} 45 46#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 47 48sub parse_line { shift->parse_lines(@_) } # alias 49 50# - - - Turn back now! Run away! - - - 51 52sub parse_lines { # Usage: $parser->parse_lines(@lines) 53 # an undef means end-of-stream 54 my $self = shift; 55 56 my $code_handler = $self->{'code_handler'}; 57 my $cut_handler = $self->{'cut_handler'}; 58 my $wl_handler = $self->{'whiteline_handler'}; 59 $self->{'line_count'} ||= 0; 60 61 my $scratch; 62 63 DEBUG > 4 and 64 print STDERR "# Parsing starting at line ", $self->{'line_count'}, ".\n"; 65 66 DEBUG > 5 and 67 print STDERR "# About to parse lines: ", 68 join(' ', map defined($_) ? "[$_]" : "EOF", @_), "\n"; 69 70 my $paras = ($self->{'paras'} ||= []); 71 # paragraph buffer. Because we need to defer processing of =over 72 # directives and verbatim paragraphs. We call _ponder_paragraph_buffer 73 # to process this. 74 75 $self->{'pod_para_count'} ||= 0; 76 77 my $line; 78 foreach my $source_line (@_) { 79 if( $self->{'source_dead'} ) { 80 DEBUG > 4 and print STDERR "# Source is dead.\n"; 81 last; 82 } 83 84 unless( defined $source_line ) { 85 DEBUG > 4 and print STDERR "# Undef-line seen.\n"; 86 87 push @$paras, ['~end', {'start_line' => $self->{'line_count'}}]; 88 push @$paras, $paras->[-1], $paras->[-1]; 89 # So that it definitely fills the buffer. 90 $self->{'source_dead'} = 1; 91 $self->_ponder_paragraph_buffer; 92 next; 93 } 94 95 96 if( $self->{'line_count'}++ ) { 97 ($line = $source_line) =~ tr/\n\r//d; 98 # If we don't have two vars, we'll end up with that there 99 # tr/// modding the (potentially read-only) original source line! 100 101 } else { 102 DEBUG > 2 and print STDERR "First line: [$source_line]\n"; 103 104 if( ($line = $source_line) =~ s/^$utf8_bom//s ) { 105 DEBUG and print STDERR "UTF-8 BOM seen. Faking a '=encoding utf8'.\n"; 106 $self->_handle_encoding_line( "=encoding utf8" ); 107 delete $self->{'_processed_encoding'}; 108 $line =~ tr/\n\r//d; 109 110 } elsif( $line =~ s/^\xFE\xFF//s ) { 111 DEBUG and print STDERR "Big-endian UTF-16 BOM seen. Aborting parsing.\n"; 112 $self->scream( 113 $self->{'line_count'}, 114 "UTF16-BE Byte Encoding Mark found; but Pod::Simple v$Pod::Simple::VERSION doesn't implement UTF16 yet." 115 ); 116 splice @_; 117 push @_, undef; 118 next; 119 120 # TODO: implement somehow? 121 122 } elsif( $line =~ s/^\xFF\xFE//s ) { 123 DEBUG and print STDERR "Little-endian UTF-16 BOM seen. Aborting parsing.\n"; 124 $self->scream( 125 $self->{'line_count'}, 126 "UTF16-LE Byte Encoding Mark found; but Pod::Simple v$Pod::Simple::VERSION doesn't implement UTF16 yet." 127 ); 128 splice @_; 129 push @_, undef; 130 next; 131 132 # TODO: implement somehow? 133 134 } else { 135 DEBUG > 2 and print STDERR "First line is BOM-less.\n"; 136 ($line = $source_line) =~ tr/\n\r//d; 137 } 138 } 139 140 if(!$self->{'parse_characters'} && !$self->{'encoding'} 141 && ($self->{'in_pod'} || $line =~ /^=/s) 142 && $line =~ /$non_ascii_re/ 143 ) { 144 145 my $encoding; 146 147 # No =encoding line, and we are at the first line in the input that 148 # contains a non-ascii byte, that is one whose meaning varies depending 149 # on whether the file is encoded in UTF-8 or CP1252, which are the two 150 # possibilities permitted by the pod spec. (ASCII is assumed if the 151 # file only contains ASCII bytes.) In order to process this line, we 152 # need to figure out what encoding we will use for the file. 153 # 154 # Strictly speaking ISO 8859-1 (Latin 1) refers to the code points 155 # 160-255, but it is used here, as it often colloquially is, to refer to 156 # the complete set of code points 0-255, including ASCII (0-127), the C1 157 # controls (128-159), and strict Latin 1 (160-255). 158 # 159 # CP1252 is effectively a superset of Latin 1, because it differs only 160 # from colloquial 8859-1 in the C1 controls, which are very unlikely to 161 # actually be present in 8859-1 files, so can be used for other purposes 162 # without conflict. CP 1252 uses most of them for graphic characters. 163 # 164 # Note that all ASCII-range bytes represent their corresponding code 165 # points in CP1252 and UTF-8. In ASCII platform UTF-8 all other code 166 # points require multiple (non-ASCII) bytes to represent. (A separate 167 # paragraph for EBCDIC is below.) The multi-byte representation is 168 # quite structured. If we find an isolated byte that requires multiple 169 # bytes to represent in UTF-8, we know that the encoding is not UTF-8. 170 # If we find a sequence of bytes that violates the UTF-8 structure, we 171 # also can presume the encoding isn't UTF-8, and hence must be 1252. 172 # 173 # But there are ambiguous cases where we could guess wrong. If so, the 174 # user will end up having to supply an =encoding line. We use all 175 # readily available information to improve our chances of guessing 176 # right. The odds of something not being UTF-8, but still passing a 177 # UTF-8 validity test go down very rapidly with increasing length of the 178 # sequence. Therefore we look at all the maximal length non-ascii 179 # sequences on the line. If any of the sequences can't be UTF-8, we 180 # quit there and choose CP1252. If all could be UTF-8, we guess UTF-8. 181 # 182 # On EBCDIC platforms, the situation is somewhat different. In 183 # UTF-EBCDIC, not only do ASCII-range bytes represent their code points, 184 # but so do the bytes that are for the C1 controls. Recall that these 185 # correspond to the unused portion of 8859-1 that 1252 mostly takes 186 # over. That means that there are fewer code points that are 187 # represented by multi-bytes. But, note that the these controls are 188 # very unlikely to be in pod text. So if we encounter one of them, it 189 # means that it is quite likely CP1252 and not UTF-8. The net result is 190 # the same code below is used for both platforms. 191 while ($line =~ m/($non_ascii_re+)/g) { 192 my $non_ascii_seq = $1; 193 194 if (length $non_ascii_seq == 1) { 195 $encoding = 'CP1252'; 196 goto guessed; 197 } elsif ($] ge 5.007_003) { 198 199 # On Perls that have this function, we can see if the sequence is 200 # valid UTF-8 or not. 201 if (! utf8::decode($non_ascii_seq)) { 202 $encoding = 'CP1252'; 203 goto guessed; 204 } 205 } elsif (ord("A") == 65) { # An early Perl, ASCII platform 206 207 # Without utf8::decode, it's a lot harder to do a rigorous check 208 # (though some early releases had a different function that 209 # accomplished the same thing). Since these are ancient Perls, not 210 # likely to be in use today, we take the easy way out, and look at 211 # just the first two bytes of the sequence to see if they are the 212 # start of a UTF-8 character. In ASCII UTF-8, continuation bytes 213 # must be between 0x80 and 0xBF. Start bytes can range from 0xC2 214 # through 0xFF, but anything above 0xF4 is not Unicode, and hence 215 # extremely unlikely to be in a pod. 216 if ($non_ascii_seq !~ /^[\xC2-\xF4][\x80-\xBF]/) { 217 $encoding = 'CP1252'; 218 goto guessed; 219 } 220 221 # We don't bother doing anything special for EBCDIC on early Perls. 222 # If there is a solitary variant, CP1252 will be chosen; otherwise 223 # UTF-8. 224 } 225 } # End of loop through all variant sequences on the line 226 227 # All sequences in the line could be UTF-8. Guess that. 228 $encoding = 'UTF-8'; 229 230 guessed: 231 $self->_handle_encoding_line( "=encoding $encoding" ); 232 delete $self->{'_processed_encoding'}; 233 $self->{'_transcoder'} && $self->{'_transcoder'}->($line); 234 235 my ($word) = $line =~ /(\S*$non_ascii_re\S*)/; 236 237 $self->whine( 238 $self->{'line_count'}, 239 "Non-ASCII character seen before =encoding in '$word'. Assuming $encoding" 240 ); 241 } 242 243 DEBUG > 5 and print STDERR "# Parsing line: [$line]\n"; 244 245 if(!$self->{'in_pod'}) { 246 if($line =~ m/^=([a-zA-Z][a-zA-Z0-9]*)(?:\s|$)/s) { 247 if($1 eq 'cut') { 248 $self->scream( 249 $self->{'line_count'}, 250 "=cut found outside a pod block. Skipping to next block." 251 ); 252 253 ## Before there were errata sections in the world, it was 254 ## least-pessimal to abort processing the file. But now we can 255 ## just barrel on thru (but still not start a pod block). 256 #splice @_; 257 #push @_, undef; 258 259 next; 260 } else { 261 $self->{'in_pod'} = $self->{'start_of_pod_block'} 262 = $self->{'last_was_blank'} = 1; 263 # And fall thru to the pod-mode block further down 264 } 265 } else { 266 DEBUG > 5 and print STDERR "# It's a code-line.\n"; 267 $code_handler->(map $_, $line, $self->{'line_count'}, $self) 268 if $code_handler; 269 # Note: this may cause code to be processed out of order relative 270 # to pods, but in order relative to cuts. 271 272 # Note also that we haven't yet applied the transcoding to $line 273 # by time we call $code_handler! 274 275 if( $line =~ m/^#\s*line\s+(\d+)\s*(?:\s"([^"]+)")?\s*$/ ) { 276 # That RE is from perlsyn, section "Plain Old Comments (Not!)", 277 #$fname = $2 if defined $2; 278 #DEBUG > 1 and defined $2 and print STDERR "# Setting fname to \"$fname\"\n"; 279 DEBUG > 1 and print STDERR "# Setting nextline to $1\n"; 280 $self->{'line_count'} = $1 - 1; 281 } 282 283 next; 284 } 285 } 286 287 # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 288 # Else we're in pod mode: 289 290 # Apply any necessary transcoding: 291 $self->{'_transcoder'} && $self->{'_transcoder'}->($line); 292 293 # HERE WE CATCH =encoding EARLY! 294 if( $line =~ m/^=encoding\s+\S+\s*$/s ) { 295 next if $self->parse_characters; # Ignore this line 296 $line = $self->_handle_encoding_line( $line ); 297 } 298 299 if($line =~ m/^=cut/s) { 300 # here ends the pod block, and therefore the previous pod para 301 DEBUG > 1 and print STDERR "Noting =cut at line ${$self}{'line_count'}\n"; 302 $self->{'in_pod'} = 0; 303 # ++$self->{'pod_para_count'}; 304 $self->_ponder_paragraph_buffer(); 305 # by now it's safe to consider the previous paragraph as done. 306 $cut_handler->(map $_, $line, $self->{'line_count'}, $self) 307 if $cut_handler; 308 309 # TODO: add to docs: Note: this may cause cuts to be processed out 310 # of order relative to pods, but in order relative to code. 311 312 } elsif($line =~ m/^(\s*)$/s) { # it's a blank line 313 if (defined $1 and $1 =~ /[^\S\r\n]/) { # it's a white line 314 $wl_handler->(map $_, $line, $self->{'line_count'}, $self) 315 if $wl_handler; 316 } 317 318 if(!$self->{'start_of_pod_block'} and @$paras and $paras->[-1][0] eq '~Verbatim') { 319 DEBUG > 1 and print STDERR "Saving blank line at line ${$self}{'line_count'}\n"; 320 push @{$paras->[-1]}, $line; 321 } # otherwise it's not interesting 322 323 if(!$self->{'start_of_pod_block'} and !$self->{'last_was_blank'}) { 324 DEBUG > 1 and print STDERR "Noting para ends with blank line at ${$self}{'line_count'}\n"; 325 } 326 327 $self->{'last_was_blank'} = 1; 328 329 } elsif($self->{'last_was_blank'}) { # A non-blank line starting a new para... 330 331 if($line =~ m/^(=[a-zA-Z][a-zA-Z0-9]*)(?:\s+|$)(.*)/s) { 332 # THIS IS THE ONE PLACE WHERE WE CONSTRUCT NEW DIRECTIVE OBJECTS 333 my $new = [$1, {'start_line' => $self->{'line_count'}}, $2]; 334 # Note that in "=head1 foo", the WS is lost. 335 # Example: ['=head1', {'start_line' => 123}, ' foo'] 336 337 ++$self->{'pod_para_count'}; 338 339 $self->_ponder_paragraph_buffer(); 340 # by now it's safe to consider the previous paragraph as done. 341 342 push @$paras, $new; # the new incipient paragraph 343 DEBUG > 1 and print STDERR "Starting new ${$paras}[-1][0] para at line ${$self}{'line_count'}\n"; 344 345 } elsif($line =~ m/^\s/s) { 346 347 if(!$self->{'start_of_pod_block'} and @$paras and $paras->[-1][0] eq '~Verbatim') { 348 DEBUG > 1 and print STDERR "Resuming verbatim para at line ${$self}{'line_count'}\n"; 349 push @{$paras->[-1]}, $line; 350 } else { 351 ++$self->{'pod_para_count'}; 352 $self->_ponder_paragraph_buffer(); 353 # by now it's safe to consider the previous paragraph as done. 354 DEBUG > 1 and print STDERR "Starting verbatim para at line ${$self}{'line_count'}\n"; 355 push @$paras, ['~Verbatim', {'start_line' => $self->{'line_count'}}, $line]; 356 } 357 } else { 358 ++$self->{'pod_para_count'}; 359 $self->_ponder_paragraph_buffer(); 360 # by now it's safe to consider the previous paragraph as done. 361 push @$paras, ['~Para', {'start_line' => $self->{'line_count'}}, $line]; 362 DEBUG > 1 and print STDERR "Starting plain para at line ${$self}{'line_count'}\n"; 363 } 364 $self->{'last_was_blank'} = $self->{'start_of_pod_block'} = 0; 365 366 } else { 367 # It's a non-blank line /continuing/ the current para 368 if(@$paras) { 369 DEBUG > 2 and print STDERR "Line ${$self}{'line_count'} continues current paragraph\n"; 370 push @{$paras->[-1]}, $line; 371 } else { 372 # Unexpected case! 373 die "Continuing a paragraph but \@\$paras is empty?"; 374 } 375 $self->{'last_was_blank'} = $self->{'start_of_pod_block'} = 0; 376 } 377 378 } # ends the big while loop 379 380 DEBUG > 1 and print STDERR (pretty(@$paras), "\n"); 381 return $self; 382} 383 384#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 385 386sub _handle_encoding_line { 387 my($self, $line) = @_; 388 389 return if $self->parse_characters; 390 391 # The point of this routine is to set $self->{'_transcoder'} as indicated. 392 393 return $line unless $line =~ m/^=encoding\s+(\S+)\s*$/s; 394 DEBUG > 1 and print STDERR "Found an encoding line \"=encoding $1\"\n"; 395 396 my $e = $1; 397 my $orig = $e; 398 push @{ $self->{'encoding_command_reqs'} }, "=encoding $orig"; 399 400 my $enc_error; 401 402 # Cf. perldoc Encode and perldoc Encode::Supported 403 404 require Pod::Simple::Transcode; 405 406 if( $self->{'encoding'} ) { 407 my $norm_current = $self->{'encoding'}; 408 my $norm_e = $e; 409 foreach my $that ($norm_current, $norm_e) { 410 $that = lc($that); 411 $that =~ s/[-_]//g; 412 } 413 if($norm_current eq $norm_e) { 414 DEBUG > 1 and print STDERR "The '=encoding $orig' line is ", 415 "redundant. ($norm_current eq $norm_e). Ignoring.\n"; 416 $enc_error = ''; 417 # But that doesn't necessarily mean that the earlier one went okay 418 } else { 419 $enc_error = "Encoding is already set to " . $self->{'encoding'}; 420 DEBUG > 1 and print STDERR $enc_error; 421 } 422 } elsif ( 423 # OK, let's turn on the encoding 424 do { 425 DEBUG > 1 and print STDERR " Setting encoding to $e\n"; 426 $self->{'encoding'} = $e; 427 1; 428 } 429 and $e eq 'HACKRAW' 430 ) { 431 DEBUG and print STDERR " Putting in HACKRAW (no-op) encoding mode.\n"; 432 433 } elsif( Pod::Simple::Transcode::->encoding_is_available($e) ) { 434 435 die($enc_error = "WHAT? _transcoder is already set?!") 436 if $self->{'_transcoder'}; # should never happen 437 require Pod::Simple::Transcode; 438 $self->{'_transcoder'} = Pod::Simple::Transcode::->make_transcoder($e); 439 eval { 440 my @x = ('', "abc", "123"); 441 $self->{'_transcoder'}->(@x); 442 }; 443 $@ && die( $enc_error = 444 "Really unexpected error setting up encoding $e: $@\nAborting" 445 ); 446 $self->{'detected_encoding'} = $e; 447 448 } else { 449 my @supported = Pod::Simple::Transcode::->all_encodings; 450 451 # Note unsupported, and complain 452 DEBUG and print STDERR " Encoding [$e] is unsupported.", 453 "\nSupporteds: @supported\n"; 454 my $suggestion = ''; 455 456 # Look for a near match: 457 my $norm = lc($e); 458 $norm =~ tr[-_][]d; 459 my $n; 460 foreach my $enc (@supported) { 461 $n = lc($enc); 462 $n =~ tr[-_][]d; 463 next unless $n eq $norm; 464 $suggestion = " (Maybe \"$e\" should be \"$enc\"?)"; 465 last; 466 } 467 my $encmodver = Pod::Simple::Transcode::->encmodver; 468 $enc_error = join '' => 469 "This document probably does not appear as it should, because its ", 470 "\"=encoding $e\" line calls for an unsupported encoding.", 471 $suggestion, " [$encmodver\'s supported encodings are: @supported]" 472 ; 473 474 $self->scream( $self->{'line_count'}, $enc_error ); 475 } 476 push @{ $self->{'encoding_command_statuses'} }, $enc_error; 477 if (defined($self->{'_processed_encoding'})) { 478 # Double declaration. 479 $self->scream( $self->{'line_count'}, 'Cannot have multiple =encoding directives'); 480 } 481 $self->{'_processed_encoding'} = $orig; 482 483 return $line; 484} 485 486# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 487 488sub _handle_encoding_second_level { 489 # By time this is called, the encoding (if well formed) will already 490 # have been acted one. 491 my($self, $para) = @_; 492 my @x = @$para; 493 my $content = join ' ', splice @x, 2; 494 $content =~ s/^\s+//s; 495 $content =~ s/\s+$//s; 496 497 DEBUG > 2 and print STDERR "Ogling encoding directive: =encoding $content\n"; 498 499 if (defined($self->{'_processed_encoding'})) { 500 #if($content ne $self->{'_processed_encoding'}) { 501 # Could it happen? 502 #} 503 delete $self->{'_processed_encoding'}; 504 # It's already been handled. Check for errors. 505 if(! $self->{'encoding_command_statuses'} ) { 506 DEBUG > 2 and print STDERR " CRAZY ERROR: It wasn't really handled?!\n"; 507 } elsif( $self->{'encoding_command_statuses'}[-1] ) { 508 $self->whine( $para->[1]{'start_line'}, 509 sprintf "Couldn't do %s: %s", 510 $self->{'encoding_command_reqs' }[-1], 511 $self->{'encoding_command_statuses'}[-1], 512 ); 513 } else { 514 DEBUG > 2 and print STDERR " (Yup, it was successfully handled already.)\n"; 515 } 516 517 } else { 518 # Otherwise it's a syntax error 519 $self->whine( $para->[1]{'start_line'}, 520 "Invalid =encoding syntax: $content" 521 ); 522 } 523 524 return; 525} 526 527#~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~` 528 529{ 530my $m = -321; # magic line number 531 532sub _gen_errata { 533 my $self = $_[0]; 534 # Return 0 or more fake-o paragraphs explaining the accumulated 535 # errors on this document. 536 537 return() unless $self->{'errata'} and keys %{$self->{'errata'}}; 538 539 my @out; 540 541 foreach my $line (sort {$a <=> $b} keys %{$self->{'errata'}}) { 542 push @out, 543 ['=item', {'start_line' => $m}, "Around line $line:"], 544 map( ['~Para', {'start_line' => $m, '~cooked' => 1}, 545 #['~Top', {'start_line' => $m}, 546 $_ 547 #] 548 ], 549 @{$self->{'errata'}{$line}} 550 ) 551 ; 552 } 553 554 # TODO: report of unknown entities? unrenderable characters? 555 556 unshift @out, 557 ['=head1', {'start_line' => $m, 'errata' => 1}, 'POD ERRORS'], 558 ['~Para', {'start_line' => $m, '~cooked' => 1, 'errata' => 1}, 559 "Hey! ", 560 ['B', {}, 561 'The above document had some coding errors, which are explained below:' 562 ] 563 ], 564 ['=over', {'start_line' => $m, 'errata' => 1}, ''], 565 ; 566 567 push @out, 568 ['=back', {'start_line' => $m, 'errata' => 1}, ''], 569 ; 570 571 DEBUG and print STDERR "\n<<\n", pretty(\@out), "\n>>\n\n"; 572 573 return @out; 574} 575 576} 577 578#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 579 580############################################################################## 581## 582## stop reading now stop reading now stop reading now stop reading now stop 583## 584## HERE IT BECOMES REALLY SCARY 585## 586## stop reading now stop reading now stop reading now stop reading now stop 587## 588############################################################################## 589 590sub _ponder_paragraph_buffer { 591 592 # Para-token types as found in the buffer. 593 # ~Verbatim, ~Para, ~end, =head1..4, =for, =begin, =end, 594 # =over, =back, =item 595 # and the null =pod (to be complained about if over one line) 596 # 597 # "~data" paragraphs are something we generate at this level, depending on 598 # a currently open =over region 599 600 # Events fired: Begin and end for: 601 # directivename (like head1 .. head4), item, extend, 602 # for (from =begin...=end, =for), 603 # over-bullet, over-number, over-text, over-block, 604 # item-bullet, item-number, item-text, 605 # Document, 606 # Data, Para, Verbatim 607 # B, C, longdirname (TODO -- wha?), etc. for all directives 608 # 609 610 my $self = $_[0]; 611 my $paras; 612 return unless @{$paras = $self->{'paras'}}; 613 my $curr_open = ($self->{'curr_open'} ||= []); 614 615 my $scratch; 616 617 DEBUG > 10 and print STDERR "# Paragraph buffer: <<", pretty($paras), ">>\n"; 618 619 # We have something in our buffer. So apparently the document has started. 620 unless($self->{'doc_has_started'}) { 621 $self->{'doc_has_started'} = 1; 622 623 my $starting_contentless; 624 $starting_contentless = 625 ( 626 !@$curr_open 627 and @$paras and ! grep $_->[0] ne '~end', @$paras 628 # i.e., if the paras is all ~ends 629 ) 630 ; 631 DEBUG and print STDERR "# Starting ", 632 $starting_contentless ? 'contentless' : 'contentful', 633 " document\n" 634 ; 635 636 $self->_handle_element_start( 637 ($scratch = 'Document'), 638 { 639 'start_line' => $paras->[0][1]{'start_line'}, 640 $starting_contentless ? ( 'contentless' => 1 ) : (), 641 }, 642 ); 643 } 644 645 my($para, $para_type); 646 while(@$paras) { 647 last if @$paras == 1 and 648 ( $paras->[0][0] eq '=over' or $paras->[0][0] eq '~Verbatim' 649 or $paras->[0][0] eq '=item' ) 650 ; 651 # Those're the three kinds of paragraphs that require lookahead. 652 # Actually, an "=item Foo" inside an <over type=text> region 653 # and any =item inside an <over type=block> region (rare) 654 # don't require any lookahead, but all others (bullets 655 # and numbers) do. 656 657# TODO: whinge about many kinds of directives in non-resolving =for regions? 658# TODO: many? like what? =head1 etc? 659 660 $para = shift @$paras; 661 $para_type = $para->[0]; 662 663 DEBUG > 1 and print STDERR "Pondering a $para_type paragraph, given the stack: (", 664 $self->_dump_curr_open(), ")\n"; 665 666 if($para_type eq '=for') { 667 next if $self->_ponder_for($para,$curr_open,$paras); 668 669 } elsif($para_type eq '=begin') { 670 next if $self->_ponder_begin($para,$curr_open,$paras); 671 672 } elsif($para_type eq '=end') { 673 next if $self->_ponder_end($para,$curr_open,$paras); 674 675 } elsif($para_type eq '~end') { # The virtual end-document signal 676 next if $self->_ponder_doc_end($para,$curr_open,$paras); 677 } 678 679 680 # ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ 681 #~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ 682 if(grep $_->[1]{'~ignore'}, @$curr_open) { 683 DEBUG > 1 and 684 print STDERR "Skipping $para_type paragraph because in ignore mode.\n"; 685 next; 686 } 687 #~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ 688 # ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ 689 690 if($para_type eq '=pod') { 691 $self->_ponder_pod($para,$curr_open,$paras); 692 693 } elsif($para_type eq '=over') { 694 next if $self->_ponder_over($para,$curr_open,$paras); 695 696 } elsif($para_type eq '=back') { 697 next if $self->_ponder_back($para,$curr_open,$paras); 698 699 } else { 700 701 # All non-magical codes!!! 702 703 # Here we start using $para_type for our own twisted purposes, to 704 # mean how it should get treated, not as what the element name 705 # should be. 706 707 DEBUG > 1 and print STDERR "Pondering non-magical $para_type\n"; 708 709 my $i; 710 711 # Enforce some =headN discipline 712 if($para_type =~ m/^=head\d$/s 713 and ! $self->{'accept_heads_anywhere'} 714 and @$curr_open 715 and $curr_open->[-1][0] eq '=over' 716 ) { 717 DEBUG > 2 and print STDERR "'=$para_type' inside an '=over'!\n"; 718 $self->whine( 719 $para->[1]{'start_line'}, 720 "You forgot a '=back' before '$para_type'" 721 ); 722 unshift @$paras, ['=back', {}, ''], $para; # close the =over 723 next; 724 } 725 726 727 if($para_type eq '=item') { 728 729 my $over; 730 unless(@$curr_open and 731 $over = (grep { $_->[0] eq '=over' } @$curr_open)[-1]) { 732 $self->whine( 733 $para->[1]{'start_line'}, 734 "'=item' outside of any '=over'" 735 ); 736 unshift @$paras, 737 ['=over', {'start_line' => $para->[1]{'start_line'}}, ''], 738 $para 739 ; 740 next; 741 } 742 743 744 my $over_type = $over->[1]{'~type'}; 745 746 if(!$over_type) { 747 # Shouldn't happen1 748 die "Typeless over in stack, starting at line " 749 . $over->[1]{'start_line'}; 750 751 } elsif($over_type eq 'block') { 752 unless($curr_open->[-1][1]{'~bitched_about'}) { 753 $curr_open->[-1][1]{'~bitched_about'} = 1; 754 $self->whine( 755 $curr_open->[-1][1]{'start_line'}, 756 "You can't have =items (as at line " 757 . $para->[1]{'start_line'} 758 . ") unless the first thing after the =over is an =item" 759 ); 760 } 761 # Just turn it into a paragraph and reconsider it 762 $para->[0] = '~Para'; 763 unshift @$paras, $para; 764 next; 765 766 } elsif($over_type eq 'text') { 767 my $item_type = $self->_get_item_type($para); 768 # That kills the content of the item if it's a number or bullet. 769 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n"; 770 771 if($item_type eq 'text') { 772 # Nothing special needs doing for 'text' 773 } elsif($item_type eq 'number' or $item_type eq 'bullet') { 774 $self->whine( 775 $para->[1]{'start_line'}, 776 "Expected text after =item, not a $item_type" 777 ); 778 # Undo our clobbering: 779 push @$para, $para->[1]{'~orig_content'}; 780 delete $para->[1]{'number'}; 781 # Only a PROPER item-number element is allowed 782 # to have a number attribute. 783 } else { 784 die "Unhandled item type $item_type"; # should never happen 785 } 786 787 # =item-text thingies don't need any assimilation, it seems. 788 789 } elsif($over_type eq 'number') { 790 my $item_type = $self->_get_item_type($para); 791 # That kills the content of the item if it's a number or bullet. 792 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n"; 793 794 my $expected_value = ++ $curr_open->[-1][1]{'~counter'}; 795 796 if($item_type eq 'bullet') { 797 # Hm, it's not numeric. Correct for this. 798 $para->[1]{'number'} = $expected_value; 799 $self->whine( 800 $para->[1]{'start_line'}, 801 "Expected '=item $expected_value'" 802 ); 803 push @$para, $para->[1]{'~orig_content'}; 804 # restore the bullet, blocking the assimilation of next para 805 806 } elsif($item_type eq 'text') { 807 # Hm, it's not numeric. Correct for this. 808 $para->[1]{'number'} = $expected_value; 809 $self->whine( 810 $para->[1]{'start_line'}, 811 "Expected '=item $expected_value'" 812 ); 813 # Text content will still be there and will block next ~Para 814 815 } elsif($item_type ne 'number') { 816 die "Unknown item type $item_type"; # should never happen 817 818 } elsif($expected_value == $para->[1]{'number'}) { 819 DEBUG > 1 and print STDERR " Numeric item has the expected value of $expected_value\n"; 820 821 } else { 822 DEBUG > 1 and print STDERR " Numeric item has ", $para->[1]{'number'}, 823 " instead of the expected value of $expected_value\n"; 824 $self->whine( 825 $para->[1]{'start_line'}, 826 "You have '=item " . $para->[1]{'number'} . 827 "' instead of the expected '=item $expected_value'" 828 ); 829 $para->[1]{'number'} = $expected_value; # correcting!! 830 } 831 832 if(@$para == 2) { 833 # For the cases where we /didn't/ push to @$para 834 if($paras->[0][0] eq '~Para') { 835 DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n"; 836 push @$para, splice @{shift @$paras},2; 837 } else { 838 DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n"; 839 push @$para, ''; # Just so it's not contentless 840 } 841 } 842 843 844 } elsif($over_type eq 'bullet') { 845 my $item_type = $self->_get_item_type($para); 846 # That kills the content of the item if it's a number or bullet. 847 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n"; 848 849 if($item_type eq 'bullet') { 850 # as expected! 851 852 if( $para->[1]{'~_freaky_para_hack'} ) { 853 DEBUG and print STDERR "Accomodating '=item * Foo' tolerance hack.\n"; 854 push @$para, delete $para->[1]{'~_freaky_para_hack'}; 855 } 856 857 } elsif($item_type eq 'number') { 858 $self->whine( 859 $para->[1]{'start_line'}, 860 "Expected '=item *'" 861 ); 862 push @$para, $para->[1]{'~orig_content'}; 863 # and block assimilation of the next paragraph 864 delete $para->[1]{'number'}; 865 # Only a PROPER item-number element is allowed 866 # to have a number attribute. 867 } elsif($item_type eq 'text') { 868 $self->whine( 869 $para->[1]{'start_line'}, 870 "Expected '=item *'" 871 ); 872 # But doesn't need processing. But it'll block assimilation 873 # of the next para. 874 } else { 875 die "Unhandled item type $item_type"; # should never happen 876 } 877 878 if(@$para == 2) { 879 # For the cases where we /didn't/ push to @$para 880 if($paras->[0][0] eq '~Para') { 881 DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n"; 882 push @$para, splice @{shift @$paras},2; 883 } else { 884 DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n"; 885 push @$para, ''; # Just so it's not contentless 886 } 887 } 888 889 } else { 890 die "Unhandled =over type \"$over_type\"?"; 891 # Shouldn't happen! 892 } 893 894 $para_type = 'Plain'; 895 $para->[0] .= '-' . $over_type; 896 # Whew. Now fall thru and process it. 897 898 899 } elsif($para_type eq '=extend') { 900 # Well, might as well implement it here. 901 $self->_ponder_extend($para); 902 next; # and skip 903 } elsif($para_type eq '=encoding') { 904 # Not actually acted on here, but we catch errors here. 905 $self->_handle_encoding_second_level($para); 906 next unless $self->keep_encoding_directive; 907 $para_type = 'Plain'; 908 } elsif($para_type eq '~Verbatim') { 909 $para->[0] = 'Verbatim'; 910 $para_type = '?Verbatim'; 911 } elsif($para_type eq '~Para') { 912 $para->[0] = 'Para'; 913 $para_type = '?Plain'; 914 } elsif($para_type eq 'Data') { 915 $para->[0] = 'Data'; 916 $para_type = '?Data'; 917 } elsif( $para_type =~ s/^=//s 918 and defined( $para_type = $self->{'accept_directives'}{$para_type} ) 919 ) { 920 DEBUG > 1 and print STDERR " Pondering known directive ${$para}[0] as $para_type\n"; 921 } else { 922 # An unknown directive! 923 DEBUG > 1 and printf STDERR "Unhandled directive %s (Handled: %s)\n", 924 $para->[0], join(' ', sort keys %{$self->{'accept_directives'}} ) 925 ; 926 $self->whine( 927 $para->[1]{'start_line'}, 928 "Unknown directive: $para->[0]" 929 ); 930 931 # And maybe treat it as text instead of just letting it go? 932 next; 933 } 934 935 if($para_type =~ s/^\?//s) { 936 if(! @$curr_open) { # usual case 937 DEBUG and print STDERR "Treating $para_type paragraph as such because stack is empty.\n"; 938 } else { 939 my @fors = grep $_->[0] eq '=for', @$curr_open; 940 DEBUG > 1 and print STDERR "Containing fors: ", 941 join(',', map $_->[1]{'target'}, @fors), "\n"; 942 943 if(! @fors) { 944 DEBUG and print STDERR "Treating $para_type paragraph as such because stack has no =for's\n"; 945 946 #} elsif(grep $_->[1]{'~resolve'}, @fors) { 947 #} elsif(not grep !$_->[1]{'~resolve'}, @fors) { 948 } elsif( $fors[-1][1]{'~resolve'} ) { 949 # Look to the immediately containing for 950 951 if($para_type eq 'Data') { 952 DEBUG and print STDERR "Treating Data paragraph as Plain/Verbatim because the containing =for ($fors[-1][1]{'target'}) is a resolver\n"; 953 $para->[0] = 'Para'; 954 $para_type = 'Plain'; 955 } else { 956 DEBUG and print STDERR "Treating $para_type paragraph as such because the containing =for ($fors[-1][1]{'target'}) is a resolver\n"; 957 } 958 } else { 959 DEBUG and print STDERR "Treating $para_type paragraph as Data because the containing =for ($fors[-1][1]{'target'}) is a non-resolver\n"; 960 $para->[0] = $para_type = 'Data'; 961 } 962 } 963 } 964 965 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 966 if($para_type eq 'Plain') { 967 $self->_ponder_Plain($para); 968 } elsif($para_type eq 'Verbatim') { 969 $self->_ponder_Verbatim($para); 970 } elsif($para_type eq 'Data') { 971 $self->_ponder_Data($para); 972 } else { 973 die "\$para type is $para_type -- how did that happen?"; 974 # Shouldn't happen. 975 } 976 977 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 978 $para->[0] =~ s/^[~=]//s; 979 980 DEBUG and print STDERR "\n", pretty($para), "\n"; 981 982 # traverse the treelet (which might well be just one string scalar) 983 $self->{'content_seen'} ||= 1; 984 $self->_traverse_treelet_bit(@$para); 985 } 986 } 987 988 return; 989} 990 991########################################################################### 992# The sub-ponderers... 993 994 995 996sub _ponder_for { 997 my ($self,$para,$curr_open,$paras) = @_; 998 999 # Fake it out as a begin/end 1000 my $target; 1001 1002 if(grep $_->[1]{'~ignore'}, @$curr_open) { 1003 DEBUG > 1 and print STDERR "Ignoring ignorable =for\n"; 1004 return 1; 1005 } 1006 1007 for(my $i = 2; $i < @$para; ++$i) { 1008 if($para->[$i] =~ s/^\s*(\S+)\s*//s) { 1009 $target = $1; 1010 last; 1011 } 1012 } 1013 unless(defined $target) { 1014 $self->whine( 1015 $para->[1]{'start_line'}, 1016 "=for without a target?" 1017 ); 1018 return 1; 1019 } 1020 DEBUG > 1 and 1021 print STDERR "Faking out a =for $target as a =begin $target / =end $target\n"; 1022 1023 $para->[0] = 'Data'; 1024 1025 unshift @$paras, 1026 ['=begin', 1027 {'start_line' => $para->[1]{'start_line'}, '~really' => '=for'}, 1028 $target, 1029 ], 1030 $para, 1031 ['=end', 1032 {'start_line' => $para->[1]{'start_line'}, '~really' => '=for'}, 1033 $target, 1034 ], 1035 ; 1036 1037 return 1; 1038} 1039 1040sub _ponder_begin { 1041 my ($self,$para,$curr_open,$paras) = @_; 1042 my $content = join ' ', splice @$para, 2; 1043 $content =~ s/^\s+//s; 1044 $content =~ s/\s+$//s; 1045 unless(length($content)) { 1046 $self->whine( 1047 $para->[1]{'start_line'}, 1048 "=begin without a target?" 1049 ); 1050 DEBUG and print STDERR "Ignoring targetless =begin\n"; 1051 return 1; 1052 } 1053 1054 my ($target, $title) = $content =~ m/^(\S+)\s*(.*)$/; 1055 $para->[1]{'title'} = $title if ($title); 1056 $para->[1]{'target'} = $target; # without any ':' 1057 $content = $target; # strip off the title 1058 1059 $content =~ s/^:!/!:/s; 1060 my $neg; # whether this is a negation-match 1061 $neg = 1 if $content =~ s/^!//s; 1062 my $to_resolve; # whether to process formatting codes 1063 $to_resolve = 1 if $content =~ s/^://s; 1064 1065 my $dont_ignore; # whether this target matches us 1066 1067 foreach my $target_name ( 1068 split(',', $content, -1), 1069 $neg ? () : '*' 1070 ) { 1071 DEBUG > 2 and 1072 print STDERR " Considering whether =begin $content matches $target_name\n"; 1073 next unless $self->{'accept_targets'}{$target_name}; 1074 1075 DEBUG > 2 and 1076 print STDERR " It DOES match the acceptable target $target_name!\n"; 1077 $to_resolve = 1 1078 if $self->{'accept_targets'}{$target_name} eq 'force_resolve'; 1079 $dont_ignore = 1; 1080 $para->[1]{'target_matching'} = $target_name; 1081 last; # stop looking at other target names 1082 } 1083 1084 if($neg) { 1085 if( $dont_ignore ) { 1086 $dont_ignore = ''; 1087 delete $para->[1]{'target_matching'}; 1088 DEBUG > 2 and print STDERR " But the leading ! means that this is a NON-match!\n"; 1089 } else { 1090 $dont_ignore = 1; 1091 $para->[1]{'target_matching'} = '!'; 1092 DEBUG > 2 and print STDERR " But the leading ! means that this IS a match!\n"; 1093 } 1094 } 1095 1096 $para->[0] = '=for'; # Just what we happen to call these, internally 1097 $para->[1]{'~really'} ||= '=begin'; 1098 $para->[1]{'~ignore'} = (! $dont_ignore) || 0; 1099 $para->[1]{'~resolve'} = $to_resolve || 0; 1100 1101 DEBUG > 1 and print STDERR " Making note to ", $dont_ignore ? 'not ' : '', 1102 "ignore contents of this region\n"; 1103 DEBUG > 1 and $dont_ignore and print STDERR " Making note to treat contents as ", 1104 ($to_resolve ? 'verbatim/plain' : 'data'), " paragraphs\n"; 1105 DEBUG > 1 and print STDERR " (Stack now: ", $self->_dump_curr_open(), ")\n"; 1106 1107 push @$curr_open, $para; 1108 if(!$dont_ignore or scalar grep $_->[1]{'~ignore'}, @$curr_open) { 1109 DEBUG > 1 and print STDERR "Ignoring ignorable =begin\n"; 1110 } else { 1111 $self->{'content_seen'} ||= 1; 1112 $self->_handle_element_start((my $scratch='for'), $para->[1]); 1113 } 1114 1115 return 1; 1116} 1117 1118sub _ponder_end { 1119 my ($self,$para,$curr_open,$paras) = @_; 1120 my $content = join ' ', splice @$para, 2; 1121 $content =~ s/^\s+//s; 1122 $content =~ s/\s+$//s; 1123 DEBUG and print STDERR "Ogling '=end $content' directive\n"; 1124 1125 unless(length($content)) { 1126 $self->whine( 1127 $para->[1]{'start_line'}, 1128 "'=end' without a target?" . ( 1129 ( @$curr_open and $curr_open->[-1][0] eq '=for' ) 1130 ? ( " (Should be \"=end " . $curr_open->[-1][1]{'target'} . '")' ) 1131 : '' 1132 ) 1133 ); 1134 DEBUG and print STDERR "Ignoring targetless =end\n"; 1135 return 1; 1136 } 1137 1138 unless($content =~ m/^\S+$/) { # i.e., unless it's one word 1139 $self->whine( 1140 $para->[1]{'start_line'}, 1141 "'=end $content' is invalid. (Stack: " 1142 . $self->_dump_curr_open() . ')' 1143 ); 1144 DEBUG and print STDERR "Ignoring mistargetted =end $content\n"; 1145 return 1; 1146 } 1147 1148 unless(@$curr_open and $curr_open->[-1][0] eq '=for') { 1149 $self->whine( 1150 $para->[1]{'start_line'}, 1151 "=end $content without matching =begin. (Stack: " 1152 . $self->_dump_curr_open() . ')' 1153 ); 1154 DEBUG and print STDERR "Ignoring mistargetted =end $content\n"; 1155 return 1; 1156 } 1157 1158 unless($content eq $curr_open->[-1][1]{'target'}) { 1159 $self->whine( 1160 $para->[1]{'start_line'}, 1161 "=end $content doesn't match =begin " 1162 . $curr_open->[-1][1]{'target'} 1163 . ". (Stack: " 1164 . $self->_dump_curr_open() . ')' 1165 ); 1166 DEBUG and print STDERR "Ignoring mistargetted =end $content at line $para->[1]{'start_line'}\n"; 1167 return 1; 1168 } 1169 1170 # Else it's okay to close... 1171 if(grep $_->[1]{'~ignore'}, @$curr_open) { 1172 DEBUG > 1 and print STDERR "Not firing any event for this =end $content because in an ignored region\n"; 1173 # And that may be because of this to-be-closed =for region, or some 1174 # other one, but it doesn't matter. 1175 } else { 1176 $curr_open->[-1][1]{'start_line'} = $para->[1]{'start_line'}; 1177 # what's that for? 1178 1179 $self->{'content_seen'} ||= 1; 1180 $self->_handle_element_end( my $scratch = 'for', $para->[1]); 1181 } 1182 DEBUG > 1 and print STDERR "Popping $curr_open->[-1][0] $curr_open->[-1][1]{'target'} because of =end $content\n"; 1183 pop @$curr_open; 1184 1185 return 1; 1186} 1187 1188sub _ponder_doc_end { 1189 my ($self,$para,$curr_open,$paras) = @_; 1190 if(@$curr_open) { # Deal with things left open 1191 DEBUG and print STDERR "Stack is nonempty at end-document: (", 1192 $self->_dump_curr_open(), ")\n"; 1193 1194 DEBUG > 9 and print STDERR "Stack: ", pretty($curr_open), "\n"; 1195 unshift @$paras, $self->_closers_for_all_curr_open; 1196 # Make sure there is exactly one ~end in the parastack, at the end: 1197 @$paras = grep $_->[0] ne '~end', @$paras; 1198 push @$paras, $para, $para; 1199 # We need two -- once for the next cycle where we 1200 # generate errata, and then another to be at the end 1201 # when that loop back around to process the errata. 1202 return 1; 1203 1204 } else { 1205 DEBUG and print STDERR "Okay, stack is empty now.\n"; 1206 } 1207 1208 # Try generating errata section, if applicable 1209 unless($self->{'~tried_gen_errata'}) { 1210 $self->{'~tried_gen_errata'} = 1; 1211 my @extras = $self->_gen_errata(); 1212 if(@extras) { 1213 unshift @$paras, @extras; 1214 DEBUG and print STDERR "Generated errata... relooping...\n"; 1215 return 1; # I.e., loop around again to process these fake-o paragraphs 1216 } 1217 } 1218 1219 splice @$paras; # Well, that's that for this paragraph buffer. 1220 DEBUG and print STDERR "Throwing end-document event.\n"; 1221 1222 $self->_handle_element_end( my $scratch = 'Document' ); 1223 return 1; # Hasta la byebye 1224} 1225 1226sub _ponder_pod { 1227 my ($self,$para,$curr_open,$paras) = @_; 1228 $self->whine( 1229 $para->[1]{'start_line'}, 1230 "=pod directives shouldn't be over one line long! Ignoring all " 1231 . (@$para - 2) . " lines of content" 1232 ) if @$para > 3; 1233 1234 # Content ignored unless 'pod_handler' is set 1235 if (my $pod_handler = $self->{'pod_handler'}) { 1236 my ($line_num, $line) = map $_, $para->[1]{'start_line'}, $para->[2]; 1237 $line = $line eq '' ? "=pod" : "=pod $line"; # imitate cut_handler output 1238 $pod_handler->($line, $line_num, $self); 1239 } 1240 1241 # The surrounding methods set content_seen, so let us remain consistent. 1242 # I do not know why it was not here before -- should it not be here? 1243 # $self->{'content_seen'} ||= 1; 1244 1245 return; 1246} 1247 1248sub _ponder_over { 1249 my ($self,$para,$curr_open,$paras) = @_; 1250 return 1 unless @$paras; 1251 my $list_type; 1252 1253 if($paras->[0][0] eq '=item') { # most common case 1254 $list_type = $self->_get_initial_item_type($paras->[0]); 1255 1256 } elsif($paras->[0][0] eq '=back') { 1257 # Ignore empty lists by default 1258 if ($self->{'parse_empty_lists'}) { 1259 $list_type = 'empty'; 1260 } else { 1261 shift @$paras; 1262 return 1; 1263 } 1264 } elsif($paras->[0][0] eq '~end') { 1265 $self->whine( 1266 $para->[1]{'start_line'}, 1267 "=over is the last thing in the document?!" 1268 ); 1269 return 1; # But feh, ignore it. 1270 } else { 1271 $list_type = 'block'; 1272 } 1273 $para->[1]{'~type'} = $list_type; 1274 push @$curr_open, $para; 1275 # yes, we reuse the paragraph as a stack item 1276 1277 my $content = join ' ', splice @$para, 2; 1278 my $overness; 1279 if($content =~ m/^\s*$/s) { 1280 $para->[1]{'indent'} = 4; 1281 } elsif($content =~ m/^\s*((?:\d*\.)?\d+)\s*$/s) { 1282 no integer; 1283 $para->[1]{'indent'} = $1; 1284 if($1 == 0) { 1285 $self->whine( 1286 $para->[1]{'start_line'}, 1287 "Can't have a 0 in =over $content" 1288 ); 1289 $para->[1]{'indent'} = 4; 1290 } 1291 } else { 1292 $self->whine( 1293 $para->[1]{'start_line'}, 1294 "=over should be: '=over' or '=over positive_number'" 1295 ); 1296 $para->[1]{'indent'} = 4; 1297 } 1298 DEBUG > 1 and print STDERR "=over found of type $list_type\n"; 1299 1300 $self->{'content_seen'} ||= 1; 1301 $self->_handle_element_start((my $scratch = 'over-' . $list_type), $para->[1]); 1302 1303 return; 1304} 1305 1306sub _ponder_back { 1307 my ($self,$para,$curr_open,$paras) = @_; 1308 # TODO: fire off </item-number> or </item-bullet> or </item-text> ?? 1309 1310 my $content = join ' ', splice @$para, 2; 1311 if($content =~ m/\S/) { 1312 $self->whine( 1313 $para->[1]{'start_line'}, 1314 "=back doesn't take any parameters, but you said =back $content" 1315 ); 1316 } 1317 1318 if(@$curr_open and $curr_open->[-1][0] eq '=over') { 1319 DEBUG > 1 and print STDERR "=back happily closes matching =over\n"; 1320 # Expected case: we're closing the most recently opened thing 1321 #my $over = pop @$curr_open; 1322 $self->{'content_seen'} ||= 1; 1323 $self->_handle_element_end( my $scratch = 1324 'over-' . ( (pop @$curr_open)->[1]{'~type'} ), $para->[1] 1325 ); 1326 } else { 1327 DEBUG > 1 and print STDERR "=back found without a matching =over. Stack: (", 1328 join(', ', map $_->[0], @$curr_open), ").\n"; 1329 $self->whine( 1330 $para->[1]{'start_line'}, 1331 '=back without =over' 1332 ); 1333 return 1; # and ignore it 1334 } 1335} 1336 1337sub _ponder_item { 1338 my ($self,$para,$curr_open,$paras) = @_; 1339 my $over; 1340 unless(@$curr_open and 1341 $over = (grep { $_->[0] eq '=over' } @$curr_open)[-1]) { 1342 $self->whine( 1343 $para->[1]{'start_line'}, 1344 "'=item' outside of any '=over'" 1345 ); 1346 unshift @$paras, 1347 ['=over', {'start_line' => $para->[1]{'start_line'}}, ''], 1348 $para 1349 ; 1350 return 1; 1351 } 1352 1353 1354 my $over_type = $over->[1]{'~type'}; 1355 1356 if(!$over_type) { 1357 # Shouldn't happen1 1358 die "Typeless over in stack, starting at line " 1359 . $over->[1]{'start_line'}; 1360 1361 } elsif($over_type eq 'block') { 1362 unless($curr_open->[-1][1]{'~bitched_about'}) { 1363 $curr_open->[-1][1]{'~bitched_about'} = 1; 1364 $self->whine( 1365 $curr_open->[-1][1]{'start_line'}, 1366 "You can't have =items (as at line " 1367 . $para->[1]{'start_line'} 1368 . ") unless the first thing after the =over is an =item" 1369 ); 1370 } 1371 # Just turn it into a paragraph and reconsider it 1372 $para->[0] = '~Para'; 1373 unshift @$paras, $para; 1374 return 1; 1375 1376 } elsif($over_type eq 'text') { 1377 my $item_type = $self->_get_item_type($para); 1378 # That kills the content of the item if it's a number or bullet. 1379 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n"; 1380 1381 if($item_type eq 'text') { 1382 # Nothing special needs doing for 'text' 1383 } elsif($item_type eq 'number' or $item_type eq 'bullet') { 1384 $self->whine( 1385 $para->[1]{'start_line'}, 1386 "Expected text after =item, not a $item_type" 1387 ); 1388 # Undo our clobbering: 1389 push @$para, $para->[1]{'~orig_content'}; 1390 delete $para->[1]{'number'}; 1391 # Only a PROPER item-number element is allowed 1392 # to have a number attribute. 1393 } else { 1394 die "Unhandled item type $item_type"; # should never happen 1395 } 1396 1397 # =item-text thingies don't need any assimilation, it seems. 1398 1399 } elsif($over_type eq 'number') { 1400 my $item_type = $self->_get_item_type($para); 1401 # That kills the content of the item if it's a number or bullet. 1402 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n"; 1403 1404 my $expected_value = ++ $curr_open->[-1][1]{'~counter'}; 1405 1406 if($item_type eq 'bullet') { 1407 # Hm, it's not numeric. Correct for this. 1408 $para->[1]{'number'} = $expected_value; 1409 $self->whine( 1410 $para->[1]{'start_line'}, 1411 "Expected '=item $expected_value'" 1412 ); 1413 push @$para, $para->[1]{'~orig_content'}; 1414 # restore the bullet, blocking the assimilation of next para 1415 1416 } elsif($item_type eq 'text') { 1417 # Hm, it's not numeric. Correct for this. 1418 $para->[1]{'number'} = $expected_value; 1419 $self->whine( 1420 $para->[1]{'start_line'}, 1421 "Expected '=item $expected_value'" 1422 ); 1423 # Text content will still be there and will block next ~Para 1424 1425 } elsif($item_type ne 'number') { 1426 die "Unknown item type $item_type"; # should never happen 1427 1428 } elsif($expected_value == $para->[1]{'number'}) { 1429 DEBUG > 1 and print STDERR " Numeric item has the expected value of $expected_value\n"; 1430 1431 } else { 1432 DEBUG > 1 and print STDERR " Numeric item has ", $para->[1]{'number'}, 1433 " instead of the expected value of $expected_value\n"; 1434 $self->whine( 1435 $para->[1]{'start_line'}, 1436 "You have '=item " . $para->[1]{'number'} . 1437 "' instead of the expected '=item $expected_value'" 1438 ); 1439 $para->[1]{'number'} = $expected_value; # correcting!! 1440 } 1441 1442 if(@$para == 2) { 1443 # For the cases where we /didn't/ push to @$para 1444 if($paras->[0][0] eq '~Para') { 1445 DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n"; 1446 push @$para, splice @{shift @$paras},2; 1447 } else { 1448 DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n"; 1449 push @$para, ''; # Just so it's not contentless 1450 } 1451 } 1452 1453 1454 } elsif($over_type eq 'bullet') { 1455 my $item_type = $self->_get_item_type($para); 1456 # That kills the content of the item if it's a number or bullet. 1457 DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n"; 1458 1459 if($item_type eq 'bullet') { 1460 # as expected! 1461 1462 if( $para->[1]{'~_freaky_para_hack'} ) { 1463 DEBUG and print STDERR "Accomodating '=item * Foo' tolerance hack.\n"; 1464 push @$para, delete $para->[1]{'~_freaky_para_hack'}; 1465 } 1466 1467 } elsif($item_type eq 'number') { 1468 $self->whine( 1469 $para->[1]{'start_line'}, 1470 "Expected '=item *'" 1471 ); 1472 push @$para, $para->[1]{'~orig_content'}; 1473 # and block assimilation of the next paragraph 1474 delete $para->[1]{'number'}; 1475 # Only a PROPER item-number element is allowed 1476 # to have a number attribute. 1477 } elsif($item_type eq 'text') { 1478 $self->whine( 1479 $para->[1]{'start_line'}, 1480 "Expected '=item *'" 1481 ); 1482 # But doesn't need processing. But it'll block assimilation 1483 # of the next para. 1484 } else { 1485 die "Unhandled item type $item_type"; # should never happen 1486 } 1487 1488 if(@$para == 2) { 1489 # For the cases where we /didn't/ push to @$para 1490 if($paras->[0][0] eq '~Para') { 1491 DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n"; 1492 push @$para, splice @{shift @$paras},2; 1493 } else { 1494 DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n"; 1495 push @$para, ''; # Just so it's not contentless 1496 } 1497 } 1498 1499 } else { 1500 die "Unhandled =over type \"$over_type\"?"; 1501 # Shouldn't happen! 1502 } 1503 $para->[0] .= '-' . $over_type; 1504 1505 return; 1506} 1507 1508sub _ponder_Plain { 1509 my ($self,$para) = @_; 1510 DEBUG and print STDERR " giving plain treatment...\n"; 1511 unless( @$para == 2 or ( @$para == 3 and $para->[2] eq '' ) 1512 or $para->[1]{'~cooked'} 1513 ) { 1514 push @$para, 1515 @{$self->_make_treelet( 1516 join("\n", splice(@$para, 2)), 1517 $para->[1]{'start_line'} 1518 )}; 1519 } 1520 # Empty paragraphs don't need a treelet for any reason I can see. 1521 # And precooked paragraphs already have a treelet. 1522 return; 1523} 1524 1525sub _ponder_Verbatim { 1526 my ($self,$para) = @_; 1527 DEBUG and print STDERR " giving verbatim treatment...\n"; 1528 1529 $para->[1]{'xml:space'} = 'preserve'; 1530 1531 my $indent = $self->strip_verbatim_indent; 1532 if ($indent && ref $indent eq 'CODE') { 1533 my @shifted = (shift @{$para}, shift @{$para}); 1534 $indent = $indent->($para); 1535 unshift @{$para}, @shifted; 1536 } 1537 1538 for(my $i = 2; $i < @$para; $i++) { 1539 foreach my $line ($para->[$i]) { # just for aliasing 1540 # Strip indentation. 1541 $line =~ s/^\Q$indent// if $indent 1542 && !($self->{accept_codes} && $self->{accept_codes}{VerbatimFormatted}); 1543 while( $line =~ 1544 # Sort of adapted from Text::Tabs -- yes, it's hardwired in that 1545 # tabs are at every EIGHTH column. For portability, it has to be 1546 # one setting everywhere, and 8th wins. 1547 s/^([^\t]*)(\t+)/$1.(" " x ((length($2)<<3)-(length($1)&7)))/e 1548 ) {} 1549 1550 # TODO: whinge about (or otherwise treat) unindented or overlong lines 1551 1552 } 1553 } 1554 1555 # Now the VerbatimFormatted hoodoo... 1556 if( $self->{'accept_codes'} and 1557 $self->{'accept_codes'}{'VerbatimFormatted'} 1558 ) { 1559 while(@$para > 3 and $para->[-1] !~ m/\S/) { pop @$para } 1560 # Kill any number of terminal newlines 1561 $self->_verbatim_format($para); 1562 } elsif ($self->{'codes_in_verbatim'}) { 1563 push @$para, 1564 @{$self->_make_treelet( 1565 join("\n", splice(@$para, 2)), 1566 $para->[1]{'start_line'}, $para->[1]{'xml:space'} 1567 )}; 1568 $para->[-1] =~ s/\n+$//s; # Kill any number of terminal newlines 1569 } else { 1570 push @$para, join "\n", splice(@$para, 2) if @$para > 3; 1571 $para->[-1] =~ s/\n+$//s; # Kill any number of terminal newlines 1572 } 1573 return; 1574} 1575 1576sub _ponder_Data { 1577 my ($self,$para) = @_; 1578 DEBUG and print STDERR " giving data treatment...\n"; 1579 $para->[1]{'xml:space'} = 'preserve'; 1580 push @$para, join "\n", splice(@$para, 2) if @$para > 3; 1581 return; 1582} 1583 1584 1585 1586 1587########################################################################### 1588 1589sub _traverse_treelet_bit { # for use only by the routine above 1590 my($self, $name) = splice @_,0,2; 1591 1592 my $scratch; 1593 $self->_handle_element_start(($scratch=$name), shift @_); 1594 1595 while (@_) { 1596 my $x = shift; 1597 if (ref($x)) { 1598 &_traverse_treelet_bit($self, @$x); 1599 } else { 1600 $x .= shift while @_ && !ref($_[0]); 1601 $self->_handle_text($x); 1602 } 1603 } 1604 1605 $self->_handle_element_end($scratch=$name); 1606 return; 1607} 1608 1609#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 1610 1611sub _closers_for_all_curr_open { 1612 my $self = $_[0]; 1613 my @closers; 1614 foreach my $still_open (@{ $self->{'curr_open'} || return }) { 1615 my @copy = @$still_open; 1616 $copy[1] = {%{ $copy[1] }}; 1617 #$copy[1]{'start_line'} = -1; 1618 if($copy[0] eq '=for') { 1619 $copy[0] = '=end'; 1620 } elsif($copy[0] eq '=over') { 1621 $self->whine( 1622 $still_open->[1]{start_line} , 1623 "=over without closing =back" 1624 ); 1625 1626 $copy[0] = '=back'; 1627 } else { 1628 die "I don't know how to auto-close an open $copy[0] region"; 1629 } 1630 1631 unless( @copy > 2 ) { 1632 push @copy, $copy[1]{'target'}; 1633 $copy[-1] = '' unless defined $copy[-1]; 1634 # since =over's don't have targets 1635 } 1636 1637 $copy[1]{'fake-closer'} = 1; 1638 1639 DEBUG and print STDERR "Queuing up fake-o event: ", pretty(\@copy), "\n"; 1640 unshift @closers, \@copy; 1641 } 1642 return @closers; 1643} 1644 1645#-------------------------------------------------------------------------- 1646 1647sub _verbatim_format { 1648 my($it, $p) = @_; 1649 1650 my $formatting; 1651 1652 for(my $i = 2; $i < @$p; $i++) { # work backwards over the lines 1653 DEBUG and print STDERR "_verbatim_format appends a newline to $i: $p->[$i]\n"; 1654 $p->[$i] .= "\n"; 1655 # Unlike with simple Verbatim blocks, we don't end up just doing 1656 # a join("\n", ...) on the contents, so we have to append a 1657 # newline to ever line, and then nix the last one later. 1658 } 1659 1660 if( DEBUG > 4 ) { 1661 print STDERR "<<\n"; 1662 for(my $i = $#$p; $i >= 2; $i--) { # work backwards over the lines 1663 print STDERR "_verbatim_format $i: $p->[$i]"; 1664 } 1665 print STDERR ">>\n"; 1666 } 1667 1668 for(my $i = $#$p; $i > 2; $i--) { 1669 # work backwards over the lines, except the first (#2) 1670 1671 #next unless $p->[$i] =~ m{^#:([ \^\/\%]*)\n?$}s 1672 # and $p->[$i-1] !~ m{^#:[ \^\/\%]*\n?$}s; 1673 # look at a formatty line preceding a nonformatty one 1674 DEBUG > 5 and print STDERR "Scrutinizing line $i: $$p[$i]\n"; 1675 if($p->[$i] =~ m{^#:([ \^\/\%]*)\n?$}s) { 1676 DEBUG > 5 and print STDERR " It's a formatty line. ", 1677 "Peeking at previous line ", $i-1, ": $$p[$i-1]: \n"; 1678 1679 if( $p->[$i-1] =~ m{^#:[ \^\/\%]*\n?$}s ) { 1680 DEBUG > 5 and print STDERR " Previous line is formatty! Skipping this one.\n"; 1681 next; 1682 } else { 1683 DEBUG > 5 and print STDERR " Previous line is non-formatty! Yay!\n"; 1684 } 1685 } else { 1686 DEBUG > 5 and print STDERR " It's not a formatty line. Ignoring\n"; 1687 next; 1688 } 1689 1690 # A formatty line has to have #: in the first two columns, and uses 1691 # "^" to mean bold, "/" to mean underline, and "%" to mean bold italic. 1692 # Example: 1693 # What do you want? i like pie. [or whatever] 1694 # #:^^^^^^^^^^^^^^^^^ ///////////// 1695 1696 1697 DEBUG > 4 and print STDERR "_verbatim_format considers:\n<$p->[$i-1]>\n<$p->[$i]>\n"; 1698 1699 $formatting = ' ' . $1; 1700 $formatting =~ s/\s+$//s; # nix trailing whitespace 1701 unless(length $formatting and $p->[$i-1] =~ m/\S/) { # no-op 1702 splice @$p,$i,1; # remove this line 1703 $i--; # don't consider next line 1704 next; 1705 } 1706 1707 if( length($formatting) >= length($p->[$i-1]) ) { 1708 $formatting = substr($formatting, 0, length($p->[$i-1]) - 1) . ' '; 1709 } else { 1710 $formatting .= ' ' x (length($p->[$i-1]) - length($formatting)); 1711 } 1712 # Make $formatting and the previous line be exactly the same length, 1713 # with $formatting having a " " as the last character. 1714 1715 DEBUG > 4 and print STDERR "Formatting <$formatting> on <", $p->[$i-1], ">\n"; 1716 1717 1718 my @new_line; 1719 while( $formatting =~ m{\G(( +)|(\^+)|(\/+)|(\%+))}g ) { 1720 #print STDERR "Format matches $1\n"; 1721 1722 if($2) { 1723 #print STDERR "SKIPPING <$2>\n"; 1724 push @new_line, 1725 substr($p->[$i-1], pos($formatting)-length($1), length($1)); 1726 } else { 1727 #print STDERR "SNARING $+\n"; 1728 push @new_line, [ 1729 ( 1730 $3 ? 'VerbatimB' : 1731 $4 ? 'VerbatimI' : 1732 $5 ? 'VerbatimBI' : die("Should never get called") 1733 ), {}, 1734 substr($p->[$i-1], pos($formatting)-length($1), length($1)) 1735 ]; 1736 #print STDERR "Formatting <$new_line[-1][-1]> as $new_line[-1][0]\n"; 1737 } 1738 } 1739 my @nixed = 1740 splice @$p, $i-1, 2, @new_line; # replace myself and the next line 1741 DEBUG > 10 and print STDERR "Nixed count: ", scalar(@nixed), "\n"; 1742 1743 DEBUG > 6 and print STDERR "New version of the above line is these tokens (", 1744 scalar(@new_line), "):", 1745 map( ref($_)?"<@$_> ":"<$_>", @new_line ), "\n"; 1746 $i--; # So the next line we scrutinize is the line before the one 1747 # that we just went and formatted 1748 } 1749 1750 $p->[0] = 'VerbatimFormatted'; 1751 1752 # Collapse adjacent text nodes, just for kicks. 1753 for( my $i = 2; $i > $#$p; $i++ ) { # work forwards over the tokens except for the last 1754 if( !ref($p->[$i]) and !ref($p->[$i + 1]) ) { 1755 DEBUG > 5 and print STDERR "_verbatim_format merges {$p->[$i]} and {$p->[$i+1]}\n"; 1756 $p->[$i] .= splice @$p, $i+1, 1; # merge 1757 --$i; # and back up 1758 } 1759 } 1760 1761 # Now look for the last text token, and remove the terminal newline 1762 for( my $i = $#$p; $i >= 2; $i-- ) { 1763 # work backwards over the tokens, even the first 1764 if( !ref($p->[$i]) ) { 1765 if($p->[$i] =~ s/\n$//s) { 1766 DEBUG > 5 and print STDERR "_verbatim_format killed the terminal newline on #$i: {$p->[$i]}, after {$p->[$i-1]}\n"; 1767 } else { 1768 DEBUG > 5 and print STDERR 1769 "No terminal newline on #$i: {$p->[$i]}, after {$p->[$i-1]} !?\n"; 1770 } 1771 last; # we only want the next one 1772 } 1773 } 1774 1775 return; 1776} 1777 1778 1779#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 1780 1781 1782sub _treelet_from_formatting_codes { 1783 # Given a paragraph, returns a treelet. Full of scary tokenizing code. 1784 # Like [ '~Top', {'start_line' => $start_line}, 1785 # "I like ", 1786 # [ 'B', {}, "pie" ], 1787 # "!" 1788 # ] 1789 1790 my($self, $para, $start_line, $preserve_space) = @_; 1791 1792 my $treelet = ['~Top', {'start_line' => $start_line},]; 1793 1794 unless ($preserve_space || $self->{'preserve_whitespace'}) { 1795 $para =~ s/\s+/ /g; # collapse and trim all whitespace first. 1796 $para =~ s/ $//; 1797 $para =~ s/^ //; 1798 } 1799 1800 # Only apparent problem the above code is that N<< >> turns into 1801 # N<< >>. But then, word wrapping does that too! So don't do that! 1802 1803 my @stack; 1804 my @lineage = ($treelet); 1805 my $raw = ''; # raw content of L<> fcode before splitting/processing 1806 # XXX 'raw' is not 100% accurate: all surrounding whitespace is condensed 1807 # into just 1 ' '. Is this the regex's doing or 'raw's? 1808 my $inL = 0; 1809 1810 DEBUG > 4 and print STDERR "Paragraph:\n$para\n\n"; 1811 1812 # Here begins our frightening tokenizer RE. The following regex matches 1813 # text in four main parts: 1814 # 1815 # * Start-codes. The first alternative matches C< or C<<, the latter 1816 # followed by some whitespace. $1 will hold the entire start code 1817 # (including any space following a multiple-angle-bracket delimiter), 1818 # and $2 will hold only the additional brackets past the first in a 1819 # multiple-bracket delimiter. length($2) + 1 will be the number of 1820 # closing brackets we have to find. 1821 # 1822 # * Closing brackets. Match some amount of whitespace followed by 1823 # multiple close brackets. The logic to see if this closes anything 1824 # is down below. Note that in order to parse C<< >> correctly, we 1825 # have to use look-behind (?<=\s\s), since the match of the starting 1826 # code will have consumed the whitespace. 1827 # 1828 # * A single closing bracket, to close a simple code like C<>. 1829 # 1830 # * Something that isn't a start or end code. We have to be careful 1831 # about accepting whitespace, since perlpodspec says that any whitespace 1832 # before a multiple-bracket closing delimiter should be ignored. 1833 # 1834 while($para =~ 1835 m/\G 1836 (?: 1837 # Match starting codes, including the whitespace following a 1838 # multiple-delimiter start code. $1 gets the whole start code and 1839 # $2 gets all but one of the <s in the multiple-bracket case. 1840 ([A-Z]<(?:(<+)\s+)?) 1841 | 1842 # Match multiple-bracket end codes. $3 gets the whitespace that 1843 # should be discarded before an end bracket but kept in other cases 1844 # and $4 gets the end brackets themselves. 1845 (\s+|(?<=\s\s))(>{2,}) 1846 | 1847 (\s?>) # $5: simple end-codes 1848 | 1849 ( # $6: stuff containing no start-codes or end-codes 1850 (?: 1851 [^A-Z\s>] 1852 | 1853 (?: 1854 [A-Z](?!<) 1855 ) 1856 | 1857 # whitespace is ok, but we don't want to eat the whitespace before 1858 # a multiple-bracket end code. 1859 # NOTE: we may still have problems with e.g. S<< >> 1860 (?: 1861 \s(?!\s*>{2,}) 1862 ) 1863 )+ 1864 ) 1865 ) 1866 /xgo 1867 ) { 1868 DEBUG > 4 and print STDERR "\nParagraphic tokenstack = (@stack)\n"; 1869 if(defined $1) { 1870 if(defined $2) { 1871 DEBUG > 3 and print STDERR "Found complex start-text code \"$1\"\n"; 1872 push @stack, length($2) + 1; 1873 # length of the necessary complex end-code string 1874 } else { 1875 DEBUG > 3 and print STDERR "Found simple start-text code \"$1\"\n"; 1876 push @stack, 0; # signal that we're looking for simple 1877 } 1878 push @lineage, [ substr($1,0,1), {}, ]; # new node object 1879 push @{ $lineage[-2] }, $lineage[-1]; 1880 if ('L' eq substr($1,0,1)) { 1881 $raw = $inL ? $raw.$1 : ''; # reset raw content accumulator 1882 $inL = 1; 1883 } else { 1884 $raw .= $1 if $inL; 1885 } 1886 1887 } elsif(defined $4) { 1888 DEBUG > 3 and print STDERR "Found apparent complex end-text code \"$3$4\"\n"; 1889 # This is where it gets messy... 1890 if(! @stack) { 1891 # We saw " >>>>" but needed nothing. This is ALL just stuff then. 1892 DEBUG > 4 and print STDERR " But it's really just stuff.\n"; 1893 push @{ $lineage[-1] }, $3, $4; 1894 next; 1895 } elsif(!$stack[-1]) { 1896 # We saw " >>>>" but needed only ">". Back pos up. 1897 DEBUG > 4 and print STDERR " And that's more than we needed to close simple.\n"; 1898 push @{ $lineage[-1] }, $3; # That was a for-real space, too. 1899 pos($para) = pos($para) - length($4) + 1; 1900 } elsif($stack[-1] == length($4)) { 1901 # We found " >>>>", and it was exactly what we needed. Commonest case. 1902 DEBUG > 4 and print STDERR " And that's exactly what we needed to close complex.\n"; 1903 } elsif($stack[-1] < length($4)) { 1904 # We saw " >>>>" but needed only " >>". Back pos up. 1905 DEBUG > 4 and print STDERR " And that's more than we needed to close complex.\n"; 1906 pos($para) = pos($para) - length($4) + $stack[-1]; 1907 } else { 1908 # We saw " >>>>" but needed " >>>>>>". So this is all just stuff! 1909 DEBUG > 4 and print STDERR " But it's really just stuff, because we needed more.\n"; 1910 push @{ $lineage[-1] }, $3, $4; 1911 next; 1912 } 1913 #print STDERR "\nHOOBOY ", scalar(@{$lineage[-1]}), "!!!\n"; 1914 1915 push @{ $lineage[-1] }, '' if 2 == @{ $lineage[-1] }; 1916 # Keep the element from being childless 1917 1918 pop @stack; 1919 pop @lineage; 1920 1921 unless (@stack) { # not in an L if there are no open fcodes 1922 $inL = 0; 1923 if (ref $lineage[-1][-1] && $lineage[-1][-1][0] eq 'L') { 1924 $lineage[-1][-1][1]{'raw'} = $raw 1925 } 1926 } 1927 $raw .= $3.$4 if $inL; 1928 1929 } elsif(defined $5) { 1930 DEBUG > 3 and print STDERR "Found apparent simple end-text code \"$5\"\n"; 1931 1932 if(@stack and ! $stack[-1]) { 1933 # We're indeed expecting a simple end-code 1934 DEBUG > 4 and print STDERR " It's indeed an end-code.\n"; 1935 1936 if(length($5) == 2) { # There was a space there: " >" 1937 push @{ $lineage[-1] }, ' '; 1938 } elsif( 2 == @{ $lineage[-1] } ) { # Closing a childless element 1939 push @{ $lineage[-1] }, ''; # keep it from being really childless 1940 } 1941 1942 pop @stack; 1943 pop @lineage; 1944 } else { 1945 DEBUG > 4 and print STDERR " It's just stuff.\n"; 1946 push @{ $lineage[-1] }, $5; 1947 } 1948 1949 unless (@stack) { # not in an L if there are no open fcodes 1950 $inL = 0; 1951 if (ref $lineage[-1][-1] && $lineage[-1][-1][0] eq 'L') { 1952 $lineage[-1][-1][1]{'raw'} = $raw 1953 } 1954 } 1955 $raw .= $5 if $inL; 1956 1957 } elsif(defined $6) { 1958 DEBUG > 3 and print STDERR "Found stuff \"$6\"\n"; 1959 push @{ $lineage[-1] }, $6; 1960 $raw .= $6 if $inL; 1961 # XXX does not capture multiplace whitespaces -- 'raw' ends up with 1962 # at most 1 leading/trailing whitespace, why not all of it? 1963 1964 } else { 1965 # should never ever ever ever happen 1966 DEBUG and print STDERR "AYYAYAAAAA at line ", __LINE__, "\n"; 1967 die "SPORK 512512!"; 1968 } 1969 } 1970 1971 if(@stack) { # Uhoh, some sequences weren't closed. 1972 my $x= "..."; 1973 while(@stack) { 1974 push @{ $lineage[-1] }, '' if 2 == @{ $lineage[-1] }; 1975 # Hmmmmm! 1976 1977 my $code = (pop @lineage)->[0]; 1978 my $ender_length = pop @stack; 1979 if($ender_length) { 1980 --$ender_length; 1981 $x = $code . ("<" x $ender_length) . " $x " . (">" x $ender_length); 1982 } else { 1983 $x = $code . "<$x>"; 1984 } 1985 } 1986 DEBUG > 1 and print STDERR "Unterminated $x sequence\n"; 1987 $self->whine($start_line, 1988 "Unterminated $x sequence", 1989 ); 1990 } 1991 1992 return $treelet; 1993} 1994 1995#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 1996 1997sub text_content_of_treelet { # method: $parser->text_content_of_treelet($lol) 1998 return stringify_lol($_[1]); 1999} 2000 2001sub stringify_lol { # function: stringify_lol($lol) 2002 my $string_form = ''; 2003 _stringify_lol( $_[0] => \$string_form ); 2004 return $string_form; 2005} 2006 2007sub _stringify_lol { # the real recursor 2008 my($lol, $to) = @_; 2009 for(my $i = 2; $i < @$lol; ++$i) { 2010 if( ref($lol->[$i] || '') and UNIVERSAL::isa($lol->[$i], 'ARRAY') ) { 2011 _stringify_lol( $lol->[$i], $to); # recurse! 2012 } else { 2013 $$to .= $lol->[$i]; 2014 } 2015 } 2016 return; 2017} 2018 2019#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 2020 2021sub _dump_curr_open { # return a string representation of the stack 2022 my $curr_open = $_[0]{'curr_open'}; 2023 2024 return '[empty]' unless @$curr_open; 2025 return join '; ', 2026 map {; 2027 ($_->[0] eq '=for') 2028 ? ( ($_->[1]{'~really'} || '=over') 2029 . ' ' . $_->[1]{'target'}) 2030 : $_->[0] 2031 } 2032 @$curr_open 2033 ; 2034} 2035 2036########################################################################### 2037my %pretty_form = ( 2038 "\a" => '\a', # ding! 2039 "\b" => '\b', # BS 2040 "\e" => '\e', # ESC 2041 "\f" => '\f', # FF 2042 "\t" => '\t', # tab 2043 "\cm" => '\cm', 2044 "\cj" => '\cj', 2045 "\n" => '\n', # probably overrides one of either \cm or \cj 2046 '"' => '\"', 2047 '\\' => '\\\\', 2048 '$' => '\\$', 2049 '@' => '\\@', 2050 '%' => '\\%', 2051 '#' => '\\#', 2052); 2053 2054sub pretty { # adopted from Class::Classless 2055 # Not the most brilliant routine, but passable. 2056 # Don't give it a cyclic data structure! 2057 my @stuff = @_; # copy 2058 my $x; 2059 my $out = 2060 # join ",\n" . 2061 join ", ", 2062 map {; 2063 if(!defined($_)) { 2064 "undef"; 2065 } elsif(ref($_) eq 'ARRAY' or ref($_) eq 'Pod::Simple::LinkSection') { 2066 $x = "[ " . pretty(@$_) . " ]" ; 2067 $x; 2068 } elsif(ref($_) eq 'SCALAR') { 2069 $x = "\\" . pretty($$_) ; 2070 $x; 2071 } elsif(ref($_) eq 'HASH') { 2072 my $hr = $_; 2073 $x = "{" . join(", ", 2074 map(pretty($_) . '=>' . pretty($hr->{$_}), 2075 sort keys %$hr ) ) . "}" ; 2076 $x; 2077 } elsif(!length($_)) { q{''} # empty string 2078 } elsif( 2079 $_ eq '0' # very common case 2080 or( 2081 m/^-?(?:[123456789]\d*|0)(?:\.\d+)?$/s 2082 and $_ ne '-0' # the strange case that RE lets thru 2083 ) 2084 ) { $_; 2085 } else { 2086 # Yes, explicitly name every character desired. There are shorcuts one 2087 # could make, but I (Karl Williamson) was afraid that some Perl 2088 # releases would have bugs in some of them. For example [A-Z] works 2089 # even on EBCDIC platforms to match exactly the 26 uppercase English 2090 # letters, but I don't know if it has always worked without bugs. It 2091 # seemed safest just to list the characters. 2092 # s<([^\x20\x21\x23\x27-\x3F\x41-\x5B\x5D-\x7E])> 2093 s<([^ !#'()*+,\-./0123456789:;\<=\>?ABCDEFGHIJKLMNOPQRSTUVWXYZ\[\]^_`abcdefghijklmnopqrstuvwxyz{|}~])> 2094 <$pretty_form{$1} || '\\x{'.sprintf("%x", ord($1)).'}'>eg; 2095 #<$pretty_form{$1} || '\\x'.(unpack("H2",$1))>eg; 2096 qq{"$_"}; 2097 } 2098 } @stuff; 2099 # $out =~ s/\n */ /g if length($out) < 75; 2100 return $out; 2101} 2102 2103#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 2104 2105# A rather unsubtle method of blowing away all the state information 2106# from a parser object so it can be reused. Provided as a utility for 2107# backward compatibility in Pod::Man, etc. but not recommended for 2108# general use. 2109 2110sub reinit { 2111 my $self = shift; 2112 foreach (qw(source_dead source_filename doc_has_started 2113start_of_pod_block content_seen last_was_blank paras curr_open 2114line_count pod_para_count in_pod ~tried_gen_errata all_errata errata errors_seen 2115Title)) { 2116 2117 delete $self->{$_}; 2118 } 2119} 2120 2121#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 21221; 2123 2124