xref: /openbsd-src/gnu/usr.bin/perl/cpan/Pod-Simple/lib/Pod/Simple/BlackBox.pm (revision c90a81c56dcebd6a1b73fe4aff9b03385b8e63b3)
1package Pod::Simple::BlackBox;
2#
3# "What's in the box?"  "Pain."
4#
5###########################################################################
6#
7# This is where all the scary things happen: parsing lines into
8#  paragraphs; and then into directives, verbatims, and then also
9#  turning formatting sequences into treelets.
10#
11# Are you really sure you want to read this code?
12#
13#-----------------------------------------------------------------------------
14#
15# The basic work of this module Pod::Simple::BlackBox is doing the dirty work
16# of parsing Pod into treelets (generally one per non-verbatim paragraph), and
17# to call the proper callbacks on the treelets.
18#
19# Every node in a treelet is a ['name', {attrhash}, ...children...]
20
21use integer; # vroom!
22use strict;
23use Carp ();
24use vars qw($VERSION );
25$VERSION = '3.32';
26#use constant DEBUG => 7;
27BEGIN {
28  require Pod::Simple;
29  *DEBUG = \&Pod::Simple::DEBUG unless defined &DEBUG
30}
31
32# Matches a character iff the character will have a different meaning
33# if we choose CP1252 vs UTF-8 if there is no =encoding line.
34# This is broken for early Perls on non-ASCII platforms.
35my $non_ascii_re = eval "qr/[[:^ascii:]]/";
36$non_ascii_re = qr/[\x80-\xFF]/ if ! defined $non_ascii_re;
37
38my $utf8_bom;
39if (($] ge 5.007_003)) {
40  $utf8_bom = "\x{FEFF}";
41  utf8::encode($utf8_bom);
42} else {
43  $utf8_bom = "\xEF\xBB\xBF";   # No EBCDIC BOM detection for early Perls.
44}
45
46#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
47
48sub parse_line { shift->parse_lines(@_) } # alias
49
50# - - -  Turn back now!  Run away!  - - -
51
52sub parse_lines {             # Usage: $parser->parse_lines(@lines)
53  # an undef means end-of-stream
54  my $self = shift;
55
56  my $code_handler = $self->{'code_handler'};
57  my $cut_handler  = $self->{'cut_handler'};
58  my $wl_handler   = $self->{'whiteline_handler'};
59  $self->{'line_count'} ||= 0;
60
61  my $scratch;
62
63  DEBUG > 4 and
64   print STDERR "# Parsing starting at line ", $self->{'line_count'}, ".\n";
65
66  DEBUG > 5 and
67   print STDERR "#  About to parse lines: ",
68     join(' ', map defined($_) ? "[$_]" : "EOF", @_), "\n";
69
70  my $paras = ($self->{'paras'} ||= []);
71   # paragraph buffer.  Because we need to defer processing of =over
72   # directives and verbatim paragraphs.  We call _ponder_paragraph_buffer
73   # to process this.
74
75  $self->{'pod_para_count'} ||= 0;
76
77  my $line;
78  foreach my $source_line (@_) {
79    if( $self->{'source_dead'} ) {
80      DEBUG > 4 and print STDERR "# Source is dead.\n";
81      last;
82    }
83
84    unless( defined $source_line ) {
85      DEBUG > 4 and print STDERR "# Undef-line seen.\n";
86
87      push @$paras, ['~end', {'start_line' => $self->{'line_count'}}];
88      push @$paras, $paras->[-1], $paras->[-1];
89       # So that it definitely fills the buffer.
90      $self->{'source_dead'} = 1;
91      $self->_ponder_paragraph_buffer;
92      next;
93    }
94
95
96    if( $self->{'line_count'}++ ) {
97      ($line = $source_line) =~ tr/\n\r//d;
98       # If we don't have two vars, we'll end up with that there
99       # tr/// modding the (potentially read-only) original source line!
100
101    } else {
102      DEBUG > 2 and print STDERR "First line: [$source_line]\n";
103
104      if( ($line = $source_line) =~ s/^$utf8_bom//s ) {
105        DEBUG and print STDERR "UTF-8 BOM seen.  Faking a '=encoding utf8'.\n";
106        $self->_handle_encoding_line( "=encoding utf8" );
107        delete $self->{'_processed_encoding'};
108        $line =~ tr/\n\r//d;
109
110      } elsif( $line =~ s/^\xFE\xFF//s ) {
111        DEBUG and print STDERR "Big-endian UTF-16 BOM seen.  Aborting parsing.\n";
112        $self->scream(
113          $self->{'line_count'},
114          "UTF16-BE Byte Encoding Mark found; but Pod::Simple v$Pod::Simple::VERSION doesn't implement UTF16 yet."
115        );
116        splice @_;
117        push @_, undef;
118        next;
119
120        # TODO: implement somehow?
121
122      } elsif( $line =~ s/^\xFF\xFE//s ) {
123        DEBUG and print STDERR "Little-endian UTF-16 BOM seen.  Aborting parsing.\n";
124        $self->scream(
125          $self->{'line_count'},
126          "UTF16-LE Byte Encoding Mark found; but Pod::Simple v$Pod::Simple::VERSION doesn't implement UTF16 yet."
127        );
128        splice @_;
129        push @_, undef;
130        next;
131
132        # TODO: implement somehow?
133
134      } else {
135        DEBUG > 2 and print STDERR "First line is BOM-less.\n";
136        ($line = $source_line) =~ tr/\n\r//d;
137      }
138    }
139
140    if(!$self->{'parse_characters'} && !$self->{'encoding'}
141      && ($self->{'in_pod'} || $line =~ /^=/s)
142      && $line =~ /$non_ascii_re/
143    ) {
144
145      my $encoding;
146
147      # No =encoding line, and we are at the first line in the input that
148      # contains a non-ascii byte, that is one whose meaning varies depending
149      # on whether the file is encoded in UTF-8 or CP1252, which are the two
150      # possibilities permitted by the pod spec.  (ASCII is assumed if the
151      # file only contains ASCII bytes.)  In order to process this line, we
152      # need to figure out what encoding we will use for the file.
153      #
154      # Strictly speaking ISO 8859-1 (Latin 1) refers to the code points
155      # 160-255, but it is used here, as it often colloquially is, to refer to
156      # the complete set of code points 0-255, including ASCII (0-127), the C1
157      # controls (128-159), and strict Latin 1 (160-255).
158      #
159      # CP1252 is effectively a superset of Latin 1, because it differs only
160      # from colloquial 8859-1 in the C1 controls, which are very unlikely to
161      # actually be present in 8859-1 files, so can be used for other purposes
162      # without conflict.  CP 1252 uses most of them for graphic characters.
163      #
164      # Note that all ASCII-range bytes represent their corresponding code
165      # points in CP1252 and UTF-8.  In ASCII platform UTF-8 all other code
166      # points require multiple (non-ASCII) bytes to represent.  (A separate
167      # paragraph for EBCDIC is below.)  The multi-byte representation is
168      # quite structured.  If we find an isolated byte that requires multiple
169      # bytes to represent in UTF-8, we know that the encoding is not UTF-8.
170      # If we find a sequence of bytes that violates the UTF-8 structure, we
171      # also can presume the encoding isn't UTF-8, and hence must be 1252.
172      #
173      # But there are ambiguous cases where we could guess wrong.  If so, the
174      # user will end up having to supply an =encoding line.  We use all
175      # readily available information to improve our chances of guessing
176      # right.  The odds of something not being UTF-8, but still passing a
177      # UTF-8 validity test go down very rapidly with increasing length of the
178      # sequence.  Therefore we look at all the maximal length non-ascii
179      # sequences on the line.  If any of the sequences can't be UTF-8, we
180      # quit there and choose CP1252.  If all could be UTF-8, we guess UTF-8.
181      #
182      # On EBCDIC platforms, the situation is somewhat different.  In
183      # UTF-EBCDIC, not only do ASCII-range bytes represent their code points,
184      # but so do the bytes that are for the C1 controls.  Recall that these
185      # correspond to the unused portion of 8859-1 that 1252 mostly takes
186      # over.  That means that there are fewer code points that are
187      # represented by multi-bytes.  But, note that the these controls are
188      # very unlikely to be in pod text.  So if we encounter one of them, it
189      # means that it is quite likely CP1252 and not UTF-8.  The net result is
190      # the same code below is used for both platforms.
191      while ($line =~ m/($non_ascii_re+)/g) {
192        my $non_ascii_seq = $1;
193
194        if (length $non_ascii_seq == 1) {
195          $encoding = 'CP1252';
196          goto guessed;
197        } elsif ($] ge 5.007_003) {
198
199          # On Perls that have this function, we can see if the sequence is
200          # valid UTF-8 or not.
201          if (! utf8::decode($non_ascii_seq)) {
202            $encoding = 'CP1252';
203            goto guessed;
204          }
205        } elsif (ord("A") == 65) {  # An early Perl, ASCII platform
206
207          # Without utf8::decode, it's a lot harder to do a rigorous check
208          # (though some early releases had a different function that
209          # accomplished the same thing).  Since these are ancient Perls, not
210          # likely to be in use today, we take the easy way out, and look at
211          # just the first two bytes of the sequence to see if they are the
212          # start of a UTF-8 character.  In ASCII UTF-8, continuation bytes
213          # must be between 0x80 and 0xBF.  Start bytes can range from 0xC2
214          # through 0xFF, but anything above 0xF4 is not Unicode, and hence
215          # extremely unlikely to be in a pod.
216          if ($non_ascii_seq !~ /^[\xC2-\xF4][\x80-\xBF]/) {
217            $encoding = 'CP1252';
218            goto guessed;
219          }
220
221          # We don't bother doing anything special for EBCDIC on early Perls.
222          # If there is a solitary variant, CP1252 will be chosen; otherwise
223          # UTF-8.
224        }
225      } # End of loop through all variant sequences on the line
226
227      # All sequences in the line could be UTF-8.  Guess that.
228      $encoding = 'UTF-8';
229
230    guessed:
231      $self->_handle_encoding_line( "=encoding $encoding" );
232      delete $self->{'_processed_encoding'};
233      $self->{'_transcoder'} && $self->{'_transcoder'}->($line);
234
235      my ($word) = $line =~ /(\S*$non_ascii_re\S*)/;
236
237      $self->whine(
238        $self->{'line_count'},
239        "Non-ASCII character seen before =encoding in '$word'. Assuming $encoding"
240      );
241    }
242
243    DEBUG > 5 and print STDERR "# Parsing line: [$line]\n";
244
245    if(!$self->{'in_pod'}) {
246      if($line =~ m/^=([a-zA-Z][a-zA-Z0-9]*)(?:\s|$)/s) {
247        if($1 eq 'cut') {
248          $self->scream(
249            $self->{'line_count'},
250            "=cut found outside a pod block.  Skipping to next block."
251          );
252
253          ## Before there were errata sections in the world, it was
254          ## least-pessimal to abort processing the file.  But now we can
255          ## just barrel on thru (but still not start a pod block).
256          #splice @_;
257          #push @_, undef;
258
259          next;
260        } else {
261          $self->{'in_pod'} = $self->{'start_of_pod_block'}
262                            = $self->{'last_was_blank'}     = 1;
263          # And fall thru to the pod-mode block further down
264        }
265      } else {
266        DEBUG > 5 and print STDERR "# It's a code-line.\n";
267        $code_handler->(map $_, $line, $self->{'line_count'}, $self)
268         if $code_handler;
269        # Note: this may cause code to be processed out of order relative
270        #  to pods, but in order relative to cuts.
271
272        # Note also that we haven't yet applied the transcoding to $line
273        #  by time we call $code_handler!
274
275        if( $line =~ m/^#\s*line\s+(\d+)\s*(?:\s"([^"]+)")?\s*$/ ) {
276          # That RE is from perlsyn, section "Plain Old Comments (Not!)",
277          #$fname = $2 if defined $2;
278          #DEBUG > 1 and defined $2 and print STDERR "# Setting fname to \"$fname\"\n";
279          DEBUG > 1 and print STDERR "# Setting nextline to $1\n";
280          $self->{'line_count'} = $1 - 1;
281        }
282
283        next;
284      }
285    }
286
287    # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
288    # Else we're in pod mode:
289
290    # Apply any necessary transcoding:
291    $self->{'_transcoder'} && $self->{'_transcoder'}->($line);
292
293    # HERE WE CATCH =encoding EARLY!
294    if( $line =~ m/^=encoding\s+\S+\s*$/s ) {
295      next if $self->parse_characters;   # Ignore this line
296      $line = $self->_handle_encoding_line( $line );
297    }
298
299    if($line =~ m/^=cut/s) {
300      # here ends the pod block, and therefore the previous pod para
301      DEBUG > 1 and print STDERR "Noting =cut at line ${$self}{'line_count'}\n";
302      $self->{'in_pod'} = 0;
303      # ++$self->{'pod_para_count'};
304      $self->_ponder_paragraph_buffer();
305       # by now it's safe to consider the previous paragraph as done.
306      $cut_handler->(map $_, $line, $self->{'line_count'}, $self)
307       if $cut_handler;
308
309      # TODO: add to docs: Note: this may cause cuts to be processed out
310      #  of order relative to pods, but in order relative to code.
311
312    } elsif($line =~ m/^(\s*)$/s) {  # it's a blank line
313      if (defined $1 and $1 =~ /[^\S\r\n]/) { # it's a white line
314        $wl_handler->(map $_, $line, $self->{'line_count'}, $self)
315          if $wl_handler;
316      }
317
318      if(!$self->{'start_of_pod_block'} and @$paras and $paras->[-1][0] eq '~Verbatim') {
319        DEBUG > 1 and print STDERR "Saving blank line at line ${$self}{'line_count'}\n";
320        push @{$paras->[-1]}, $line;
321      }  # otherwise it's not interesting
322
323      if(!$self->{'start_of_pod_block'} and !$self->{'last_was_blank'}) {
324        DEBUG > 1 and print STDERR "Noting para ends with blank line at ${$self}{'line_count'}\n";
325      }
326
327      $self->{'last_was_blank'} = 1;
328
329    } elsif($self->{'last_was_blank'}) {  # A non-blank line starting a new para...
330
331      if($line =~ m/^(=[a-zA-Z][a-zA-Z0-9]*)(?:\s+|$)(.*)/s) {
332        # THIS IS THE ONE PLACE WHERE WE CONSTRUCT NEW DIRECTIVE OBJECTS
333        my $new = [$1, {'start_line' => $self->{'line_count'}}, $2];
334         # Note that in "=head1 foo", the WS is lost.
335         # Example: ['=head1', {'start_line' => 123}, ' foo']
336
337        ++$self->{'pod_para_count'};
338
339        $self->_ponder_paragraph_buffer();
340         # by now it's safe to consider the previous paragraph as done.
341
342        push @$paras, $new; # the new incipient paragraph
343        DEBUG > 1 and print STDERR "Starting new ${$paras}[-1][0] para at line ${$self}{'line_count'}\n";
344
345      } elsif($line =~ m/^\s/s) {
346
347        if(!$self->{'start_of_pod_block'} and @$paras and $paras->[-1][0] eq '~Verbatim') {
348          DEBUG > 1 and print STDERR "Resuming verbatim para at line ${$self}{'line_count'}\n";
349          push @{$paras->[-1]}, $line;
350        } else {
351          ++$self->{'pod_para_count'};
352          $self->_ponder_paragraph_buffer();
353           # by now it's safe to consider the previous paragraph as done.
354          DEBUG > 1 and print STDERR "Starting verbatim para at line ${$self}{'line_count'}\n";
355          push @$paras, ['~Verbatim', {'start_line' => $self->{'line_count'}}, $line];
356        }
357      } else {
358        ++$self->{'pod_para_count'};
359        $self->_ponder_paragraph_buffer();
360         # by now it's safe to consider the previous paragraph as done.
361        push @$paras, ['~Para',  {'start_line' => $self->{'line_count'}}, $line];
362        DEBUG > 1 and print STDERR "Starting plain para at line ${$self}{'line_count'}\n";
363      }
364      $self->{'last_was_blank'} = $self->{'start_of_pod_block'} = 0;
365
366    } else {
367      # It's a non-blank line /continuing/ the current para
368      if(@$paras) {
369        DEBUG > 2 and print STDERR "Line ${$self}{'line_count'} continues current paragraph\n";
370        push @{$paras->[-1]}, $line;
371      } else {
372        # Unexpected case!
373        die "Continuing a paragraph but \@\$paras is empty?";
374      }
375      $self->{'last_was_blank'} = $self->{'start_of_pod_block'} = 0;
376    }
377
378  } # ends the big while loop
379
380  DEBUG > 1 and print STDERR (pretty(@$paras), "\n");
381  return $self;
382}
383
384#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
385
386sub _handle_encoding_line {
387  my($self, $line) = @_;
388
389  return if $self->parse_characters;
390
391  # The point of this routine is to set $self->{'_transcoder'} as indicated.
392
393  return $line unless $line =~ m/^=encoding\s+(\S+)\s*$/s;
394  DEBUG > 1 and print STDERR "Found an encoding line \"=encoding $1\"\n";
395
396  my $e    = $1;
397  my $orig = $e;
398  push @{ $self->{'encoding_command_reqs'} }, "=encoding $orig";
399
400  my $enc_error;
401
402  # Cf.   perldoc Encode   and   perldoc Encode::Supported
403
404  require Pod::Simple::Transcode;
405
406  if( $self->{'encoding'} ) {
407    my $norm_current = $self->{'encoding'};
408    my $norm_e = $e;
409    foreach my $that ($norm_current, $norm_e) {
410      $that =  lc($that);
411      $that =~ s/[-_]//g;
412    }
413    if($norm_current eq $norm_e) {
414      DEBUG > 1 and print STDERR "The '=encoding $orig' line is ",
415       "redundant.  ($norm_current eq $norm_e).  Ignoring.\n";
416      $enc_error = '';
417       # But that doesn't necessarily mean that the earlier one went okay
418    } else {
419      $enc_error = "Encoding is already set to " . $self->{'encoding'};
420      DEBUG > 1 and print STDERR $enc_error;
421    }
422  } elsif (
423    # OK, let's turn on the encoding
424    do {
425      DEBUG > 1 and print STDERR " Setting encoding to $e\n";
426      $self->{'encoding'} = $e;
427      1;
428    }
429    and $e eq 'HACKRAW'
430  ) {
431    DEBUG and print STDERR " Putting in HACKRAW (no-op) encoding mode.\n";
432
433  } elsif( Pod::Simple::Transcode::->encoding_is_available($e) ) {
434
435    die($enc_error = "WHAT? _transcoder is already set?!")
436     if $self->{'_transcoder'};   # should never happen
437    require Pod::Simple::Transcode;
438    $self->{'_transcoder'} = Pod::Simple::Transcode::->make_transcoder($e);
439    eval {
440      my @x = ('', "abc", "123");
441      $self->{'_transcoder'}->(@x);
442    };
443    $@ && die( $enc_error =
444      "Really unexpected error setting up encoding $e: $@\nAborting"
445    );
446    $self->{'detected_encoding'} = $e;
447
448  } else {
449    my @supported = Pod::Simple::Transcode::->all_encodings;
450
451    # Note unsupported, and complain
452    DEBUG and print STDERR " Encoding [$e] is unsupported.",
453      "\nSupporteds: @supported\n";
454    my $suggestion = '';
455
456    # Look for a near match:
457    my $norm = lc($e);
458    $norm =~ tr[-_][]d;
459    my $n;
460    foreach my $enc (@supported) {
461      $n = lc($enc);
462      $n =~ tr[-_][]d;
463      next unless $n eq $norm;
464      $suggestion = "  (Maybe \"$e\" should be \"$enc\"?)";
465      last;
466    }
467    my $encmodver = Pod::Simple::Transcode::->encmodver;
468    $enc_error = join '' =>
469      "This document probably does not appear as it should, because its ",
470      "\"=encoding $e\" line calls for an unsupported encoding.",
471      $suggestion, "  [$encmodver\'s supported encodings are: @supported]"
472    ;
473
474    $self->scream( $self->{'line_count'}, $enc_error );
475  }
476  push @{ $self->{'encoding_command_statuses'} }, $enc_error;
477  if (defined($self->{'_processed_encoding'})) {
478    # Double declaration.
479    $self->scream( $self->{'line_count'}, 'Cannot have multiple =encoding directives');
480  }
481  $self->{'_processed_encoding'} = $orig;
482
483  return $line;
484}
485
486# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
487
488sub _handle_encoding_second_level {
489  # By time this is called, the encoding (if well formed) will already
490  #  have been acted one.
491  my($self, $para) = @_;
492  my @x = @$para;
493  my $content = join ' ', splice @x, 2;
494  $content =~ s/^\s+//s;
495  $content =~ s/\s+$//s;
496
497  DEBUG > 2 and print STDERR "Ogling encoding directive: =encoding $content\n";
498
499  if (defined($self->{'_processed_encoding'})) {
500    #if($content ne $self->{'_processed_encoding'}) {
501    #  Could it happen?
502    #}
503    delete $self->{'_processed_encoding'};
504    # It's already been handled.  Check for errors.
505    if(! $self->{'encoding_command_statuses'} ) {
506      DEBUG > 2 and print STDERR " CRAZY ERROR: It wasn't really handled?!\n";
507    } elsif( $self->{'encoding_command_statuses'}[-1] ) {
508      $self->whine( $para->[1]{'start_line'},
509        sprintf "Couldn't do %s: %s",
510          $self->{'encoding_command_reqs'  }[-1],
511          $self->{'encoding_command_statuses'}[-1],
512      );
513    } else {
514      DEBUG > 2 and print STDERR " (Yup, it was successfully handled already.)\n";
515    }
516
517  } else {
518    # Otherwise it's a syntax error
519    $self->whine( $para->[1]{'start_line'},
520      "Invalid =encoding syntax: $content"
521    );
522  }
523
524  return;
525}
526
527#~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`~`
528
529{
530my $m = -321;   # magic line number
531
532sub _gen_errata {
533  my $self = $_[0];
534  # Return 0 or more fake-o paragraphs explaining the accumulated
535  #  errors on this document.
536
537  return() unless $self->{'errata'} and keys %{$self->{'errata'}};
538
539  my @out;
540
541  foreach my $line (sort {$a <=> $b} keys %{$self->{'errata'}}) {
542    push @out,
543      ['=item', {'start_line' => $m}, "Around line $line:"],
544      map( ['~Para', {'start_line' => $m, '~cooked' => 1},
545        #['~Top', {'start_line' => $m},
546        $_
547        #]
548        ],
549        @{$self->{'errata'}{$line}}
550      )
551    ;
552  }
553
554  # TODO: report of unknown entities? unrenderable characters?
555
556  unshift @out,
557    ['=head1', {'start_line' => $m, 'errata' => 1}, 'POD ERRORS'],
558    ['~Para', {'start_line' => $m, '~cooked' => 1, 'errata' => 1},
559     "Hey! ",
560     ['B', {},
561      'The above document had some coding errors, which are explained below:'
562     ]
563    ],
564    ['=over',  {'start_line' => $m, 'errata' => 1}, ''],
565  ;
566
567  push @out,
568    ['=back',  {'start_line' => $m, 'errata' => 1}, ''],
569  ;
570
571  DEBUG and print STDERR "\n<<\n", pretty(\@out), "\n>>\n\n";
572
573  return @out;
574}
575
576}
577
578#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
579
580##############################################################################
581##
582##  stop reading now stop reading now stop reading now stop reading now stop
583##
584##                         HERE IT BECOMES REALLY SCARY
585##
586##  stop reading now stop reading now stop reading now stop reading now stop
587##
588##############################################################################
589
590sub _ponder_paragraph_buffer {
591
592  # Para-token types as found in the buffer.
593  #   ~Verbatim, ~Para, ~end, =head1..4, =for, =begin, =end,
594  #   =over, =back, =item
595  #   and the null =pod (to be complained about if over one line)
596  #
597  # "~data" paragraphs are something we generate at this level, depending on
598  # a currently open =over region
599
600  # Events fired:  Begin and end for:
601  #                   directivename (like head1 .. head4), item, extend,
602  #                   for (from =begin...=end, =for),
603  #                   over-bullet, over-number, over-text, over-block,
604  #                   item-bullet, item-number, item-text,
605  #                   Document,
606  #                   Data, Para, Verbatim
607  #                   B, C, longdirname (TODO -- wha?), etc. for all directives
608  #
609
610  my $self = $_[0];
611  my $paras;
612  return unless @{$paras = $self->{'paras'}};
613  my $curr_open = ($self->{'curr_open'} ||= []);
614
615  my $scratch;
616
617  DEBUG > 10 and print STDERR "# Paragraph buffer: <<", pretty($paras), ">>\n";
618
619  # We have something in our buffer.  So apparently the document has started.
620  unless($self->{'doc_has_started'}) {
621    $self->{'doc_has_started'} = 1;
622
623    my $starting_contentless;
624    $starting_contentless =
625     (
626       !@$curr_open
627       and @$paras and ! grep $_->[0] ne '~end', @$paras
628        # i.e., if the paras is all ~ends
629     )
630    ;
631    DEBUG and print STDERR "# Starting ",
632      $starting_contentless ? 'contentless' : 'contentful',
633      " document\n"
634    ;
635
636    $self->_handle_element_start(
637      ($scratch = 'Document'),
638      {
639        'start_line' => $paras->[0][1]{'start_line'},
640        $starting_contentless ? ( 'contentless' => 1 ) : (),
641      },
642    );
643  }
644
645  my($para, $para_type);
646  while(@$paras) {
647    last if @$paras == 1 and
648      ( $paras->[0][0] eq '=over' or $paras->[0][0] eq '~Verbatim'
649        or $paras->[0][0] eq '=item' )
650    ;
651    # Those're the three kinds of paragraphs that require lookahead.
652    #   Actually, an "=item Foo" inside an <over type=text> region
653    #   and any =item inside an <over type=block> region (rare)
654    #   don't require any lookahead, but all others (bullets
655    #   and numbers) do.
656
657# TODO: whinge about many kinds of directives in non-resolving =for regions?
658# TODO: many?  like what?  =head1 etc?
659
660    $para = shift @$paras;
661    $para_type = $para->[0];
662
663    DEBUG > 1 and print STDERR "Pondering a $para_type paragraph, given the stack: (",
664      $self->_dump_curr_open(), ")\n";
665
666    if($para_type eq '=for') {
667      next if $self->_ponder_for($para,$curr_open,$paras);
668
669    } elsif($para_type eq '=begin') {
670      next if $self->_ponder_begin($para,$curr_open,$paras);
671
672    } elsif($para_type eq '=end') {
673      next if $self->_ponder_end($para,$curr_open,$paras);
674
675    } elsif($para_type eq '~end') { # The virtual end-document signal
676      next if $self->_ponder_doc_end($para,$curr_open,$paras);
677    }
678
679
680    # ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
681    #~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
682    if(grep $_->[1]{'~ignore'}, @$curr_open) {
683      DEBUG > 1 and
684       print STDERR "Skipping $para_type paragraph because in ignore mode.\n";
685      next;
686    }
687    #~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
688    # ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
689
690    if($para_type eq '=pod') {
691      $self->_ponder_pod($para,$curr_open,$paras);
692
693    } elsif($para_type eq '=over') {
694      next if $self->_ponder_over($para,$curr_open,$paras);
695
696    } elsif($para_type eq '=back') {
697      next if $self->_ponder_back($para,$curr_open,$paras);
698
699    } else {
700
701      # All non-magical codes!!!
702
703      # Here we start using $para_type for our own twisted purposes, to
704      #  mean how it should get treated, not as what the element name
705      #  should be.
706
707      DEBUG > 1 and print STDERR "Pondering non-magical $para_type\n";
708
709      my $i;
710
711      # Enforce some =headN discipline
712      if($para_type =~ m/^=head\d$/s
713         and ! $self->{'accept_heads_anywhere'}
714         and @$curr_open
715         and $curr_open->[-1][0] eq '=over'
716      ) {
717        DEBUG > 2 and print STDERR "'=$para_type' inside an '=over'!\n";
718        $self->whine(
719          $para->[1]{'start_line'},
720          "You forgot a '=back' before '$para_type'"
721        );
722        unshift @$paras, ['=back', {}, ''], $para;   # close the =over
723        next;
724      }
725
726
727      if($para_type eq '=item') {
728
729        my $over;
730        unless(@$curr_open and
731               $over = (grep { $_->[0] eq '=over' } @$curr_open)[-1]) {
732          $self->whine(
733            $para->[1]{'start_line'},
734            "'=item' outside of any '=over'"
735          );
736          unshift @$paras,
737            ['=over', {'start_line' => $para->[1]{'start_line'}}, ''],
738            $para
739          ;
740          next;
741        }
742
743
744        my $over_type = $over->[1]{'~type'};
745
746        if(!$over_type) {
747          # Shouldn't happen1
748          die "Typeless over in stack, starting at line "
749           . $over->[1]{'start_line'};
750
751        } elsif($over_type eq 'block') {
752          unless($curr_open->[-1][1]{'~bitched_about'}) {
753            $curr_open->[-1][1]{'~bitched_about'} = 1;
754            $self->whine(
755              $curr_open->[-1][1]{'start_line'},
756              "You can't have =items (as at line "
757              . $para->[1]{'start_line'}
758              . ") unless the first thing after the =over is an =item"
759            );
760          }
761          # Just turn it into a paragraph and reconsider it
762          $para->[0] = '~Para';
763          unshift @$paras, $para;
764          next;
765
766        } elsif($over_type eq 'text') {
767          my $item_type = $self->_get_item_type($para);
768            # That kills the content of the item if it's a number or bullet.
769          DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
770
771          if($item_type eq 'text') {
772            # Nothing special needs doing for 'text'
773          } elsif($item_type eq 'number' or $item_type eq 'bullet') {
774            $self->whine(
775              $para->[1]{'start_line'},
776              "Expected text after =item, not a $item_type"
777            );
778            # Undo our clobbering:
779            push @$para, $para->[1]{'~orig_content'};
780            delete $para->[1]{'number'};
781             # Only a PROPER item-number element is allowed
782             #  to have a number attribute.
783          } else {
784            die "Unhandled item type $item_type"; # should never happen
785          }
786
787          # =item-text thingies don't need any assimilation, it seems.
788
789        } elsif($over_type eq 'number') {
790          my $item_type = $self->_get_item_type($para);
791            # That kills the content of the item if it's a number or bullet.
792          DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
793
794          my $expected_value = ++ $curr_open->[-1][1]{'~counter'};
795
796          if($item_type eq 'bullet') {
797            # Hm, it's not numeric.  Correct for this.
798            $para->[1]{'number'} = $expected_value;
799            $self->whine(
800              $para->[1]{'start_line'},
801              "Expected '=item $expected_value'"
802            );
803            push @$para, $para->[1]{'~orig_content'};
804              # restore the bullet, blocking the assimilation of next para
805
806          } elsif($item_type eq 'text') {
807            # Hm, it's not numeric.  Correct for this.
808            $para->[1]{'number'} = $expected_value;
809            $self->whine(
810              $para->[1]{'start_line'},
811              "Expected '=item $expected_value'"
812            );
813            # Text content will still be there and will block next ~Para
814
815          } elsif($item_type ne 'number') {
816            die "Unknown item type $item_type"; # should never happen
817
818          } elsif($expected_value == $para->[1]{'number'}) {
819            DEBUG > 1 and print STDERR " Numeric item has the expected value of $expected_value\n";
820
821          } else {
822            DEBUG > 1 and print STDERR " Numeric item has ", $para->[1]{'number'},
823             " instead of the expected value of $expected_value\n";
824            $self->whine(
825              $para->[1]{'start_line'},
826              "You have '=item " . $para->[1]{'number'} .
827              "' instead of the expected '=item $expected_value'"
828            );
829            $para->[1]{'number'} = $expected_value;  # correcting!!
830          }
831
832          if(@$para == 2) {
833            # For the cases where we /didn't/ push to @$para
834            if($paras->[0][0] eq '~Para') {
835              DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
836              push @$para, splice @{shift @$paras},2;
837            } else {
838              DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
839              push @$para, '';  # Just so it's not contentless
840            }
841          }
842
843
844        } elsif($over_type eq 'bullet') {
845          my $item_type = $self->_get_item_type($para);
846            # That kills the content of the item if it's a number or bullet.
847          DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
848
849          if($item_type eq 'bullet') {
850            # as expected!
851
852            if( $para->[1]{'~_freaky_para_hack'} ) {
853              DEBUG and print STDERR "Accomodating '=item * Foo' tolerance hack.\n";
854              push @$para, delete $para->[1]{'~_freaky_para_hack'};
855            }
856
857          } elsif($item_type eq 'number') {
858            $self->whine(
859              $para->[1]{'start_line'},
860              "Expected '=item *'"
861            );
862            push @$para, $para->[1]{'~orig_content'};
863             # and block assimilation of the next paragraph
864            delete $para->[1]{'number'};
865             # Only a PROPER item-number element is allowed
866             #  to have a number attribute.
867          } elsif($item_type eq 'text') {
868            $self->whine(
869              $para->[1]{'start_line'},
870              "Expected '=item *'"
871            );
872             # But doesn't need processing.  But it'll block assimilation
873             #  of the next para.
874          } else {
875            die "Unhandled item type $item_type"; # should never happen
876          }
877
878          if(@$para == 2) {
879            # For the cases where we /didn't/ push to @$para
880            if($paras->[0][0] eq '~Para') {
881              DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
882              push @$para, splice @{shift @$paras},2;
883            } else {
884              DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
885              push @$para, '';  # Just so it's not contentless
886            }
887          }
888
889        } else {
890          die "Unhandled =over type \"$over_type\"?";
891          # Shouldn't happen!
892        }
893
894        $para_type = 'Plain';
895        $para->[0] .= '-' . $over_type;
896        # Whew.  Now fall thru and process it.
897
898
899      } elsif($para_type eq '=extend') {
900        # Well, might as well implement it here.
901        $self->_ponder_extend($para);
902        next;  # and skip
903      } elsif($para_type eq '=encoding') {
904        # Not actually acted on here, but we catch errors here.
905        $self->_handle_encoding_second_level($para);
906        next unless $self->keep_encoding_directive;
907        $para_type = 'Plain';
908      } elsif($para_type eq '~Verbatim') {
909        $para->[0] = 'Verbatim';
910        $para_type = '?Verbatim';
911      } elsif($para_type eq '~Para') {
912        $para->[0] = 'Para';
913        $para_type = '?Plain';
914      } elsif($para_type eq 'Data') {
915        $para->[0] = 'Data';
916        $para_type = '?Data';
917      } elsif( $para_type =~ s/^=//s
918        and defined( $para_type = $self->{'accept_directives'}{$para_type} )
919      ) {
920        DEBUG > 1 and print STDERR " Pondering known directive ${$para}[0] as $para_type\n";
921      } else {
922        # An unknown directive!
923        DEBUG > 1 and printf STDERR "Unhandled directive %s (Handled: %s)\n",
924         $para->[0], join(' ', sort keys %{$self->{'accept_directives'}} )
925        ;
926        $self->whine(
927          $para->[1]{'start_line'},
928          "Unknown directive: $para->[0]"
929        );
930
931        # And maybe treat it as text instead of just letting it go?
932        next;
933      }
934
935      if($para_type =~ s/^\?//s) {
936        if(! @$curr_open) {  # usual case
937          DEBUG and print STDERR "Treating $para_type paragraph as such because stack is empty.\n";
938        } else {
939          my @fors = grep $_->[0] eq '=for', @$curr_open;
940          DEBUG > 1 and print STDERR "Containing fors: ",
941            join(',', map $_->[1]{'target'}, @fors), "\n";
942
943          if(! @fors) {
944            DEBUG and print STDERR "Treating $para_type paragraph as such because stack has no =for's\n";
945
946          #} elsif(grep $_->[1]{'~resolve'}, @fors) {
947          #} elsif(not grep !$_->[1]{'~resolve'}, @fors) {
948          } elsif( $fors[-1][1]{'~resolve'} ) {
949            # Look to the immediately containing for
950
951            if($para_type eq 'Data') {
952              DEBUG and print STDERR "Treating Data paragraph as Plain/Verbatim because the containing =for ($fors[-1][1]{'target'}) is a resolver\n";
953              $para->[0] = 'Para';
954              $para_type = 'Plain';
955            } else {
956              DEBUG and print STDERR "Treating $para_type paragraph as such because the containing =for ($fors[-1][1]{'target'}) is a resolver\n";
957            }
958          } else {
959            DEBUG and print STDERR "Treating $para_type paragraph as Data because the containing =for ($fors[-1][1]{'target'}) is a non-resolver\n";
960            $para->[0] = $para_type = 'Data';
961          }
962        }
963      }
964
965      #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
966      if($para_type eq 'Plain') {
967        $self->_ponder_Plain($para);
968      } elsif($para_type eq 'Verbatim') {
969        $self->_ponder_Verbatim($para);
970      } elsif($para_type eq 'Data') {
971        $self->_ponder_Data($para);
972      } else {
973        die "\$para type is $para_type -- how did that happen?";
974        # Shouldn't happen.
975      }
976
977      #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
978      $para->[0] =~ s/^[~=]//s;
979
980      DEBUG and print STDERR "\n", pretty($para), "\n";
981
982      # traverse the treelet (which might well be just one string scalar)
983      $self->{'content_seen'} ||= 1;
984      $self->_traverse_treelet_bit(@$para);
985    }
986  }
987
988  return;
989}
990
991###########################################################################
992# The sub-ponderers...
993
994
995
996sub _ponder_for {
997  my ($self,$para,$curr_open,$paras) = @_;
998
999  # Fake it out as a begin/end
1000  my $target;
1001
1002  if(grep $_->[1]{'~ignore'}, @$curr_open) {
1003    DEBUG > 1 and print STDERR "Ignoring ignorable =for\n";
1004    return 1;
1005  }
1006
1007  for(my $i = 2; $i < @$para; ++$i) {
1008    if($para->[$i] =~ s/^\s*(\S+)\s*//s) {
1009      $target = $1;
1010      last;
1011    }
1012  }
1013  unless(defined $target) {
1014    $self->whine(
1015      $para->[1]{'start_line'},
1016      "=for without a target?"
1017    );
1018    return 1;
1019  }
1020  DEBUG > 1 and
1021   print STDERR "Faking out a =for $target as a =begin $target / =end $target\n";
1022
1023  $para->[0] = 'Data';
1024
1025  unshift @$paras,
1026    ['=begin',
1027      {'start_line' => $para->[1]{'start_line'}, '~really' => '=for'},
1028      $target,
1029    ],
1030    $para,
1031    ['=end',
1032      {'start_line' => $para->[1]{'start_line'}, '~really' => '=for'},
1033      $target,
1034    ],
1035  ;
1036
1037  return 1;
1038}
1039
1040sub _ponder_begin {
1041  my ($self,$para,$curr_open,$paras) = @_;
1042  my $content = join ' ', splice @$para, 2;
1043  $content =~ s/^\s+//s;
1044  $content =~ s/\s+$//s;
1045  unless(length($content)) {
1046    $self->whine(
1047      $para->[1]{'start_line'},
1048      "=begin without a target?"
1049    );
1050    DEBUG and print STDERR "Ignoring targetless =begin\n";
1051    return 1;
1052  }
1053
1054  my ($target, $title) = $content =~ m/^(\S+)\s*(.*)$/;
1055  $para->[1]{'title'} = $title if ($title);
1056  $para->[1]{'target'} = $target;  # without any ':'
1057  $content = $target; # strip off the title
1058
1059  $content =~ s/^:!/!:/s;
1060  my $neg;  # whether this is a negation-match
1061  $neg = 1        if $content =~ s/^!//s;
1062  my $to_resolve;  # whether to process formatting codes
1063  $to_resolve = 1 if $content =~ s/^://s;
1064
1065  my $dont_ignore; # whether this target matches us
1066
1067  foreach my $target_name (
1068    split(',', $content, -1),
1069    $neg ? () : '*'
1070  ) {
1071    DEBUG > 2 and
1072     print STDERR " Considering whether =begin $content matches $target_name\n";
1073    next unless $self->{'accept_targets'}{$target_name};
1074
1075    DEBUG > 2 and
1076     print STDERR "  It DOES match the acceptable target $target_name!\n";
1077    $to_resolve = 1
1078      if $self->{'accept_targets'}{$target_name} eq 'force_resolve';
1079    $dont_ignore = 1;
1080    $para->[1]{'target_matching'} = $target_name;
1081    last; # stop looking at other target names
1082  }
1083
1084  if($neg) {
1085    if( $dont_ignore ) {
1086      $dont_ignore = '';
1087      delete $para->[1]{'target_matching'};
1088      DEBUG > 2 and print STDERR " But the leading ! means that this is a NON-match!\n";
1089    } else {
1090      $dont_ignore = 1;
1091      $para->[1]{'target_matching'} = '!';
1092      DEBUG > 2 and print STDERR " But the leading ! means that this IS a match!\n";
1093    }
1094  }
1095
1096  $para->[0] = '=for';  # Just what we happen to call these, internally
1097  $para->[1]{'~really'} ||= '=begin';
1098  $para->[1]{'~ignore'}   = (! $dont_ignore) || 0;
1099  $para->[1]{'~resolve'}  = $to_resolve || 0;
1100
1101  DEBUG > 1 and print STDERR " Making note to ", $dont_ignore ? 'not ' : '',
1102    "ignore contents of this region\n";
1103  DEBUG > 1 and $dont_ignore and print STDERR " Making note to treat contents as ",
1104    ($to_resolve ? 'verbatim/plain' : 'data'), " paragraphs\n";
1105  DEBUG > 1 and print STDERR " (Stack now: ", $self->_dump_curr_open(), ")\n";
1106
1107  push @$curr_open, $para;
1108  if(!$dont_ignore or scalar grep $_->[1]{'~ignore'}, @$curr_open) {
1109    DEBUG > 1 and print STDERR "Ignoring ignorable =begin\n";
1110  } else {
1111    $self->{'content_seen'} ||= 1;
1112    $self->_handle_element_start((my $scratch='for'), $para->[1]);
1113  }
1114
1115  return 1;
1116}
1117
1118sub _ponder_end {
1119  my ($self,$para,$curr_open,$paras) = @_;
1120  my $content = join ' ', splice @$para, 2;
1121  $content =~ s/^\s+//s;
1122  $content =~ s/\s+$//s;
1123  DEBUG and print STDERR "Ogling '=end $content' directive\n";
1124
1125  unless(length($content)) {
1126    $self->whine(
1127      $para->[1]{'start_line'},
1128      "'=end' without a target?" . (
1129        ( @$curr_open and $curr_open->[-1][0] eq '=for' )
1130        ? ( " (Should be \"=end " . $curr_open->[-1][1]{'target'} . '")' )
1131        : ''
1132      )
1133    );
1134    DEBUG and print STDERR "Ignoring targetless =end\n";
1135    return 1;
1136  }
1137
1138  unless($content =~ m/^\S+$/) {  # i.e., unless it's one word
1139    $self->whine(
1140      $para->[1]{'start_line'},
1141      "'=end $content' is invalid.  (Stack: "
1142      . $self->_dump_curr_open() . ')'
1143    );
1144    DEBUG and print STDERR "Ignoring mistargetted =end $content\n";
1145    return 1;
1146  }
1147
1148  unless(@$curr_open and $curr_open->[-1][0] eq '=for') {
1149    $self->whine(
1150      $para->[1]{'start_line'},
1151      "=end $content without matching =begin.  (Stack: "
1152      . $self->_dump_curr_open() . ')'
1153    );
1154    DEBUG and print STDERR "Ignoring mistargetted =end $content\n";
1155    return 1;
1156  }
1157
1158  unless($content eq $curr_open->[-1][1]{'target'}) {
1159    $self->whine(
1160      $para->[1]{'start_line'},
1161      "=end $content doesn't match =begin "
1162      . $curr_open->[-1][1]{'target'}
1163      . ".  (Stack: "
1164      . $self->_dump_curr_open() . ')'
1165    );
1166    DEBUG and print STDERR "Ignoring mistargetted =end $content at line $para->[1]{'start_line'}\n";
1167    return 1;
1168  }
1169
1170  # Else it's okay to close...
1171  if(grep $_->[1]{'~ignore'}, @$curr_open) {
1172    DEBUG > 1 and print STDERR "Not firing any event for this =end $content because in an ignored region\n";
1173    # And that may be because of this to-be-closed =for region, or some
1174    #  other one, but it doesn't matter.
1175  } else {
1176    $curr_open->[-1][1]{'start_line'} = $para->[1]{'start_line'};
1177      # what's that for?
1178
1179    $self->{'content_seen'} ||= 1;
1180    $self->_handle_element_end( my $scratch = 'for', $para->[1]);
1181  }
1182  DEBUG > 1 and print STDERR "Popping $curr_open->[-1][0] $curr_open->[-1][1]{'target'} because of =end $content\n";
1183  pop @$curr_open;
1184
1185  return 1;
1186}
1187
1188sub _ponder_doc_end {
1189  my ($self,$para,$curr_open,$paras) = @_;
1190  if(@$curr_open) { # Deal with things left open
1191    DEBUG and print STDERR "Stack is nonempty at end-document: (",
1192      $self->_dump_curr_open(), ")\n";
1193
1194    DEBUG > 9 and print STDERR "Stack: ", pretty($curr_open), "\n";
1195    unshift @$paras, $self->_closers_for_all_curr_open;
1196    # Make sure there is exactly one ~end in the parastack, at the end:
1197    @$paras = grep $_->[0] ne '~end', @$paras;
1198    push @$paras, $para, $para;
1199     # We need two -- once for the next cycle where we
1200     #  generate errata, and then another to be at the end
1201     #  when that loop back around to process the errata.
1202    return 1;
1203
1204  } else {
1205    DEBUG and print STDERR "Okay, stack is empty now.\n";
1206  }
1207
1208  # Try generating errata section, if applicable
1209  unless($self->{'~tried_gen_errata'}) {
1210    $self->{'~tried_gen_errata'} = 1;
1211    my @extras = $self->_gen_errata();
1212    if(@extras) {
1213      unshift @$paras, @extras;
1214      DEBUG and print STDERR "Generated errata... relooping...\n";
1215      return 1;  # I.e., loop around again to process these fake-o paragraphs
1216    }
1217  }
1218
1219  splice @$paras; # Well, that's that for this paragraph buffer.
1220  DEBUG and print STDERR "Throwing end-document event.\n";
1221
1222  $self->_handle_element_end( my $scratch = 'Document' );
1223  return 1; # Hasta la byebye
1224}
1225
1226sub _ponder_pod {
1227  my ($self,$para,$curr_open,$paras) = @_;
1228  $self->whine(
1229    $para->[1]{'start_line'},
1230    "=pod directives shouldn't be over one line long!  Ignoring all "
1231     . (@$para - 2) . " lines of content"
1232  ) if @$para > 3;
1233
1234  # Content ignored unless 'pod_handler' is set
1235  if (my $pod_handler = $self->{'pod_handler'}) {
1236      my ($line_num, $line) = map $_, $para->[1]{'start_line'}, $para->[2];
1237      $line = $line eq '' ? "=pod" : "=pod $line"; # imitate cut_handler output
1238      $pod_handler->($line, $line_num, $self);
1239  }
1240
1241  # The surrounding methods set content_seen, so let us remain consistent.
1242  # I do not know why it was not here before -- should it not be here?
1243  # $self->{'content_seen'} ||= 1;
1244
1245  return;
1246}
1247
1248sub _ponder_over {
1249  my ($self,$para,$curr_open,$paras) = @_;
1250  return 1 unless @$paras;
1251  my $list_type;
1252
1253  if($paras->[0][0] eq '=item') { # most common case
1254    $list_type = $self->_get_initial_item_type($paras->[0]);
1255
1256  } elsif($paras->[0][0] eq '=back') {
1257    # Ignore empty lists by default
1258    if ($self->{'parse_empty_lists'}) {
1259      $list_type = 'empty';
1260    } else {
1261      shift @$paras;
1262      return 1;
1263    }
1264  } elsif($paras->[0][0] eq '~end') {
1265    $self->whine(
1266      $para->[1]{'start_line'},
1267      "=over is the last thing in the document?!"
1268    );
1269    return 1; # But feh, ignore it.
1270  } else {
1271    $list_type = 'block';
1272  }
1273  $para->[1]{'~type'} = $list_type;
1274  push @$curr_open, $para;
1275   # yes, we reuse the paragraph as a stack item
1276
1277  my $content = join ' ', splice @$para, 2;
1278  my $overness;
1279  if($content =~ m/^\s*$/s) {
1280    $para->[1]{'indent'} = 4;
1281  } elsif($content =~ m/^\s*((?:\d*\.)?\d+)\s*$/s) {
1282    no integer;
1283    $para->[1]{'indent'} = $1;
1284    if($1 == 0) {
1285      $self->whine(
1286        $para->[1]{'start_line'},
1287        "Can't have a 0 in =over $content"
1288      );
1289      $para->[1]{'indent'} = 4;
1290    }
1291  } else {
1292    $self->whine(
1293      $para->[1]{'start_line'},
1294      "=over should be: '=over' or '=over positive_number'"
1295    );
1296    $para->[1]{'indent'} = 4;
1297  }
1298  DEBUG > 1 and print STDERR "=over found of type $list_type\n";
1299
1300  $self->{'content_seen'} ||= 1;
1301  $self->_handle_element_start((my $scratch = 'over-' . $list_type), $para->[1]);
1302
1303  return;
1304}
1305
1306sub _ponder_back {
1307  my ($self,$para,$curr_open,$paras) = @_;
1308  # TODO: fire off </item-number> or </item-bullet> or </item-text> ??
1309
1310  my $content = join ' ', splice @$para, 2;
1311  if($content =~ m/\S/) {
1312    $self->whine(
1313      $para->[1]{'start_line'},
1314      "=back doesn't take any parameters, but you said =back $content"
1315    );
1316  }
1317
1318  if(@$curr_open and $curr_open->[-1][0] eq '=over') {
1319    DEBUG > 1 and print STDERR "=back happily closes matching =over\n";
1320    # Expected case: we're closing the most recently opened thing
1321    #my $over = pop @$curr_open;
1322    $self->{'content_seen'} ||= 1;
1323    $self->_handle_element_end( my $scratch =
1324      'over-' . ( (pop @$curr_open)->[1]{'~type'} ), $para->[1]
1325    );
1326  } else {
1327    DEBUG > 1 and print STDERR "=back found without a matching =over.  Stack: (",
1328        join(', ', map $_->[0], @$curr_open), ").\n";
1329    $self->whine(
1330      $para->[1]{'start_line'},
1331      '=back without =over'
1332    );
1333    return 1; # and ignore it
1334  }
1335}
1336
1337sub _ponder_item {
1338  my ($self,$para,$curr_open,$paras) = @_;
1339  my $over;
1340  unless(@$curr_open and
1341         $over = (grep { $_->[0] eq '=over' } @$curr_open)[-1]) {
1342    $self->whine(
1343      $para->[1]{'start_line'},
1344      "'=item' outside of any '=over'"
1345    );
1346    unshift @$paras,
1347      ['=over', {'start_line' => $para->[1]{'start_line'}}, ''],
1348      $para
1349    ;
1350    return 1;
1351  }
1352
1353
1354  my $over_type = $over->[1]{'~type'};
1355
1356  if(!$over_type) {
1357    # Shouldn't happen1
1358    die "Typeless over in stack, starting at line "
1359     . $over->[1]{'start_line'};
1360
1361  } elsif($over_type eq 'block') {
1362    unless($curr_open->[-1][1]{'~bitched_about'}) {
1363      $curr_open->[-1][1]{'~bitched_about'} = 1;
1364      $self->whine(
1365        $curr_open->[-1][1]{'start_line'},
1366        "You can't have =items (as at line "
1367        . $para->[1]{'start_line'}
1368        . ") unless the first thing after the =over is an =item"
1369      );
1370    }
1371    # Just turn it into a paragraph and reconsider it
1372    $para->[0] = '~Para';
1373    unshift @$paras, $para;
1374    return 1;
1375
1376  } elsif($over_type eq 'text') {
1377    my $item_type = $self->_get_item_type($para);
1378      # That kills the content of the item if it's a number or bullet.
1379    DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1380
1381    if($item_type eq 'text') {
1382      # Nothing special needs doing for 'text'
1383    } elsif($item_type eq 'number' or $item_type eq 'bullet') {
1384      $self->whine(
1385          $para->[1]{'start_line'},
1386          "Expected text after =item, not a $item_type"
1387      );
1388      # Undo our clobbering:
1389      push @$para, $para->[1]{'~orig_content'};
1390      delete $para->[1]{'number'};
1391       # Only a PROPER item-number element is allowed
1392       #  to have a number attribute.
1393    } else {
1394      die "Unhandled item type $item_type"; # should never happen
1395    }
1396
1397    # =item-text thingies don't need any assimilation, it seems.
1398
1399  } elsif($over_type eq 'number') {
1400    my $item_type = $self->_get_item_type($para);
1401      # That kills the content of the item if it's a number or bullet.
1402    DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1403
1404    my $expected_value = ++ $curr_open->[-1][1]{'~counter'};
1405
1406    if($item_type eq 'bullet') {
1407      # Hm, it's not numeric.  Correct for this.
1408      $para->[1]{'number'} = $expected_value;
1409      $self->whine(
1410        $para->[1]{'start_line'},
1411        "Expected '=item $expected_value'"
1412      );
1413      push @$para, $para->[1]{'~orig_content'};
1414        # restore the bullet, blocking the assimilation of next para
1415
1416    } elsif($item_type eq 'text') {
1417      # Hm, it's not numeric.  Correct for this.
1418      $para->[1]{'number'} = $expected_value;
1419      $self->whine(
1420        $para->[1]{'start_line'},
1421        "Expected '=item $expected_value'"
1422      );
1423      # Text content will still be there and will block next ~Para
1424
1425    } elsif($item_type ne 'number') {
1426      die "Unknown item type $item_type"; # should never happen
1427
1428    } elsif($expected_value == $para->[1]{'number'}) {
1429      DEBUG > 1 and print STDERR " Numeric item has the expected value of $expected_value\n";
1430
1431    } else {
1432      DEBUG > 1 and print STDERR " Numeric item has ", $para->[1]{'number'},
1433       " instead of the expected value of $expected_value\n";
1434      $self->whine(
1435        $para->[1]{'start_line'},
1436        "You have '=item " . $para->[1]{'number'} .
1437        "' instead of the expected '=item $expected_value'"
1438      );
1439      $para->[1]{'number'} = $expected_value;  # correcting!!
1440    }
1441
1442    if(@$para == 2) {
1443      # For the cases where we /didn't/ push to @$para
1444      if($paras->[0][0] eq '~Para') {
1445        DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1446        push @$para, splice @{shift @$paras},2;
1447      } else {
1448        DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1449        push @$para, '';  # Just so it's not contentless
1450      }
1451    }
1452
1453
1454  } elsif($over_type eq 'bullet') {
1455    my $item_type = $self->_get_item_type($para);
1456      # That kills the content of the item if it's a number or bullet.
1457    DEBUG and print STDERR " Item is of type ", $para->[0], " under $over_type\n";
1458
1459    if($item_type eq 'bullet') {
1460      # as expected!
1461
1462      if( $para->[1]{'~_freaky_para_hack'} ) {
1463        DEBUG and print STDERR "Accomodating '=item * Foo' tolerance hack.\n";
1464        push @$para, delete $para->[1]{'~_freaky_para_hack'};
1465      }
1466
1467    } elsif($item_type eq 'number') {
1468      $self->whine(
1469        $para->[1]{'start_line'},
1470        "Expected '=item *'"
1471      );
1472      push @$para, $para->[1]{'~orig_content'};
1473       # and block assimilation of the next paragraph
1474      delete $para->[1]{'number'};
1475       # Only a PROPER item-number element is allowed
1476       #  to have a number attribute.
1477    } elsif($item_type eq 'text') {
1478      $self->whine(
1479        $para->[1]{'start_line'},
1480        "Expected '=item *'"
1481      );
1482       # But doesn't need processing.  But it'll block assimilation
1483       #  of the next para.
1484    } else {
1485      die "Unhandled item type $item_type"; # should never happen
1486    }
1487
1488    if(@$para == 2) {
1489      # For the cases where we /didn't/ push to @$para
1490      if($paras->[0][0] eq '~Para') {
1491        DEBUG and print STDERR "Assimilating following ~Para content into $over_type item\n";
1492        push @$para, splice @{shift @$paras},2;
1493      } else {
1494        DEBUG and print STDERR "Can't assimilate following ", $paras->[0][0], "\n";
1495        push @$para, '';  # Just so it's not contentless
1496      }
1497    }
1498
1499  } else {
1500    die "Unhandled =over type \"$over_type\"?";
1501    # Shouldn't happen!
1502  }
1503  $para->[0] .= '-' . $over_type;
1504
1505  return;
1506}
1507
1508sub _ponder_Plain {
1509  my ($self,$para) = @_;
1510  DEBUG and print STDERR " giving plain treatment...\n";
1511  unless( @$para == 2 or ( @$para == 3 and $para->[2] eq '' )
1512    or $para->[1]{'~cooked'}
1513  ) {
1514    push @$para,
1515    @{$self->_make_treelet(
1516      join("\n", splice(@$para, 2)),
1517      $para->[1]{'start_line'}
1518    )};
1519  }
1520  # Empty paragraphs don't need a treelet for any reason I can see.
1521  # And precooked paragraphs already have a treelet.
1522  return;
1523}
1524
1525sub _ponder_Verbatim {
1526  my ($self,$para) = @_;
1527  DEBUG and print STDERR " giving verbatim treatment...\n";
1528
1529  $para->[1]{'xml:space'} = 'preserve';
1530
1531  my $indent = $self->strip_verbatim_indent;
1532  if ($indent && ref $indent eq 'CODE') {
1533      my @shifted = (shift @{$para}, shift @{$para});
1534      $indent = $indent->($para);
1535      unshift @{$para}, @shifted;
1536  }
1537
1538  for(my $i = 2; $i < @$para; $i++) {
1539    foreach my $line ($para->[$i]) { # just for aliasing
1540      # Strip indentation.
1541      $line =~ s/^\Q$indent// if $indent
1542          && !($self->{accept_codes} && $self->{accept_codes}{VerbatimFormatted});
1543      while( $line =~
1544        # Sort of adapted from Text::Tabs -- yes, it's hardwired in that
1545        # tabs are at every EIGHTH column.  For portability, it has to be
1546        # one setting everywhere, and 8th wins.
1547        s/^([^\t]*)(\t+)/$1.(" " x ((length($2)<<3)-(length($1)&7)))/e
1548      ) {}
1549
1550      # TODO: whinge about (or otherwise treat) unindented or overlong lines
1551
1552    }
1553  }
1554
1555  # Now the VerbatimFormatted hoodoo...
1556  if( $self->{'accept_codes'} and
1557      $self->{'accept_codes'}{'VerbatimFormatted'}
1558  ) {
1559    while(@$para > 3 and $para->[-1] !~ m/\S/) { pop @$para }
1560     # Kill any number of terminal newlines
1561    $self->_verbatim_format($para);
1562  } elsif ($self->{'codes_in_verbatim'}) {
1563    push @$para,
1564    @{$self->_make_treelet(
1565      join("\n", splice(@$para, 2)),
1566      $para->[1]{'start_line'}, $para->[1]{'xml:space'}
1567    )};
1568    $para->[-1] =~ s/\n+$//s; # Kill any number of terminal newlines
1569  } else {
1570    push @$para, join "\n", splice(@$para, 2) if @$para > 3;
1571    $para->[-1] =~ s/\n+$//s; # Kill any number of terminal newlines
1572  }
1573  return;
1574}
1575
1576sub _ponder_Data {
1577  my ($self,$para) = @_;
1578  DEBUG and print STDERR " giving data treatment...\n";
1579  $para->[1]{'xml:space'} = 'preserve';
1580  push @$para, join "\n", splice(@$para, 2) if @$para > 3;
1581  return;
1582}
1583
1584
1585
1586
1587###########################################################################
1588
1589sub _traverse_treelet_bit {  # for use only by the routine above
1590  my($self, $name) = splice @_,0,2;
1591
1592  my $scratch;
1593  $self->_handle_element_start(($scratch=$name), shift @_);
1594
1595  while (@_) {
1596    my $x = shift;
1597    if (ref($x)) {
1598      &_traverse_treelet_bit($self, @$x);
1599    } else {
1600      $x .= shift while @_ && !ref($_[0]);
1601      $self->_handle_text($x);
1602    }
1603  }
1604
1605  $self->_handle_element_end($scratch=$name);
1606  return;
1607}
1608
1609#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1610
1611sub _closers_for_all_curr_open {
1612  my $self = $_[0];
1613  my @closers;
1614  foreach my $still_open (@{  $self->{'curr_open'} || return  }) {
1615    my @copy = @$still_open;
1616    $copy[1] = {%{ $copy[1] }};
1617    #$copy[1]{'start_line'} = -1;
1618    if($copy[0] eq '=for') {
1619      $copy[0] = '=end';
1620    } elsif($copy[0] eq '=over') {
1621      $self->whine(
1622        $still_open->[1]{start_line} ,
1623        "=over without closing =back"
1624      );
1625
1626      $copy[0] = '=back';
1627    } else {
1628      die "I don't know how to auto-close an open $copy[0] region";
1629    }
1630
1631    unless( @copy > 2 ) {
1632      push @copy, $copy[1]{'target'};
1633      $copy[-1] = '' unless defined $copy[-1];
1634       # since =over's don't have targets
1635    }
1636
1637    $copy[1]{'fake-closer'} = 1;
1638
1639    DEBUG and print STDERR "Queuing up fake-o event: ", pretty(\@copy), "\n";
1640    unshift @closers, \@copy;
1641  }
1642  return @closers;
1643}
1644
1645#--------------------------------------------------------------------------
1646
1647sub _verbatim_format {
1648  my($it, $p) = @_;
1649
1650  my $formatting;
1651
1652  for(my $i = 2; $i < @$p; $i++) { # work backwards over the lines
1653    DEBUG and print STDERR "_verbatim_format appends a newline to $i: $p->[$i]\n";
1654    $p->[$i] .= "\n";
1655     # Unlike with simple Verbatim blocks, we don't end up just doing
1656     # a join("\n", ...) on the contents, so we have to append a
1657     # newline to ever line, and then nix the last one later.
1658  }
1659
1660  if( DEBUG > 4 ) {
1661    print STDERR "<<\n";
1662    for(my $i = $#$p; $i >= 2; $i--) { # work backwards over the lines
1663      print STDERR "_verbatim_format $i: $p->[$i]";
1664    }
1665    print STDERR ">>\n";
1666  }
1667
1668  for(my $i = $#$p; $i > 2; $i--) {
1669    # work backwards over the lines, except the first (#2)
1670
1671    #next unless $p->[$i]   =~ m{^#:([ \^\/\%]*)\n?$}s
1672    #        and $p->[$i-1] !~ m{^#:[ \^\/\%]*\n?$}s;
1673     # look at a formatty line preceding a nonformatty one
1674    DEBUG > 5 and print STDERR "Scrutinizing line $i: $$p[$i]\n";
1675    if($p->[$i]   =~ m{^#:([ \^\/\%]*)\n?$}s) {
1676      DEBUG > 5 and print STDERR "  It's a formatty line.  ",
1677       "Peeking at previous line ", $i-1, ": $$p[$i-1]: \n";
1678
1679      if( $p->[$i-1] =~ m{^#:[ \^\/\%]*\n?$}s ) {
1680        DEBUG > 5 and print STDERR "  Previous line is formatty!  Skipping this one.\n";
1681        next;
1682      } else {
1683        DEBUG > 5 and print STDERR "  Previous line is non-formatty!  Yay!\n";
1684      }
1685    } else {
1686      DEBUG > 5 and print STDERR "  It's not a formatty line.  Ignoring\n";
1687      next;
1688    }
1689
1690    # A formatty line has to have #: in the first two columns, and uses
1691    # "^" to mean bold, "/" to mean underline, and "%" to mean bold italic.
1692    # Example:
1693    #   What do you want?  i like pie. [or whatever]
1694    # #:^^^^^^^^^^^^^^^^^              /////////////
1695
1696
1697    DEBUG > 4 and print STDERR "_verbatim_format considers:\n<$p->[$i-1]>\n<$p->[$i]>\n";
1698
1699    $formatting = '  ' . $1;
1700    $formatting =~ s/\s+$//s; # nix trailing whitespace
1701    unless(length $formatting and $p->[$i-1] =~ m/\S/) { # no-op
1702      splice @$p,$i,1; # remove this line
1703      $i--; # don't consider next line
1704      next;
1705    }
1706
1707    if( length($formatting) >= length($p->[$i-1]) ) {
1708      $formatting = substr($formatting, 0, length($p->[$i-1]) - 1) . ' ';
1709    } else {
1710      $formatting .= ' ' x (length($p->[$i-1]) - length($formatting));
1711    }
1712    # Make $formatting and the previous line be exactly the same length,
1713    # with $formatting having a " " as the last character.
1714
1715    DEBUG > 4 and print STDERR "Formatting <$formatting>    on <", $p->[$i-1], ">\n";
1716
1717
1718    my @new_line;
1719    while( $formatting =~ m{\G(( +)|(\^+)|(\/+)|(\%+))}g ) {
1720      #print STDERR "Format matches $1\n";
1721
1722      if($2) {
1723        #print STDERR "SKIPPING <$2>\n";
1724        push @new_line,
1725          substr($p->[$i-1], pos($formatting)-length($1), length($1));
1726      } else {
1727        #print STDERR "SNARING $+\n";
1728        push @new_line, [
1729          (
1730            $3 ? 'VerbatimB'  :
1731            $4 ? 'VerbatimI'  :
1732            $5 ? 'VerbatimBI' : die("Should never get called")
1733          ), {},
1734          substr($p->[$i-1], pos($formatting)-length($1), length($1))
1735        ];
1736        #print STDERR "Formatting <$new_line[-1][-1]> as $new_line[-1][0]\n";
1737      }
1738    }
1739    my @nixed =
1740      splice @$p, $i-1, 2, @new_line; # replace myself and the next line
1741    DEBUG > 10 and print STDERR "Nixed count: ", scalar(@nixed), "\n";
1742
1743    DEBUG > 6 and print STDERR "New version of the above line is these tokens (",
1744      scalar(@new_line), "):",
1745      map( ref($_)?"<@$_> ":"<$_>", @new_line ), "\n";
1746    $i--; # So the next line we scrutinize is the line before the one
1747          #  that we just went and formatted
1748  }
1749
1750  $p->[0] = 'VerbatimFormatted';
1751
1752  # Collapse adjacent text nodes, just for kicks.
1753  for( my $i = 2; $i > $#$p; $i++ ) { # work forwards over the tokens except for the last
1754    if( !ref($p->[$i]) and !ref($p->[$i + 1]) ) {
1755      DEBUG > 5 and print STDERR "_verbatim_format merges {$p->[$i]} and {$p->[$i+1]}\n";
1756      $p->[$i] .= splice @$p, $i+1, 1; # merge
1757      --$i;  # and back up
1758    }
1759  }
1760
1761  # Now look for the last text token, and remove the terminal newline
1762  for( my $i = $#$p; $i >= 2; $i-- ) {
1763    # work backwards over the tokens, even the first
1764    if( !ref($p->[$i]) ) {
1765      if($p->[$i] =~ s/\n$//s) {
1766        DEBUG > 5 and print STDERR "_verbatim_format killed the terminal newline on #$i: {$p->[$i]}, after {$p->[$i-1]}\n";
1767      } else {
1768        DEBUG > 5 and print STDERR
1769         "No terminal newline on #$i: {$p->[$i]}, after {$p->[$i-1]} !?\n";
1770      }
1771      last; # we only want the next one
1772    }
1773  }
1774
1775  return;
1776}
1777
1778
1779#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1780
1781
1782sub _treelet_from_formatting_codes {
1783  # Given a paragraph, returns a treelet.  Full of scary tokenizing code.
1784  #  Like [ '~Top', {'start_line' => $start_line},
1785  #            "I like ",
1786  #            [ 'B', {}, "pie" ],
1787  #            "!"
1788  #       ]
1789
1790  my($self, $para, $start_line, $preserve_space) = @_;
1791
1792  my $treelet = ['~Top', {'start_line' => $start_line},];
1793
1794  unless ($preserve_space || $self->{'preserve_whitespace'}) {
1795    $para =~ s/\s+/ /g; # collapse and trim all whitespace first.
1796    $para =~ s/ $//;
1797    $para =~ s/^ //;
1798  }
1799
1800  # Only apparent problem the above code is that N<<  >> turns into
1801  # N<< >>.  But then, word wrapping does that too!  So don't do that!
1802
1803  my @stack;
1804  my @lineage = ($treelet);
1805  my $raw = ''; # raw content of L<> fcode before splitting/processing
1806    # XXX 'raw' is not 100% accurate: all surrounding whitespace is condensed
1807    # into just 1 ' '. Is this the regex's doing or 'raw's?
1808  my $inL = 0;
1809
1810  DEBUG > 4 and print STDERR "Paragraph:\n$para\n\n";
1811
1812  # Here begins our frightening tokenizer RE.  The following regex matches
1813  # text in four main parts:
1814  #
1815  #  * Start-codes.  The first alternative matches C< or C<<, the latter
1816  #    followed by some whitespace.  $1 will hold the entire start code
1817  #    (including any space following a multiple-angle-bracket delimiter),
1818  #    and $2 will hold only the additional brackets past the first in a
1819  #    multiple-bracket delimiter.  length($2) + 1 will be the number of
1820  #    closing brackets we have to find.
1821  #
1822  #  * Closing brackets.  Match some amount of whitespace followed by
1823  #    multiple close brackets.  The logic to see if this closes anything
1824  #    is down below.  Note that in order to parse C<<  >> correctly, we
1825  #    have to use look-behind (?<=\s\s), since the match of the starting
1826  #    code will have consumed the whitespace.
1827  #
1828  #  * A single closing bracket, to close a simple code like C<>.
1829  #
1830  #  * Something that isn't a start or end code.  We have to be careful
1831  #    about accepting whitespace, since perlpodspec says that any whitespace
1832  #    before a multiple-bracket closing delimiter should be ignored.
1833  #
1834  while($para =~
1835    m/\G
1836      (?:
1837        # Match starting codes, including the whitespace following a
1838        # multiple-delimiter start code.  $1 gets the whole start code and
1839        # $2 gets all but one of the <s in the multiple-bracket case.
1840        ([A-Z]<(?:(<+)\s+)?)
1841        |
1842        # Match multiple-bracket end codes.  $3 gets the whitespace that
1843        # should be discarded before an end bracket but kept in other cases
1844        # and $4 gets the end brackets themselves.
1845        (\s+|(?<=\s\s))(>{2,})
1846        |
1847        (\s?>)          # $5: simple end-codes
1848        |
1849        (               # $6: stuff containing no start-codes or end-codes
1850          (?:
1851            [^A-Z\s>]
1852            |
1853            (?:
1854              [A-Z](?!<)
1855            )
1856            |
1857            # whitespace is ok, but we don't want to eat the whitespace before
1858            # a multiple-bracket end code.
1859            # NOTE: we may still have problems with e.g. S<<    >>
1860            (?:
1861              \s(?!\s*>{2,})
1862            )
1863          )+
1864        )
1865      )
1866    /xgo
1867  ) {
1868    DEBUG > 4 and print STDERR "\nParagraphic tokenstack = (@stack)\n";
1869    if(defined $1) {
1870      if(defined $2) {
1871        DEBUG > 3 and print STDERR "Found complex start-text code \"$1\"\n";
1872        push @stack, length($2) + 1;
1873          # length of the necessary complex end-code string
1874      } else {
1875        DEBUG > 3 and print STDERR "Found simple start-text code \"$1\"\n";
1876        push @stack, 0;  # signal that we're looking for simple
1877      }
1878      push @lineage, [ substr($1,0,1), {}, ];  # new node object
1879      push @{ $lineage[-2] }, $lineage[-1];
1880      if ('L' eq substr($1,0,1)) {
1881        $raw = $inL ? $raw.$1 : ''; # reset raw content accumulator
1882        $inL = 1;
1883      } else {
1884        $raw .= $1 if $inL;
1885      }
1886
1887    } elsif(defined $4) {
1888      DEBUG > 3 and print STDERR "Found apparent complex end-text code \"$3$4\"\n";
1889      # This is where it gets messy...
1890      if(! @stack) {
1891        # We saw " >>>>" but needed nothing.  This is ALL just stuff then.
1892        DEBUG > 4 and print STDERR " But it's really just stuff.\n";
1893        push @{ $lineage[-1] }, $3, $4;
1894        next;
1895      } elsif(!$stack[-1]) {
1896        # We saw " >>>>" but needed only ">".  Back pos up.
1897        DEBUG > 4 and print STDERR " And that's more than we needed to close simple.\n";
1898        push @{ $lineage[-1] }, $3; # That was a for-real space, too.
1899        pos($para) = pos($para) - length($4) + 1;
1900      } elsif($stack[-1] == length($4)) {
1901        # We found " >>>>", and it was exactly what we needed.  Commonest case.
1902        DEBUG > 4 and print STDERR " And that's exactly what we needed to close complex.\n";
1903      } elsif($stack[-1] < length($4)) {
1904        # We saw " >>>>" but needed only " >>".  Back pos up.
1905        DEBUG > 4 and print STDERR " And that's more than we needed to close complex.\n";
1906        pos($para) = pos($para) - length($4) + $stack[-1];
1907      } else {
1908        # We saw " >>>>" but needed " >>>>>>".  So this is all just stuff!
1909        DEBUG > 4 and print STDERR " But it's really just stuff, because we needed more.\n";
1910        push @{ $lineage[-1] }, $3, $4;
1911        next;
1912      }
1913      #print STDERR "\nHOOBOY ", scalar(@{$lineage[-1]}), "!!!\n";
1914
1915      push @{ $lineage[-1] }, '' if 2 == @{ $lineage[-1] };
1916      # Keep the element from being childless
1917
1918      pop @stack;
1919      pop @lineage;
1920
1921      unless (@stack) { # not in an L if there are no open fcodes
1922        $inL = 0;
1923        if (ref $lineage[-1][-1] && $lineage[-1][-1][0] eq 'L') {
1924          $lineage[-1][-1][1]{'raw'} = $raw
1925        }
1926      }
1927      $raw .= $3.$4 if $inL;
1928
1929    } elsif(defined $5) {
1930      DEBUG > 3 and print STDERR "Found apparent simple end-text code \"$5\"\n";
1931
1932      if(@stack and ! $stack[-1]) {
1933        # We're indeed expecting a simple end-code
1934        DEBUG > 4 and print STDERR " It's indeed an end-code.\n";
1935
1936        if(length($5) == 2) { # There was a space there: " >"
1937          push @{ $lineage[-1] }, ' ';
1938        } elsif( 2 == @{ $lineage[-1] } ) { # Closing a childless element
1939          push @{ $lineage[-1] }, ''; # keep it from being really childless
1940        }
1941
1942        pop @stack;
1943        pop @lineage;
1944      } else {
1945        DEBUG > 4 and print STDERR " It's just stuff.\n";
1946        push @{ $lineage[-1] }, $5;
1947      }
1948
1949      unless (@stack) { # not in an L if there are no open fcodes
1950        $inL = 0;
1951        if (ref $lineage[-1][-1] && $lineage[-1][-1][0] eq 'L') {
1952          $lineage[-1][-1][1]{'raw'} = $raw
1953        }
1954      }
1955      $raw .= $5 if $inL;
1956
1957    } elsif(defined $6) {
1958      DEBUG > 3 and print STDERR "Found stuff \"$6\"\n";
1959      push @{ $lineage[-1] }, $6;
1960      $raw .= $6 if $inL;
1961        # XXX does not capture multiplace whitespaces -- 'raw' ends up with
1962        #     at most 1 leading/trailing whitespace, why not all of it?
1963
1964    } else {
1965      # should never ever ever ever happen
1966      DEBUG and print STDERR "AYYAYAAAAA at line ", __LINE__, "\n";
1967      die "SPORK 512512!";
1968    }
1969  }
1970
1971  if(@stack) { # Uhoh, some sequences weren't closed.
1972    my $x= "...";
1973    while(@stack) {
1974      push @{ $lineage[-1] }, '' if 2 == @{ $lineage[-1] };
1975      # Hmmmmm!
1976
1977      my $code         = (pop @lineage)->[0];
1978      my $ender_length =  pop @stack;
1979      if($ender_length) {
1980        --$ender_length;
1981        $x = $code . ("<" x $ender_length) . " $x " . (">" x $ender_length);
1982      } else {
1983        $x = $code . "<$x>";
1984      }
1985    }
1986    DEBUG > 1 and print STDERR "Unterminated $x sequence\n";
1987    $self->whine($start_line,
1988      "Unterminated $x sequence",
1989    );
1990  }
1991
1992  return $treelet;
1993}
1994
1995#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1996
1997sub text_content_of_treelet {  # method: $parser->text_content_of_treelet($lol)
1998  return stringify_lol($_[1]);
1999}
2000
2001sub stringify_lol {  # function: stringify_lol($lol)
2002  my $string_form = '';
2003  _stringify_lol( $_[0] => \$string_form );
2004  return $string_form;
2005}
2006
2007sub _stringify_lol {  # the real recursor
2008  my($lol, $to) = @_;
2009  for(my $i = 2; $i < @$lol; ++$i) {
2010    if( ref($lol->[$i] || '') and UNIVERSAL::isa($lol->[$i], 'ARRAY') ) {
2011      _stringify_lol( $lol->[$i], $to);  # recurse!
2012    } else {
2013      $$to .= $lol->[$i];
2014    }
2015  }
2016  return;
2017}
2018
2019#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2020
2021sub _dump_curr_open { # return a string representation of the stack
2022  my $curr_open = $_[0]{'curr_open'};
2023
2024  return '[empty]' unless @$curr_open;
2025  return join '; ',
2026    map {;
2027           ($_->[0] eq '=for')
2028             ? ( ($_->[1]{'~really'} || '=over')
2029               . ' ' . $_->[1]{'target'})
2030             : $_->[0]
2031        }
2032    @$curr_open
2033  ;
2034}
2035
2036###########################################################################
2037my %pretty_form = (
2038  "\a" => '\a', # ding!
2039  "\b" => '\b', # BS
2040  "\e" => '\e', # ESC
2041  "\f" => '\f', # FF
2042  "\t" => '\t', # tab
2043  "\cm" => '\cm',
2044  "\cj" => '\cj',
2045  "\n" => '\n', # probably overrides one of either \cm or \cj
2046  '"' => '\"',
2047  '\\' => '\\\\',
2048  '$' => '\\$',
2049  '@' => '\\@',
2050  '%' => '\\%',
2051  '#' => '\\#',
2052);
2053
2054sub pretty { # adopted from Class::Classless
2055  # Not the most brilliant routine, but passable.
2056  # Don't give it a cyclic data structure!
2057  my @stuff = @_; # copy
2058  my $x;
2059  my $out =
2060    # join ",\n" .
2061    join ", ",
2062    map {;
2063    if(!defined($_)) {
2064      "undef";
2065    } elsif(ref($_) eq 'ARRAY' or ref($_) eq 'Pod::Simple::LinkSection') {
2066      $x = "[ " . pretty(@$_) . " ]" ;
2067      $x;
2068    } elsif(ref($_) eq 'SCALAR') {
2069      $x = "\\" . pretty($$_) ;
2070      $x;
2071    } elsif(ref($_) eq 'HASH') {
2072      my $hr = $_;
2073      $x = "{" . join(", ",
2074        map(pretty($_) . '=>' . pretty($hr->{$_}),
2075            sort keys %$hr ) ) . "}" ;
2076      $x;
2077    } elsif(!length($_)) { q{''} # empty string
2078    } elsif(
2079      $_ eq '0' # very common case
2080      or(
2081         m/^-?(?:[123456789]\d*|0)(?:\.\d+)?$/s
2082         and $_ ne '-0' # the strange case that RE lets thru
2083      )
2084    ) { $_;
2085    } else {
2086        # Yes, explicitly name every character desired. There are shorcuts one
2087        # could make, but I (Karl Williamson) was afraid that some Perl
2088        # releases would have bugs in some of them. For example [A-Z] works
2089        # even on EBCDIC platforms to match exactly the 26 uppercase English
2090        # letters, but I don't know if it has always worked without bugs. It
2091        # seemed safest just to list the characters.
2092        # s<([^\x20\x21\x23\x27-\x3F\x41-\x5B\x5D-\x7E])>
2093        s<([^ !#'()*+,\-./0123456789:;\<=\>?ABCDEFGHIJKLMNOPQRSTUVWXYZ\[\]^_`abcdefghijklmnopqrstuvwxyz{|}~])>
2094         <$pretty_form{$1} || '\\x{'.sprintf("%x", ord($1)).'}'>eg;
2095         #<$pretty_form{$1} || '\\x'.(unpack("H2",$1))>eg;
2096      qq{"$_"};
2097    }
2098  } @stuff;
2099  # $out =~ s/\n */ /g if length($out) < 75;
2100  return $out;
2101}
2102
2103#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
2104
2105# A rather unsubtle method of blowing away all the state information
2106# from a parser object so it can be reused. Provided as a utility for
2107# backward compatibility in Pod::Man, etc. but not recommended for
2108# general use.
2109
2110sub reinit {
2111  my $self = shift;
2112  foreach (qw(source_dead source_filename doc_has_started
2113start_of_pod_block content_seen last_was_blank paras curr_open
2114line_count pod_para_count in_pod ~tried_gen_errata all_errata errata errors_seen
2115Title)) {
2116
2117    delete $self->{$_};
2118  }
2119}
2120
2121#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
21221;
2123
2124