lib/Text/Wrap.pm

use strict;
use warnings;

package Text::Wrap;

use warnings::register;

# Reuse Exporter's import() directly instead of inheriting from Exporter.
BEGIN {
    require Exporter;
    *import = \&Exporter::import;
}

our @EXPORT    = qw( wrap fill );
our @EXPORT_OK = qw( $columns $break $huge );

our $VERSION    = '2024.001';
our $SUBVERSION = 'modern';    # back-compat vestige

# Compile-time constant: true when /\xc2/ matches a U+0080 character built
# with pack('U*'), i.e. when the regexp engine looks at the UTF-8 bytes.
BEGIN {
    eval sprintf 'sub REGEXPS_USE_BYTES () { %d }',
        scalar( pack( 'U*', 0x80 ) =~ /\xc2/ );
}

# Whitespace class used for line breaks.  When \s matches U+00A0 / U+202F,
# those two characters are excluded from the class.
my $brkspc = "\x{a0}\x{202f}" =~ /\s/
    ? '[^\x{a0}\x{202f}\S]'
    : '\s';

our $columns = 76;    # <= screen width

# Default break pattern: a newline, CR-LF, or one break-space plus any
# combining marks that follow it.
our $break = '(?>\n|\r\n|' . $brkspc . '\pM*)';

our $huge       = 'wrap';    # alternatively: 'die' or 'overflow'
our $unexpand   = 1;         # true: turn spaces back into tabs on output
our $tabstop    = 8;         # copied into $Text::Tabs::tabstop by wrap()
our $separator  = "\n";      # placed between output lines
our $separator2 = undef;     # when defined, used instead of $separator
b39c5158Smillert
# _xlen($string)
#
# Returns the number of positions $string occupies: one per character that
# is not a combining mark (\PM), plus one more if the string begins with a
# combining mark.
sub _xlen {
    my $starts_with_mark = $_[0] =~ /^\pM/;
    my $non_mark_count   = () = $_[0] =~ /\PM/g;
    return $starts_with_mark + $non_mark_count;
}
91f110e0Safresh1
b39c5158Smillertuse Text::Tabs qw(expand unexpand);
b39c5158Smillert
# wrap($initial_tab, $subsequent_tab, @text)
#
# Formats @text as a single paragraph and returns it as one string.
#
# The text pieces are joined (a space is added after every piece except the
# last, unless the piece already ends in whitespace), tabs are expanded, and
# the result is cut into lines at matches of $break.  The first line is
# prefixed with $initial_tab and every later line with $subsequent_tab.
# Lines are joined with $separator, or, when $separator2 is defined, with
# $separator2 except where the consumed break was exactly "\n", which is
# kept as "\n".  Widths are measured with _xlen(), so combining marks do not
# count, and a line (indent included) occupies at most $columns - 1 positions.
#
# A word that does not fit on a line is handled according to $huge:
# 'wrap' splits it, 'overflow' lets the line run long, 'die' dies.
#
# Undefined arguments are treated as empty strings; missing arguments
# (fewer than two, or no text at all) are treated the same way.
#
# Side effects: $columns is increased (with a warning in the
# Text::Wrap category) when $subsequent_tab leaves no room for text, and
# set to 2 when it is below 2 and nothing could be laid out.
sub wrap
{
	my ($ip, $xp, @t) = map +( defined $_ ? $_ : '' ), @_;

	# The map above only covers arguments that were actually passed; make
	# sure a short argument list does not leave the indents undefined.
	$ip = '' unless defined $ip;
	$xp = '' unless defined $xp;

	local($Text::Tabs::tabstop) = $tabstop;
	my $r = "";

	# pop() on an empty list yields undef, which would trigger an
	# "uninitialized value" warning in the join below.
	my $tail = @t ? pop(@t) : '';
	my $t = expand(join("", (map { /\s+\z/ ? ( $_ ) : ($_, ' ') } @t), $tail));
	my $lead = $ip;

	# $nll: room for text on lines after the first.
	my $xp_width = _xlen(expand($xp));
	my $nll = $columns - $xp_width - 1;
	if ($nll <= 0 && $xp ne '') {
		my $nc = $xp_width + 2;
		warnings::warnif "Increasing \$Text::Wrap::columns from $columns to $nc to accommodate length of subsequent tab";
		$columns = $nc;
		$nll = 1;
	}

	# $ll: room for text on the current line (initially the first line).
	my $ll = $columns - _xlen(expand($ip)) - 1;
	$ll = 0 if $ll < 0;
	my $nl = "";
	my $remainder = "";

	use re 'taint';

	pos($t) = 0;
	# Loop until only break characters (or nothing) remain after pos($t).
	while ($t !~ /\G(?:$break)*\Z/gc) {
		my $chunk;
		if ($t =~ /\G((?>(?!\n)\PM\pM*|(?<![^\n])\pM+){0,$ll})($break|\n+|\z)/xmgc) {
			# Up to $ll positions followed by a break, newlines or the end.
			($chunk, $remainder) = ($1, $2);
		} elsif ($huge eq 'wrap' && $t =~ /\G((?>(?!\n)\PM\pM*|(?<![^\n])\pM+){$ll})/gc) {
			# No break within reach: cut the word after exactly $ll positions.
			$chunk = $1;
			$remainder = defined($separator2) ? $separator2 : $separator;
		} elsif ($huge eq 'overflow' && $t =~ /\G([^\n]*?)(?!(?<![^\n])\pM)($break|\n+|\z)/xmgc) {
			# Let the line run on to the next break, however far that is.
			($chunk, $remainder) = ($1, $2);
		} elsif ($huge eq 'die') {
			die "couldn't wrap '$t'";
		} elsif ($columns < 2) {
			# Nothing matched and $columns is below 2: widen $columns and
			# hand the arguments back unchanged.
			warnings::warnif "Increasing \$Text::Wrap::columns from $columns to 2";
			$columns = 2;
			return @_;
		} else {
			die "This shouldn't happen";
		}

		# Emit the separator owed from the previous line, the indent and
		# the text just matched.
		$r .= $unexpand
			? unexpand($nl . $lead . $chunk)
			: $nl . $lead . $chunk;

		# Every line after the first uses the subsequent indent and width.
		$lead = $xp;
		$ll = $nll;
		$nl = defined($separator2)
			? ($remainder eq "\n"
				? "\n"
				: $separator2)
			: $separator;
	}
	$r .= $remainder;

	# Append whatever the final loop test left unconsumed.
	$r .= $lead . substr($t, pos($t), length($t) - pos($t))
		if pos($t) != length($t);

	# the 5.6 regexp engine ignores the UTF8 flag, so using capture buffers acts as an implicit _utf8_off
	# that means on 5.6 we now have to manually set UTF8=on on the output if the input had it, for which
	# we extract just the UTF8 flag from the input and check if it forces chr(0x80) to become multibyte
	return REGEXPS_USE_BYTES && (substr($t,0,0)."\x80") =~ /\xc2/ ? pack('U0a*', $r) : $r;
}
b39c5158Smillert
# fill($initial_tab, $subsequent_tab, @text)
#
# Multi-paragraph front end to wrap().  The text pieces are joined with
# newlines and split into paragraphs wherever a newline is followed by
# whitespace.  Inside each paragraph every run of whitespace is reduced to
# a single space before the paragraph is passed to wrap() with the same two
# indent strings.  Undefined arguments are treated as empty strings.
#
# Returns the wrapped paragraphs joined into one string: by a blank line
# when both indents are equal, by a single newline otherwise.
sub fill
{
	my ($ip, $xp, @raw) = map +( defined $_ ? $_ : '' ), @_;

	my @para;
	foreach my $paragraph (split(/\n\s+/, join("\n", @raw))) {
		(my $squeezed = $paragraph) =~ s/\s+/ /g;
		# scalar(): wrap() must be called in scalar context here.
		push @para, scalar(wrap($ip, $xp, $squeezed));
	}

	# Equal indents give no visual cue for a new paragraph, so a blank
	# line is used between paragraphs in that case.
	return join(($ip eq $xp ? "\n\n" : "\n"), @para);
}
b39c5158Smillert
b39c5158Smillert1;
eac174f2Safresh1
b39c5158Smillert__END__
b39c5158Smillert
b39c5158Smillert=head1 NAME
b39c5158Smillert
b39c5158SmillertText::Wrap - line wrapping to form simple paragraphs
b39c5158Smillert
b39c5158Smillert=head1 SYNOPSIS
b39c5158Smillert
b39c5158SmillertB<Example 1>
b39c5158Smillert
b39c5158Smillert	use Text::Wrap;
b39c5158Smillert
b39c5158Smillert	$initial_tab = "\t";	# Tab before first line
b39c5158Smillert	$subsequent_tab = "";	# All other lines flush left
b39c5158Smillert
b39c5158Smillert	print wrap($initial_tab, $subsequent_tab, @text);
b39c5158Smillert	print fill($initial_tab, $subsequent_tab, @text);
b39c5158Smillert
b39c5158Smillert	$lines = wrap($initial_tab, $subsequent_tab, @text);
b39c5158Smillert
b39c5158Smillert	@paragraphs = fill($initial_tab, $subsequent_tab, @text);
b39c5158Smillert
b39c5158SmillertB<Example 2>
b39c5158Smillert
b39c5158Smillert	use Text::Wrap qw(wrap $columns $huge);
b39c5158Smillert
b39c5158Smillert	$columns = 132;		# Wrap at 132 characters
b39c5158Smillert	$huge = 'die';
b39c5158Smillert	$huge = 'wrap';
b39c5158Smillert	$huge = 'overflow';
b39c5158Smillert
b39c5158SmillertB<Example 3>
b39c5158Smillert
b39c5158Smillert	use Text::Wrap;
b39c5158Smillert
b39c5158Smillert	$Text::Wrap::columns = 72;
b39c5158Smillert	print wrap('', '', @text);
b39c5158Smillert
b39c5158Smillert=head1 DESCRIPTION
b39c5158Smillert
b39c5158SmillertC<Text::Wrap::wrap()> is a very simple paragraph formatter.  It formats a
b39c5158Smillertsingle paragraph at a time by breaking lines at word boundaries.
b39c5158SmillertIndentation is controlled for the first line (C<$initial_tab>) and
b39c5158Smillertall subsequent lines (C<$subsequent_tab>) independently.  Please note:
b39c5158SmillertC<$initial_tab> and C<$subsequent_tab> are the literal strings that will
b39c5158Smillertbe used: it is unlikely you would want to pass in a number.
b39c5158Smillert
91f110e0Safresh1C<Text::Wrap::fill()> is a simple multi-paragraph formatter.  It formats
b39c5158Smillerteach paragraph separately and then joins them together when it's done.  It
b39c5158Smillertwill destroy any whitespace in the original text.  It breaks text into
91f110e0Safresh1paragraphs by looking for whitespace after a newline.  In other respects,
b39c5158Smillertit acts like wrap().
b39c5158Smillert
91f110e0Safresh1C<wrap()> compresses trailing whitespace into one newline, and C<fill()>
91f110e0Safresh1deletes all trailing whitespace.
91f110e0Safresh1
b39c5158SmillertBoth C<wrap()> and C<fill()> return a single string.
b39c5158Smillert
91f110e0Safresh1Unlike the old Unix fmt(1) utility, this module correctly accounts for
91f110e0Safresh1any Unicode combining characters (such as diacriticals) that may occur
91f110e0Safresh1in each line for both expansion and unexpansion.  These are overstrike
91f110e0Safresh1characters that do not increment the logical position.  Make sure
91f110e0Safresh1you have the appropriate Unicode settings enabled.
91f110e0Safresh1
b39c5158Smillert=head1 OVERRIDES
b39c5158Smillert
b39c5158SmillertC<Text::Wrap::wrap()> has a number of variables that control its behavior.
b39c5158SmillertBecause other modules might be using C<Text::Wrap::wrap()> it is suggested
b39c5158Smillertthat you leave these variables alone!  If you can't do that, then
b39c5158Smillertuse C<local($Text::Wrap::VARIABLE) = YOURVALUE> when you change the
b39c5158Smillertvalues so that the original value is restored.  This C<local()> trick
b39c5158Smillertwill not work if you import the variable into your own namespace.
b39c5158Smillert
b39c5158SmillertLines are wrapped at C<$Text::Wrap::columns> columns (default value: 76).
b39c5158SmillertC<$Text::Wrap::columns> should be set to the full width of your output
b39c5158Smillertdevice.  In fact, every resulting line will have length of no more than
b39c5158SmillertC<$columns - 1>.
b39c5158Smillert
It is possible to control which characters terminate words by
modifying C<$Text::Wrap::break>. Set this to a string such as
C<'[\s:]'> (to break before spaces or colons) or a pre-compiled regexp
such as C<qr/[\s']/> (to break before spaces or apostrophes). The
default matches a newline or a single whitespace character (other than
a no-break space) together with any combining marks that follow it;
that is, words are terminated by spaces.
(This means, among other things, that trailing punctuation such as
full stops or commas stay with the word they are "attached" to.)
Setting C<$Text::Wrap::break> to a regular expression that doesn't
eat any characters (perhaps just a forward look-ahead assertion) will
cause warnings.
b39c5158Smillert
Beginner note: In example 2 above, C<$columns> is imported into
the local namespace, and set locally.  In example 3,
C<$Text::Wrap::columns> is set in its own namespace without importing it.
b39c5158Smillert
C<Text::Wrap::wrap()> starts its work by expanding all the tabs in its
input into spaces.  The last thing it does is to turn spaces back
into tabs.  If you do not want tabs in your results, set
C<$Text::Wrap::unexpand> to a false value.  Likewise if you do not
want to use 8-character tabstops, set C<$Text::Wrap::tabstop> to
the number of characters you do want for your tabstops.
b39c5158Smillert
b39c5158SmillertIf you want to separate your lines with something other than C<\n>
b39c5158Smillertthen set C<$Text::Wrap::separator> to your preference.  This replaces
b39c5158Smillertall newlines with C<$Text::Wrap::separator>.  If you just want to
b39c5158Smillertpreserve existing newlines but add new breaks with something else, set
b39c5158SmillertC<$Text::Wrap::separator2> instead.
b39c5158Smillert
b39c5158SmillertWhen words that are longer than C<$columns> are encountered, they
b39c5158Smillertare broken up.  C<wrap()> adds a C<"\n"> at column C<$columns>.
b39c5158SmillertThis behavior can be overridden by setting C<$huge> to
b39c5158Smillert'die' or to 'overflow'.  When set to 'die', large words will cause
b39c5158SmillertC<die()> to be called.  When set to 'overflow', large words will be
b39c5158Smillertleft intact.
b39c5158Smillert
b39c5158SmillertHistorical notes: 'die' used to be the default value of
b39c5158SmillertC<$huge>.  Now, 'wrap' is the default value.
b39c5158Smillert
b39c5158Smillert=head1 EXAMPLES
b39c5158Smillert
b39c5158SmillertCode:
b39c5158Smillert
b39c5158Smillert  print wrap("\t","",<<END);
b39c5158Smillert  This is a bit of text that forms
b39c5158Smillert  a normal book-style indented paragraph
b39c5158Smillert  END
b39c5158Smillert
b39c5158SmillertResult:
b39c5158Smillert
b39c5158Smillert  "	This is a bit of text that forms
b39c5158Smillert  a normal book-style indented paragraph
b39c5158Smillert  "
b39c5158Smillert
b39c5158SmillertCode:
b39c5158Smillert
b39c5158Smillert  $Text::Wrap::columns=20;
b39c5158Smillert  $Text::Wrap::separator="|";
b39c5158Smillert  print wrap("","","This is a bit of text that forms a normal book-style paragraph");
b39c5158Smillert
b39c5158SmillertResult:
b39c5158Smillert
b39c5158Smillert  "This is a bit of|text that forms a|normal book-style|paragraph"
b39c5158Smillert
b39c5158Smillert=head1 SEE ALSO
b39c5158Smillert
91f110e0Safresh1For correct handling of East Asian half- and full-width characters,
91f110e0Safresh1see L<Text::WrapI18N>.  For more detailed controls: L<Text::Format>.
91f110e0Safresh1
91f110e0Safresh1=head1 AUTHOR
91f110e0Safresh1
91f110e0Safresh1David Muir Sharnoff <cpan@dave.sharnoff.org> with help from Tim Pierce and
91f110e0Safresh1many many others.
b39c5158Smillert
b39c5158Smillert=head1 LICENSE
b39c5158Smillert
91f110e0Safresh1Copyright (C) 1996-2009 David Muir Sharnoff.
e9ce3842Safresh1Copyright (C) 2012-2013 Google, Inc.
91f110e0Safresh1This module may be modified, used, copied, and redistributed at your own risk.
e9ce3842Safresh1Although allowed by the preceding license, please do not publicly
e9ce3842Safresh1redistribute modified versions of this code with the name "Text::Wrap"
e9ce3842Safresh1unless it passes the unmodified Text::Wrap test suite.