xref: /openbsd-src/gnu/usr.bin/perl/cpan/Text-ParseWords/lib/Text/ParseWords.pm (revision eac174f2741a08d8deb8aae59a7f778ef9b5d770)
1b39c5158Smillertpackage Text::ParseWords;
2b39c5158Smillert
3b39c5158Smillertuse strict;
4*eac174f2Safresh1use warnings;
5b39c5158Smillertrequire 5.006;
6*eac174f2Safresh1our $VERSION = "3.31";
7b39c5158Smillert
8b39c5158Smillert
9b39c5158Smillertuse Exporter;
10b39c5158Smillertour @ISA = qw(Exporter);
11b39c5158Smillertour @EXPORT = qw(shellwords quotewords nested_quotewords parse_line);
12b39c5158Smillertour @EXPORT_OK = qw(old_shellwords);
13b39c5158Smillertour $PERL_SINGLE_QUOTE;
14b39c5158Smillert
15b39c5158Smillert
16b39c5158Smillertsub shellwords {
17b39c5158Smillert    my (@lines) = @_;
18b39c5158Smillert    my @allwords;
19b39c5158Smillert
20b39c5158Smillert    foreach my $line (@lines) {
21b39c5158Smillert	$line =~ s/^\s+//;
22b39c5158Smillert	my @words = parse_line('\s+', 0, $line);
23b39c5158Smillert	pop @words if (@words and !defined $words[-1]);
24b39c5158Smillert	return() unless (@words || !length($line));
25b39c5158Smillert	push(@allwords, @words);
26b39c5158Smillert    }
27b39c5158Smillert    return(@allwords);
28b39c5158Smillert}
29b39c5158Smillert
30b39c5158Smillert
31b39c5158Smillert
32b39c5158Smillertsub quotewords {
33b39c5158Smillert    my($delim, $keep, @lines) = @_;
34b39c5158Smillert    my($line, @words, @allwords);
35b39c5158Smillert
36b39c5158Smillert    foreach $line (@lines) {
37b39c5158Smillert	@words = parse_line($delim, $keep, $line);
38b39c5158Smillert	return() unless (@words || !length($line));
39b39c5158Smillert	push(@allwords, @words);
40b39c5158Smillert    }
41b39c5158Smillert    return(@allwords);
42b39c5158Smillert}
43b39c5158Smillert
44b39c5158Smillert
45b39c5158Smillert
46b39c5158Smillertsub nested_quotewords {
47b39c5158Smillert    my($delim, $keep, @lines) = @_;
48b39c5158Smillert    my($i, @allwords);
49b39c5158Smillert
50b39c5158Smillert    for ($i = 0; $i < @lines; $i++) {
51b39c5158Smillert	@{$allwords[$i]} = parse_line($delim, $keep, $lines[$i]);
52b39c5158Smillert	return() unless (@{$allwords[$i]} || !length($lines[$i]));
53b39c5158Smillert    }
54b39c5158Smillert    return(@allwords);
55b39c5158Smillert}
56b39c5158Smillert
57b39c5158Smillert
58b39c5158Smillert
59b39c5158Smillertsub parse_line {
60b39c5158Smillert    my($delimiter, $keep, $line) = @_;
61b39c5158Smillert    my($word, @pieces);
62b39c5158Smillert
63b39c5158Smillert    no warnings 'uninitialized';	# we will be testing undef strings
64b39c5158Smillert
65b39c5158Smillert    while (length($line)) {
66b39c5158Smillert        # This pattern is optimised to be stack conservative on older perls.
67b39c5158Smillert        # Do not refactor without being careful and testing it on very long strings.
68b39c5158Smillert        # See Perl bug #42980 for an example of a stack busting input.
69b39c5158Smillert        $line =~ s/^
70b39c5158Smillert                    (?:
71b39c5158Smillert                        # double quoted string
72b39c5158Smillert                        (")                             # $quote
73b39c5158Smillert                        ((?>[^\\"]*(?:\\.[^\\"]*)*))"   # $quoted
74b39c5158Smillert		    |	# --OR--
75b39c5158Smillert                        # singe quoted string
76b39c5158Smillert                        (')                             # $quote
77b39c5158Smillert                        ((?>[^\\']*(?:\\.[^\\']*)*))'   # $quoted
78b39c5158Smillert                    |   # --OR--
79b39c5158Smillert                        # unquoted string
80b39c5158Smillert		        (                               # $unquoted
81b39c5158Smillert                            (?:\\.|[^\\"'])*?
82b39c5158Smillert                        )
83b39c5158Smillert                        # followed by
84b39c5158Smillert		        (                               # $delim
85b39c5158Smillert                            \Z(?!\n)                    # EOL
86b39c5158Smillert                        |   # --OR--
87b39c5158Smillert                            (?-x:$delimiter)            # delimiter
88b39c5158Smillert                        |   # --OR--
89b39c5158Smillert                            (?!^)(?=["'])               # a quote
90b39c5158Smillert                        )
91b39c5158Smillert		    )//xs or return;		# extended layout
92b39c5158Smillert        my ($quote, $quoted, $unquoted, $delim) = (($1 ? ($1,$2) : ($3,$4)), $5, $6);
93b39c5158Smillert
94b39c5158Smillert
95b39c5158Smillert	return() unless( defined($quote) || length($unquoted) || length($delim));
96b39c5158Smillert
97b39c5158Smillert        if ($keep) {
98b39c5158Smillert	    $quoted = "$quote$quoted$quote";
99b39c5158Smillert	}
100b39c5158Smillert        else {
101b39c5158Smillert	    $unquoted =~ s/\\(.)/$1/sg;
102b39c5158Smillert	    if (defined $quote) {
103b39c5158Smillert		$quoted =~ s/\\(.)/$1/sg if ($quote eq '"');
104b39c5158Smillert		$quoted =~ s/\\([\\'])/$1/g if ( $PERL_SINGLE_QUOTE && $quote eq "'");
105b39c5158Smillert            }
106b39c5158Smillert	}
107b39c5158Smillert        $word .= substr($line, 0, 0);	# leave results tainted
108b39c5158Smillert        $word .= defined $quote ? $quoted : $unquoted;
109b39c5158Smillert
110b39c5158Smillert        if (length($delim)) {
111b39c5158Smillert            push(@pieces, $word);
112b39c5158Smillert            push(@pieces, $delim) if ($keep eq 'delimiters');
113b39c5158Smillert            undef $word;
114b39c5158Smillert        }
115b39c5158Smillert        if (!length($line)) {
116b39c5158Smillert            push(@pieces, $word);
117b39c5158Smillert	}
118b39c5158Smillert    }
119b39c5158Smillert    return(@pieces);
120b39c5158Smillert}
121b39c5158Smillert
122b39c5158Smillert
123b39c5158Smillert
124b39c5158Smillertsub old_shellwords {
125b39c5158Smillert
126b39c5158Smillert    # Usage:
127b39c5158Smillert    #	use ParseWords;
128b39c5158Smillert    #	@words = old_shellwords($line);
129b39c5158Smillert    #	or
130b39c5158Smillert    #	@words = old_shellwords(@lines);
131b39c5158Smillert    #	or
132b39c5158Smillert    #	@words = old_shellwords();	# defaults to $_ (and clobbers it)
133b39c5158Smillert
134b39c5158Smillert    no warnings 'uninitialized';	# we will be testing undef strings
135b39c5158Smillert    local *_ = \join('', @_) if @_;
136b39c5158Smillert    my (@words, $snippet);
137b39c5158Smillert
138b39c5158Smillert    s/\A\s+//;
139b39c5158Smillert    while ($_ ne '') {
140b39c5158Smillert	my $field = substr($_, 0, 0);	# leave results tainted
141b39c5158Smillert	for (;;) {
142b39c5158Smillert	    if (s/\A"(([^"\\]|\\.)*)"//s) {
143b39c5158Smillert		($snippet = $1) =~ s#\\(.)#$1#sg;
144b39c5158Smillert	    }
145b39c5158Smillert	    elsif (/\A"/) {
146b39c5158Smillert		require Carp;
147b39c5158Smillert		Carp::carp("Unmatched double quote: $_");
148b39c5158Smillert		return();
149b39c5158Smillert	    }
150b39c5158Smillert	    elsif (s/\A'(([^'\\]|\\.)*)'//s) {
151b39c5158Smillert		($snippet = $1) =~ s#\\(.)#$1#sg;
152b39c5158Smillert	    }
153b39c5158Smillert	    elsif (/\A'/) {
154b39c5158Smillert		require Carp;
155b39c5158Smillert		Carp::carp("Unmatched single quote: $_");
156b39c5158Smillert		return();
157b39c5158Smillert	    }
158b39c5158Smillert	    elsif (s/\A\\(.?)//s) {
159b39c5158Smillert		$snippet = $1;
160b39c5158Smillert	    }
161b39c5158Smillert	    elsif (s/\A([^\s\\'"]+)//) {
162b39c5158Smillert		$snippet = $1;
163b39c5158Smillert	    }
164b39c5158Smillert	    else {
165b39c5158Smillert		s/\A\s+//;
166b39c5158Smillert		last;
167b39c5158Smillert	    }
168b39c5158Smillert	    $field .= $snippet;
169b39c5158Smillert	}
170b39c5158Smillert	push(@words, $field);
171b39c5158Smillert    }
172b39c5158Smillert    return @words;
173b39c5158Smillert}
174b39c5158Smillert
175b39c5158Smillert1;
176b39c5158Smillert
177b39c5158Smillert__END__
178b39c5158Smillert
179b39c5158Smillert=head1 NAME
180b39c5158Smillert
181b39c5158SmillertText::ParseWords - parse text into an array of tokens or array of arrays
182b39c5158Smillert
183b39c5158Smillert=head1 SYNOPSIS
184b39c5158Smillert
185b39c5158Smillert  use Text::ParseWords;
186b39c5158Smillert  @lists = nested_quotewords($delim, $keep, @lines);
187b39c5158Smillert  @words = quotewords($delim, $keep, @lines);
188b39c5158Smillert  @words = shellwords(@lines);
189b39c5158Smillert  @words = parse_line($delim, $keep, $line);
190b39c5158Smillert  @words = old_shellwords(@lines); # DEPRECATED!
191b39c5158Smillert
192b39c5158Smillert=head1 DESCRIPTION
193b39c5158Smillert
194*eac174f2Safresh1The C<nested_quotewords()> and C<quotewords()> functions accept a delimiter
195b39c5158Smillert(which can be a regular expression)
196b39c5158Smillertand a list of lines and then breaks those lines up into a list of
197*eac174f2Safresh1words ignoring delimiters that appear inside quotes.  C<quotewords()>
198*eac174f2Safresh1returns all of the tokens in a single long list, while C<nested_quotewords()>
199*eac174f2Safresh1returns a list of token lists corresponding to the elements of C<@lines>.
200*eac174f2Safresh1C<parse_line()> does tokenizing on a single string.  The C<*quotewords()>
201*eac174f2Safresh1functions simply call C<parse_line()>, so if you're only splitting
202*eac174f2Safresh1one line you can call C<parse_line()> directly and save a function
203b39c5158Smillertcall.
204b39c5158Smillert
205*eac174f2Safresh1The C<$keep> controls what happens with delimters and special characters:
206*eac174f2Safresh1
207*eac174f2Safresh1=over 4
208*eac174f2Safresh1
209*eac174f2Safresh1=item true
210*eac174f2Safresh1
211*eac174f2Safresh1If true, then the tokens are split on the specified delimiter,
212*eac174f2Safresh1but all other characters (including quotes and backslashes)
213*eac174f2Safresh1are kept in the tokens.
214*eac174f2Safresh1
215*eac174f2Safresh1=item false
216*eac174f2Safresh1
217*eac174f2Safresh1If $keep is false then the C<*quotewords()> functions
218*eac174f2Safresh1remove all quotes and backslashes that are
219b39c5158Smillertnot themselves backslash-escaped or inside of single quotes (i.e.,
220*eac174f2Safresh1C<quotewords()> tries to interpret these characters just like the Bourne
221b39c5158Smillertshell).  NB: these semantics are significantly different from the
222b39c5158Smillertoriginal version of this module shipped with Perl 5.000 through 5.004.
223*eac174f2Safresh1
224*eac174f2Safresh1=item C<"delimiters">
225*eac174f2Safresh1
226b39c5158SmillertAs an additional feature, $keep may be the keyword "delimiters" which
227b39c5158Smillertcauses the functions to preserve the delimiters in each string as
228b39c5158Smillerttokens in the token lists, in addition to preserving quote and
229b39c5158Smillertbackslash characters.
230b39c5158Smillert
231*eac174f2Safresh1=back
232*eac174f2Safresh1
233*eac174f2Safresh1C<shellwords()> is written as a special case of C<quotewords()>, and it
234b39c5158Smillertdoes token parsing with whitespace as a delimiter-- similar to most
235b39c5158SmillertUnix shells.
236b39c5158Smillert
237b39c5158Smillert=head1 EXAMPLES
238b39c5158Smillert
239b39c5158SmillertThe sample program:
240b39c5158Smillert
241b39c5158Smillert  use Text::ParseWords;
242b39c5158Smillert  @words = quotewords('\s+', 0, q{this   is "a test" of\ quotewords \"for you});
243b39c5158Smillert  $i = 0;
244b39c5158Smillert  foreach (@words) {
245b39c5158Smillert      print "$i: <$_>\n";
246b39c5158Smillert      $i++;
247b39c5158Smillert  }
248b39c5158Smillert
249b39c5158Smillertproduces:
250b39c5158Smillert
251b39c5158Smillert  0: <this>
252b39c5158Smillert  1: <is>
253b39c5158Smillert  2: <a test>
254b39c5158Smillert  3: <of quotewords>
255b39c5158Smillert  4: <"for>
256b39c5158Smillert  5: <you>
257b39c5158Smillert
258b39c5158Smillertdemonstrating:
259b39c5158Smillert
260b39c5158Smillert=over 4
261b39c5158Smillert
262b8851fccSafresh1=item 0Z<>
263b39c5158Smillert
264b39c5158Smillerta simple word
265b39c5158Smillert
266b8851fccSafresh1=item 1Z<>
267b39c5158Smillert
268b39c5158Smillertmultiple spaces are skipped because of our $delim
269b39c5158Smillert
270b8851fccSafresh1=item 2Z<>
271b39c5158Smillert
272b39c5158Smillertuse of quotes to include a space in a word
273b39c5158Smillert
274b8851fccSafresh1=item 3Z<>
275b39c5158Smillert
276b39c5158Smillertuse of a backslash to include a space in a word
277b39c5158Smillert
278b8851fccSafresh1=item 4Z<>
279b39c5158Smillert
280b39c5158Smillertuse of a backslash to remove the special meaning of a double-quote
281b39c5158Smillert
282b8851fccSafresh1=item 5Z<>
283b39c5158Smillert
284b39c5158Smillertanother simple word (note the lack of effect of the
285b39c5158Smillertbackslashed double-quote)
286b39c5158Smillert
287b39c5158Smillert=back
288b39c5158Smillert
289b39c5158SmillertReplacing C<quotewords('\s+', 0, q{this   is...})>
290b39c5158Smillertwith C<shellwords(q{this   is...})>
291b39c5158Smillertis a simpler way to accomplish the same thing.
292b39c5158Smillert
29391f110e0Safresh1=head1 SEE ALSO
29491f110e0Safresh1
29591f110e0Safresh1L<Text::CSV> - for parsing CSV files
29691f110e0Safresh1
297b39c5158Smillert=head1 AUTHORS
298b39c5158Smillert
299*eac174f2Safresh1The original author is unknown,
300*eac174f2Safresh1but presumably this evolved from C<shellwords.pl> in Perl 4.
301b39c5158Smillert
302*eac174f2Safresh1Much of the code for C<parse_line()>
303*eac174f2Safresh1(including the primary regexp)
304*eac174f2Safresh1came from Joerk Behrends E<lt>jbehrends@multimediaproduzenten.deE<gt>.
305b39c5158Smillert
306*eac174f2Safresh1Examples section and other documentation provided by
307*eac174f2Safresh1John Heidemann E<lt>johnh@ISI.EDUE<gt>.
308b39c5158Smillert
309*eac174f2Safresh1Hal Pomeranz E<lt>pomeranz@netcom.comE<gt>
310*eac174f2Safresh1maintained this from 1994 through 1999,
311*eac174f2Safresh1and did the first CPAN release.
312*eac174f2Safresh1
313*eac174f2Safresh1Alexandr Ciornii E<lt>alexchornyATgmail.comE<gt>
314*eac174f2Safresh1maintained this from 2008 to 2015.
315*eac174f2Safresh1
316*eac174f2Safresh1Many other people have contributed,
317*eac174f2Safresh1with special thanks due to
318*eac174f2Safresh1Michael Schwern E<lt>schwern@envirolink.orgE<gt>
319*eac174f2Safresh1and
320*eac174f2Safresh1Jeff Friedl E<lt>jfriedl@yahoo-inc.comE<gt>.
321b39c5158Smillert
322b8851fccSafresh1=head1 COPYRIGHT AND LICENSE
323b8851fccSafresh1
324b8851fccSafresh1This library is free software; you may redistribute and/or modify it
325b8851fccSafresh1under the same terms as Perl itself.
326b8851fccSafresh1
327b39c5158Smillert=cut
328