xref: /openbsd-src/gnu/usr.bin/perl/cpan/Text-ParseWords/lib/Text/ParseWords.pm (revision 46035553bfdd96e63c94e32da0210227ec2e3cf1)
1package Text::ParseWords;
2
3use strict;
4require 5.006;
5our $VERSION = "3.30";
6
7
8use Exporter;
9our @ISA = qw(Exporter);
10our @EXPORT = qw(shellwords quotewords nested_quotewords parse_line);
11our @EXPORT_OK = qw(old_shellwords);
12our $PERL_SINGLE_QUOTE;
13
14
15sub shellwords {
16    my (@lines) = @_;
17    my @allwords;
18
19    foreach my $line (@lines) {
20	$line =~ s/^\s+//;
21	my @words = parse_line('\s+', 0, $line);
22	pop @words if (@words and !defined $words[-1]);
23	return() unless (@words || !length($line));
24	push(@allwords, @words);
25    }
26    return(@allwords);
27}
28
29
30
31sub quotewords {
32    my($delim, $keep, @lines) = @_;
33    my($line, @words, @allwords);
34
35    foreach $line (@lines) {
36	@words = parse_line($delim, $keep, $line);
37	return() unless (@words || !length($line));
38	push(@allwords, @words);
39    }
40    return(@allwords);
41}
42
43
44
45sub nested_quotewords {
46    my($delim, $keep, @lines) = @_;
47    my($i, @allwords);
48
49    for ($i = 0; $i < @lines; $i++) {
50	@{$allwords[$i]} = parse_line($delim, $keep, $lines[$i]);
51	return() unless (@{$allwords[$i]} || !length($lines[$i]));
52    }
53    return(@allwords);
54}
55
56
57
58sub parse_line {
59    my($delimiter, $keep, $line) = @_;
60    my($word, @pieces);
61
62    no warnings 'uninitialized';	# we will be testing undef strings
63
64    while (length($line)) {
65        # This pattern is optimised to be stack conservative on older perls.
66        # Do not refactor without being careful and testing it on very long strings.
67        # See Perl bug #42980 for an example of a stack busting input.
68        $line =~ s/^
69                    (?:
70                        # double quoted string
71                        (")                             # $quote
72                        ((?>[^\\"]*(?:\\.[^\\"]*)*))"   # $quoted
73		    |	# --OR--
74                        # singe quoted string
75                        (')                             # $quote
76                        ((?>[^\\']*(?:\\.[^\\']*)*))'   # $quoted
77                    |   # --OR--
78                        # unquoted string
79		        (                               # $unquoted
80                            (?:\\.|[^\\"'])*?
81                        )
82                        # followed by
83		        (                               # $delim
84                            \Z(?!\n)                    # EOL
85                        |   # --OR--
86                            (?-x:$delimiter)            # delimiter
87                        |   # --OR--
88                            (?!^)(?=["'])               # a quote
89                        )
90		    )//xs or return;		# extended layout
91        my ($quote, $quoted, $unquoted, $delim) = (($1 ? ($1,$2) : ($3,$4)), $5, $6);
92
93
94	return() unless( defined($quote) || length($unquoted) || length($delim));
95
96        if ($keep) {
97	    $quoted = "$quote$quoted$quote";
98	}
99        else {
100	    $unquoted =~ s/\\(.)/$1/sg;
101	    if (defined $quote) {
102		$quoted =~ s/\\(.)/$1/sg if ($quote eq '"');
103		$quoted =~ s/\\([\\'])/$1/g if ( $PERL_SINGLE_QUOTE && $quote eq "'");
104            }
105	}
106        $word .= substr($line, 0, 0);	# leave results tainted
107        $word .= defined $quote ? $quoted : $unquoted;
108
109        if (length($delim)) {
110            push(@pieces, $word);
111            push(@pieces, $delim) if ($keep eq 'delimiters');
112            undef $word;
113        }
114        if (!length($line)) {
115            push(@pieces, $word);
116	}
117    }
118    return(@pieces);
119}
120
121
122
123sub old_shellwords {
124
125    # Usage:
126    #	use ParseWords;
127    #	@words = old_shellwords($line);
128    #	or
129    #	@words = old_shellwords(@lines);
130    #	or
131    #	@words = old_shellwords();	# defaults to $_ (and clobbers it)
132
133    no warnings 'uninitialized';	# we will be testing undef strings
134    local *_ = \join('', @_) if @_;
135    my (@words, $snippet);
136
137    s/\A\s+//;
138    while ($_ ne '') {
139	my $field = substr($_, 0, 0);	# leave results tainted
140	for (;;) {
141	    if (s/\A"(([^"\\]|\\.)*)"//s) {
142		($snippet = $1) =~ s#\\(.)#$1#sg;
143	    }
144	    elsif (/\A"/) {
145		require Carp;
146		Carp::carp("Unmatched double quote: $_");
147		return();
148	    }
149	    elsif (s/\A'(([^'\\]|\\.)*)'//s) {
150		($snippet = $1) =~ s#\\(.)#$1#sg;
151	    }
152	    elsif (/\A'/) {
153		require Carp;
154		Carp::carp("Unmatched single quote: $_");
155		return();
156	    }
157	    elsif (s/\A\\(.?)//s) {
158		$snippet = $1;
159	    }
160	    elsif (s/\A([^\s\\'"]+)//) {
161		$snippet = $1;
162	    }
163	    else {
164		s/\A\s+//;
165		last;
166	    }
167	    $field .= $snippet;
168	}
169	push(@words, $field);
170    }
171    return @words;
172}
173
1741;
175
176__END__
177
178=head1 NAME
179
180Text::ParseWords - parse text into an array of tokens or array of arrays
181
182=head1 SYNOPSIS
183
184  use Text::ParseWords;
185  @lists = nested_quotewords($delim, $keep, @lines);
186  @words = quotewords($delim, $keep, @lines);
187  @words = shellwords(@lines);
188  @words = parse_line($delim, $keep, $line);
189  @words = old_shellwords(@lines); # DEPRECATED!
190
191=head1 DESCRIPTION
192
193The &nested_quotewords() and &quotewords() functions accept a delimiter
194(which can be a regular expression)
195and a list of lines and then breaks those lines up into a list of
196words ignoring delimiters that appear inside quotes.  &quotewords()
197returns all of the tokens in a single long list, while &nested_quotewords()
198returns a list of token lists corresponding to the elements of @lines.
199&parse_line() does tokenizing on a single string.  The &*quotewords()
200functions simply call &parse_line(), so if you're only splitting
201one line you can call &parse_line() directly and save a function
202call.
203
204The $keep argument is a boolean flag.  If true, then the tokens are
205split on the specified delimiter, but all other characters (including
206quotes and backslashes) are kept in the tokens.  If $keep is false then the
207&*quotewords() functions remove all quotes and backslashes that are
208not themselves backslash-escaped or inside of single quotes (i.e.,
209&quotewords() tries to interpret these characters just like the Bourne
210shell).  NB: these semantics are significantly different from the
211original version of this module shipped with Perl 5.000 through 5.004.
212As an additional feature, $keep may be the keyword "delimiters" which
213causes the functions to preserve the delimiters in each string as
214tokens in the token lists, in addition to preserving quote and
215backslash characters.
216
217&shellwords() is written as a special case of &quotewords(), and it
218does token parsing with whitespace as a delimiter-- similar to most
219Unix shells.
220
221=head1 EXAMPLES
222
223The sample program:
224
225  use Text::ParseWords;
226  @words = quotewords('\s+', 0, q{this   is "a test" of\ quotewords \"for you});
227  $i = 0;
228  foreach (@words) {
229      print "$i: <$_>\n";
230      $i++;
231  }
232
233produces:
234
235  0: <this>
236  1: <is>
237  2: <a test>
238  3: <of quotewords>
239  4: <"for>
240  5: <you>
241
242demonstrating:
243
244=over 4
245
246=item 0Z<>
247
248a simple word
249
250=item 1Z<>
251
252multiple spaces are skipped because of our $delim
253
254=item 2Z<>
255
256use of quotes to include a space in a word
257
258=item 3Z<>
259
260use of a backslash to include a space in a word
261
262=item 4Z<>
263
264use of a backslash to remove the special meaning of a double-quote
265
266=item 5Z<>
267
268another simple word (note the lack of effect of the
269backslashed double-quote)
270
271=back
272
273Replacing C<quotewords('\s+', 0, q{this   is...})>
274with C<shellwords(q{this   is...})>
275is a simpler way to accomplish the same thing.
276
277=head1 SEE ALSO
278
279L<Text::CSV> - for parsing CSV files
280
281=head1 AUTHORS
282
283Maintainer: Alexandr Ciornii <alexchornyATgmail.com>.
284
285Previous maintainer: Hal Pomeranz <pomeranz@netcom.com>, 1994-1997 (Original
286author unknown).  Much of the code for &parse_line() (including the
287primary regexp) from Joerk Behrends <jbehrends@multimediaproduzenten.de>.
288
289Examples section another documentation provided by John Heidemann
290<johnh@ISI.EDU>
291
292Bug reports, patches, and nagging provided by lots of folks-- thanks
293everybody!  Special thanks to Michael Schwern <schwern@envirolink.org>
294for assuring me that a &nested_quotewords() would be useful, and to
295Jeff Friedl <jfriedl@yahoo-inc.com> for telling me not to worry about
296error-checking (sort of-- you had to be there).
297
298=head1 COPYRIGHT AND LICENSE
299
300This library is free software; you may redistribute and/or modify it
301under the same terms as Perl itself.
302
303=cut
304