xref: /onnv-gate/usr/src/cmd/perl/5.8.4/distrib/lib/Text/ParseWords.pm (revision 0:68f95e015346)
package Text::ParseWords;

# Package globals.  @EXPORT_OK is declared here together with the others so
# that every global this module assigns is covered by the same declaration.
# $PERL_SINGLE_QUOTE is a caller-settable switch read by parse_line(): when
# true, backslash-escaped backslashes and single quotes are unescaped inside
# single-quoted text.
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK $PERL_SINGLE_QUOTE);
$VERSION = "3.21";

require 5.000;

use Exporter;
@ISA = qw(Exporter);

# Exported by default.
@EXPORT = qw(shellwords quotewords nested_quotewords parse_line);

# Exported only on request.
@EXPORT_OK = qw(old_shellwords);
12
13
# shellwords(@lines)
#
# Split the given lines into words using runs of whitespace as the
# delimiter, with quotes and backslashes removed ($keep is passed as 0).
# This is a thin wrapper around quotewords().
#
# Returns the list of words, or an empty list when called with no lines or
# when quotewords() reports a parse failure.
sub shellwords {
    # Work on a private copy; the caller's strings are never modified.
    my (@lines) = @_;

    # With no input there is nothing to parse.  Returning early also avoids
    # addressing element -1 of an empty array below, which is a fatal error.
    return() unless @lines;

    # A delimiter at the very end of the input makes parse_line() push an
    # extra undefined word, so trailing whitespace is removed from the last
    # line before parsing.
    $lines[-1] =~ s/\s+$//;

    return(quotewords('\s+', 0, @lines));
}
19
20
21
# quotewords($delim, $keep, @lines)
#
# Tokenize every line with parse_line() and return all of the resulting
# words as one flat list.  $delim and $keep are handed to parse_line()
# unchanged.
#
# If any non-empty line yields no words the whole call is abandoned and an
# empty list is returned.
sub quotewords {
    my ($delim, $keep, @lines) = @_;
    my @collected;

    for my $text (@lines) {
        my @tokens = parse_line($delim, $keep, $text);

        # No tokens from a line that actually had content: give up.
        return() if (!@tokens && length($text));

        push @collected, @tokens;
    }

    return(@collected);
}
34
35
36
# nested_quotewords($delim, $keep, @lines)
#
# Tokenize every line with parse_line() and return one array reference per
# input line, in input order, each holding that line's words.  $delim and
# $keep are handed to parse_line() unchanged.
#
# If any non-empty line yields no words the whole call is abandoned and an
# empty list is returned.
sub nested_quotewords {
    my ($delim, $keep, @lines) = @_;
    my @per_line;

    foreach my $text (@lines) {
        my @tokens = parse_line($delim, $keep, $text);

        # No tokens from a line that actually had content: give up.
        return() if (!@tokens && length($text));

        push @per_line, [ @tokens ];
    }

    return(@per_line);
}
47
48
49
# parse_line($delimiter, $keep, $line)
#
# Tokenize a single string.
#
#   $delimiter  regular expression (as a string) that separates words
#   $keep       false: strip quotes and unescape backslashes;
#               true: leave quotes and backslashes in the words;
#               'delimiters': as true, and also return each matched
#               delimiter as a token of its own
#   $line       the text to split
#
# Returns the list of words.  An empty list is returned when the text
# cannot be matched (for example an unterminated quote).  When the text
# ends in a delimiter the final element of the list is undef.
sub parse_line {
    # Captures belonging to the regex alternative that did not match are
    # undef, and they are deliberately tested and concatenated below.
    no warnings 'uninitialized';
    use re 'taint'; # if it's tainted, leave it as such

    my($delimiter, $keep, $line) = @_;
    my($quote, $quoted, $quoted_rest, $unquoted, $delim, $unquoted_rest);
    my($word, @pieces);

    while (length($line)) {

        # Peel one piece off the front of the line: either a complete quoted
        # string, or a run of unquoted text followed by what ended it.
        #
        # The s flag lets "." match a newline so that a backslash-escaped
        # newline is accepted.  The caller's delimiter is wrapped in (?-xs:)
        # so neither flag changes the meaning it was written with.  The
        # remainder is captured with (.*) instead of a 0..255 character
        # class, which stopped at the first character above 0xFF and
        # silently discarded everything after it.
        ($quote, $quoted, $quoted_rest, $unquoted, $delim, $unquoted_rest) =
            $line =~ m/^(["'])                  # an opening quote
                        ((?:\\.|(?!\1)[^\\])*)  # the quoted text
                        \1                      # the same quote again
                        (.*)                    # and the rest
                       |                        # --OR--
                       ^((?:\\.|[^\\"'])*?)     # some unquoted text
                        (\Z(?!\n)|(?-xs:$delimiter)|(?!^)(?=["']))
                                                # plus EOL, delimiter, or quote
                        (.*)                    # and the rest
                      /xs;
        return() unless( $quote || length($unquoted) || length($delim));

        # Continue with whatever followed the piece just matched.
        $line = defined $quote ? $quoted_rest : $unquoted_rest;

        if ($keep) {
            # Put the surrounding quotes back; backslashes are left alone.
            $quoted = "$quote$quoted$quote";
        }
        else {
            $unquoted =~ s/\\(.)/$1/sg;
            if (defined $quote) {
                # Double quotes: every backslash escape is removed.
                $quoted =~ s/\\(.)/$1/sg if ($quote eq '"');
                # Single quotes: only \\ and \' are unescaped, and only when
                # the caller has switched $PERL_SINGLE_QUOTE on.
                $quoted =~ s/\\([\\'])/$1/g if ( $PERL_SINGLE_QUOTE && $quote eq "'");
            }
        }

        # Adjacent quoted and unquoted pieces accumulate into one word until
        # a delimiter (or the end of the line) is reached.
        $word .= defined $quote ? $quoted : $unquoted;

        if (length($delim)) {
            push(@pieces, $word);
            push(@pieces, $delim) if ($keep eq 'delimiters');
            undef $word;
        }
        if (!length($line)) {
            push(@pieces, $word);
        }
    }
    return(@pieces);
}
98
99
100
# old_shellwords(@lines)
#
# Usage:
#	use Text::ParseWords qw(old_shellwords);
#	@words = old_shellwords($line);
#	or
#	@words = old_shellwords(@lines);
#
# The arguments are joined into one string, which is split into
# whitespace-separated words.  Double- and single-quoted sections may
# contain whitespace; a backslash makes the following character literal,
# inside or outside quotes.
#
# Returns the list of words, or an empty list when the text contains an
# unterminated quote or ends in a lone backslash.
sub old_shellwords {
    my $text = join('', @_);
    my(@words, $snippet, $field);

    $text =~ s/^\s+//;
    while ($text ne '') {
	$field = '';

	# Collect adjacent quoted, escaped and plain pieces into one field
	# until whitespace or the end of the text is reached.  The s flag on
	# the patterns lets an escaped newline count as an escaped character.
	for (;;) {
	    if ($text =~ s/^"(([^"\\]|\\.)*)"//s) {
		($snippet = $1) =~ s#\\(.)#$1#sg;
	    }
	    elsif ($text =~ /^"/) {
		# Opening double quote with no matching close.
		return();
	    }
	    elsif ($text =~ s/^'(([^'\\]|\\.)*)'//s) {
		($snippet = $1) =~ s#\\(.)#$1#sg;
	    }
	    elsif ($text =~ /^'/) {
		# Opening single quote with no matching close.
		return();
	    }
	    elsif ($text =~ s/^\\(.)//s) {
		$snippet = $1;
	    }
	    elsif ($text =~ /^\\/) {
		# A backslash with nothing after it.  No other branch can
		# consume it, so without this check the outer loop would
		# never terminate; treat it as a parse failure like an
		# unterminated quote.
		return();
	    }
	    elsif ($text =~ s/^([^\s\\'"]+)//) {
		$snippet = $1;
	    }
	    else {
		# Whitespace or end of text: the current field is complete.
		$text =~ s/^\s+//;
		last;
	    }
	    $field .= $snippet;
	}
	push(@words, $field);
    }
    return @words;
}
144
1451;
146
147__END__
148
149=head1 NAME
150
151Text::ParseWords - parse text into an array of tokens or array of arrays
152
153=head1 SYNOPSIS
154
155  use Text::ParseWords;
156  @lists = &nested_quotewords($delim, $keep, @lines);
157  @words = &quotewords($delim, $keep, @lines);
158  @words = &shellwords(@lines);
159  @words = &parse_line($delim, $keep, $line);
160  @words = &old_shellwords(@lines); # DEPRECATED!
161
162=head1 DESCRIPTION
163
The &nested_quotewords() and &quotewords() functions accept a delimiter
(which can be a regular expression)
and a list of lines and then break those lines up into a list of
words ignoring delimiters that appear inside quotes.  &quotewords()
returns all of the tokens in a single long list, while &nested_quotewords()
returns a list of token lists corresponding to the elements of @lines.
&parse_line() does tokenizing on a single string.  The &*quotewords()
functions simply call &parse_line(), so if you're only splitting
one line you can call &parse_line() directly and save a function
call.
174
175The $keep argument is a boolean flag.  If true, then the tokens are
176split on the specified delimiter, but all other characters (quotes,
177backslashes, etc.) are kept in the tokens.  If $keep is false then the
178&*quotewords() functions remove all quotes and backslashes that are
179not themselves backslash-escaped or inside of single quotes (i.e.,
180&quotewords() tries to interpret these characters just like the Bourne
181shell).  NB: these semantics are significantly different from the
182original version of this module shipped with Perl 5.000 through 5.004.
183As an additional feature, $keep may be the keyword "delimiters" which
184causes the functions to preserve the delimiters in each string as
185tokens in the token lists, in addition to preserving quote and
186backslash characters.
187
188&shellwords() is written as a special case of &quotewords(), and it
189does token parsing with whitespace as a delimiter-- similar to most
190Unix shells.
191
192=head1 EXAMPLES
193
194The sample program:
195
196  use Text::ParseWords;
197  @words = &quotewords('\s+', 0, q{this   is "a test" of\ quotewords \"for you});
198  $i = 0;
199  foreach (@words) {
200      print "$i: <$_>\n";
201      $i++;
202  }
203
204produces:
205
206  0: <this>
207  1: <is>
208  2: <a test>
209  3: <of quotewords>
210  4: <"for>
211  5: <you>
212
213demonstrating:
214
215=over 4
216
217=item 0
218
219a simple word
220
221=item 1
222
223multiple spaces are skipped because of our $delim
224
225=item 2
226
227use of quotes to include a space in a word
228
229=item 3
230
231use of a backslash to include a space in a word
232
233=item 4
234
235use of a backslash to remove the special meaning of a double-quote
236
237=item 5
238
239another simple word (note the lack of effect of the
240backslashed double-quote)
241
242=back
243
244Replacing C<&quotewords('\s+', 0, q{this   is...})>
245with C<&shellwords(q{this   is...})>
246is a simpler way to accomplish the same thing.
247
248=head1 AUTHORS
249
250Maintainer is Hal Pomeranz <pomeranz@netcom.com>, 1994-1997 (Original
251author unknown).  Much of the code for &parse_line() (including the
252primary regexp) from Joerk Behrends <jbehrends@multimediaproduzenten.de>.
253
Examples section and other documentation provided by John Heidemann
<johnh@ISI.EDU>
256
257Bug reports, patches, and nagging provided by lots of folks-- thanks
258everybody!  Special thanks to Michael Schwern <schwern@envirolink.org>
259for assuring me that a &nested_quotewords() would be useful, and to
260Jeff Friedl <jfriedl@yahoo-inc.com> for telling me not to worry about
261error-checking (sort of-- you had to be there).
262
263=cut
264