xref: /onnv-gate/usr/src/cmd/perl/5.8.4/distrib/lib/Text/ParseWords.pm (revision 0:68f95e015346)
package Text::ParseWords;

# Package globals.  @EXPORT_OK is declared together with the other
# Exporter variables so that every global this module assigns to is
# pre-declared in one place.
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK $PERL_SINGLE_QUOTE);
$VERSION = "3.21";

# parse_line() uses the lexical "no warnings" pragma, which first
# shipped with perl 5.6, so that is the real minimum version.
require 5.006;

use Exporter;
@ISA = qw(Exporter);

# Exported by default.
@EXPORT = qw(shellwords quotewords nested_quotewords parse_line);

# Deprecated interface: only exported when explicitly requested.
@EXPORT_OK = qw(old_shellwords);
12*0Sstevel@tonic-gate
13*0Sstevel@tonic-gate
# shellwords(@lines)
#
# Splits the given lines into words using runs of whitespace as the
# delimiter, with quotes and backslashes removed (this is
# quotewords('\s+', 0, @lines) with one preprocessing step).
#
# Returns whatever quotewords() returns for the cleaned-up lines, or the
# empty list when called without any lines.
sub shellwords {
    # Private copy: the caller's strings must not be modified, and a
    # lexical avoids clobbering a package-level @lines.
    my @lines = @_;

    # Nothing to split.  This guard also matters for correctness:
    # applying s/// to $lines[-1] of an empty array is a fatal error.
    return() unless @lines;

    # Strip trailing whitespace from the last line so that it is not
    # treated as a delimiter introducing one more (empty) word.
    $lines[-1] =~ s/\s+$//;

    return(quotewords('\s+', 0, @lines));
}
19*0Sstevel@tonic-gate
20*0Sstevel@tonic-gate
21*0Sstevel@tonic-gate
# quotewords($delim, $keep, @lines)
#
# Tokenizes every line with parse_line($delim, $keep, $line) and returns
# all resulting tokens as one flat list, in input order.
#
# If any non-empty line yields no tokens at all (parse_line() could not
# parse it), the whole call returns the empty list.
sub quotewords {
    my ($delim, $keep, @lines) = @_;
    my @collected;

    for my $text (@lines) {
        my @tokens = parse_line($delim, $keep, $text);

        # An empty line legitimately produces no tokens; a non-empty one
        # producing none signals a parse failure.
        if (!@tokens && length($text)) {
            return();
        }

        push @collected, @tokens;
    }

    return @collected;
}
34*0Sstevel@tonic-gate
35*0Sstevel@tonic-gate
36*0Sstevel@tonic-gate
# nested_quotewords($delim, $keep, @lines)
#
# Tokenizes every line with parse_line($delim, $keep, $line) and returns
# a list of array references, one per input line and in the same order,
# each holding that line's tokens.
#
# If any non-empty line yields no tokens at all (parse_line() could not
# parse it), the whole call returns the empty list.
sub nested_quotewords {
    my ($delim, $keep, @lines) = @_;
    my @token_lists;

    for my $idx (0 .. $#lines) {
        my @tokens = parse_line($delim, $keep, $lines[$idx]);

        # Empty result for a non-empty line means the parse failed.
        return() if !@tokens && length($lines[$idx]);

        $token_lists[$idx] = [@tokens];
    }

    return @token_lists;
}
47*0Sstevel@tonic-gate
48*0Sstevel@tonic-gate
49*0Sstevel@tonic-gate
# parse_line($delimiter, $keep, $line)
#
# Splits one string into tokens.
#
#   $delimiter  regular expression (as a string) separating tokens;
#               delimiters inside quotes do not split
#   $keep       false:        quotes are removed and backslash escapes
#                             are resolved
#               true:         quotes and backslashes are left in place
#               'delimiters': as true, and every matched delimiter is
#                             also returned as a token of its own
#   $line       the text to tokenize
#
# Returns the list of tokens.  When $line ends with a delimiter the last
# element is undef.  Returns the empty list when $line is empty or
# cannot be parsed (for example an unterminated quote or a backslash
# with nothing after it).
#
# In single-quoted text, backslashes are left alone unless
# $Text::ParseWords::PERL_SINGLE_QUOTE is true, in which case \\ and \'
# are unescaped.
sub parse_line {
    # Undefined values are tested and concatenated on purpose below.
    no warnings;
    use re 'taint'; # if it's tainted, leave it as such

    my($delimiter, $keep, $line) = @_;
    my($quote, $quoted, $after_quoted, $unquoted, $delim, $after_unquoted);
    my($word, @pieces);

    while (length($line)) {

        # Peel one chunk off the front of the line: either a complete
        # quoted string, or a (possibly empty) run of unquoted text up to
        # the next delimiter, quote, or end of input.
        #
        # "(?s:.)" is used so that an escaped newline and characters
        # above 0xFF are accepted; the flag is scoped to those spots so
        # that "." inside the caller's delimiter keeps its usual meaning.
        ($quote, $quoted, $after_quoted, $unquoted, $delim, $after_unquoted) =
            $line =~ m/^(["'])                        # an opening quote
                        ((?:\\(?s:.)|(?!\1)[^\\])*)   # the quoted text
                        \1                            # the same quote again
                        ((?s:.*))                     # and the rest
                       |                              # --OR--
                       ^((?:\\(?s:.)|[^\\"'])*?)      # unquoted text
                        (\Z(?!\n)|(?-x:$delimiter)|(?!^)(?=["']))
                                                      # EOL, delimiter, or quote
                        ((?s:.*))                     # the rest
                      /x;                             # extended layout

        # A failed match leaves every capture undefined; a successful one
        # must have consumed a quote, some text, or a delimiter.
        return() unless( defined($quote) || length($unquoted) || length($delim));

        # Continue with whatever follows the chunk just consumed.
        $line = defined $quote ? $after_quoted : $after_unquoted;

        if ($keep) {
            # Put the quotes back around the quoted text.
            $quoted = "$quote$quoted$quote";
        }
        else {
            $unquoted =~ s/\\(.)/$1/sg;
            if (defined $quote) {
                $quoted =~ s/\\(.)/$1/sg if ($quote eq '"');
                $quoted =~ s/\\([\\'])/$1/g
                    if ( $Text::ParseWords::PERL_SINGLE_QUOTE && $quote eq "'");
            }
        }

        # Adjacent quoted and unquoted chunks belong to the same word.
        $word .= defined $quote ? $quoted : $unquoted;

        if (length($delim)) {
            # A non-empty delimiter terminates the current word.
            push(@pieces, $word);
            push(@pieces, $delim) if ($keep eq 'delimiters');
            undef $word;
        }
        if (!length($line)) {
            # End of input: flush the word in progress (undef if the
            # line ended with a delimiter).
            push(@pieces, $word);
        }
    }
    return(@pieces);
}
98*0Sstevel@tonic-gate
99*0Sstevel@tonic-gate
100*0Sstevel@tonic-gate
# old_shellwords(@lines)   -- DEPRECATED
#
# Usage:
#	use ParseWords;
#	@words = old_shellwords($line);
#	or
#	@words = old_shellwords(@lines);
#
# Joins all arguments into one string and splits it into
# whitespace-separated words.  Double- and single-quoted sections may
# contain whitespace; a backslash escapes the following character, both
# inside quotes and outside of them.
#
# Returns the list of words, or the empty list if a quote is left
# unterminated.
sub old_shellwords {
    my $text = join('', @_);
    my(@words, $snippet);

    $text =~ s/^\s+//;
    while ($text ne '') {
        my $field = '';

        # Build one word from consecutive quoted, escaped and plain
        # snippets.  Every branch except the last consumes input.
        for (;;) {
            if ($text =~ s/^"(([^"\\]|\\.)*)"//s) {
                ($snippet = $1) =~ s#\\(.)#$1#sg;
            }
            elsif ($text =~ /^"/) {
                # Opening double quote without a closing one.
                return();
            }
            elsif ($text =~ s/^'(([^'\\]|\\.)*)'//s) {
                ($snippet = $1) =~ s#\\(.)#$1#sg;
            }
            elsif ($text =~ /^'/) {
                # Opening single quote without a closing one.
                return();
            }
            elsif ($text =~ s/^\\(.?)//s) {
                # "(.?)" with /s always consumes the backslash, even when
                # it is followed by a newline or is the last character.
                # Otherwise nothing would be consumed and the outer loop
                # would never terminate.
                $snippet = $1;
            }
            elsif ($text =~ s/^([^\s\\'"]+)//) {
                $snippet = $1;
            }
            else {
                # Whitespace or end of input: the word is complete.
                $text =~ s/^\s+//;
                last;
            }
            $field .= $snippet;
        }
        push(@words, $field);
    }
    return @words;
}
144*0Sstevel@tonic-gate
145*0Sstevel@tonic-gate1;
146*0Sstevel@tonic-gate
147*0Sstevel@tonic-gate__END__
148*0Sstevel@tonic-gate
149*0Sstevel@tonic-gate=head1 NAME
150*0Sstevel@tonic-gate
151*0Sstevel@tonic-gateText::ParseWords - parse text into an array of tokens or array of arrays
152*0Sstevel@tonic-gate
153*0Sstevel@tonic-gate=head1 SYNOPSIS
154*0Sstevel@tonic-gate
155*0Sstevel@tonic-gate  use Text::ParseWords;
156*0Sstevel@tonic-gate  @lists = &nested_quotewords($delim, $keep, @lines);
157*0Sstevel@tonic-gate  @words = &quotewords($delim, $keep, @lines);
158*0Sstevel@tonic-gate  @words = &shellwords(@lines);
159*0Sstevel@tonic-gate  @words = &parse_line($delim, $keep, $line);
160*0Sstevel@tonic-gate  @words = &old_shellwords(@lines); # DEPRECATED!
161*0Sstevel@tonic-gate
162*0Sstevel@tonic-gate=head1 DESCRIPTION
163*0Sstevel@tonic-gate
The &nested_quotewords() and &quotewords() functions accept a delimiter
(which can be a regular expression)
and a list of lines and then break those lines up into a list of
words ignoring delimiters that appear inside quotes.  &quotewords()
returns all of the tokens in a single long list, while &nested_quotewords()
returns a list of token lists corresponding to the elements of @lines.
&parse_line() does tokenizing on a single string.  The &*quotewords()
functions simply call &parse_line(), so if you're only splitting
one line you can call &parse_line() directly and save a function
call.
174*0Sstevel@tonic-gate
175*0Sstevel@tonic-gateThe $keep argument is a boolean flag.  If true, then the tokens are
176*0Sstevel@tonic-gatesplit on the specified delimiter, but all other characters (quotes,
177*0Sstevel@tonic-gatebackslashes, etc.) are kept in the tokens.  If $keep is false then the
178*0Sstevel@tonic-gate&*quotewords() functions remove all quotes and backslashes that are
179*0Sstevel@tonic-gatenot themselves backslash-escaped or inside of single quotes (i.e.,
180*0Sstevel@tonic-gate&quotewords() tries to interpret these characters just like the Bourne
181*0Sstevel@tonic-gateshell).  NB: these semantics are significantly different from the
182*0Sstevel@tonic-gateoriginal version of this module shipped with Perl 5.000 through 5.004.
183*0Sstevel@tonic-gateAs an additional feature, $keep may be the keyword "delimiters" which
184*0Sstevel@tonic-gatecauses the functions to preserve the delimiters in each string as
185*0Sstevel@tonic-gatetokens in the token lists, in addition to preserving quote and
186*0Sstevel@tonic-gatebackslash characters.
187*0Sstevel@tonic-gate
188*0Sstevel@tonic-gate&shellwords() is written as a special case of &quotewords(), and it
189*0Sstevel@tonic-gatedoes token parsing with whitespace as a delimiter-- similar to most
190*0Sstevel@tonic-gateUnix shells.
191*0Sstevel@tonic-gate
192*0Sstevel@tonic-gate=head1 EXAMPLES
193*0Sstevel@tonic-gate
194*0Sstevel@tonic-gateThe sample program:
195*0Sstevel@tonic-gate
196*0Sstevel@tonic-gate  use Text::ParseWords;
197*0Sstevel@tonic-gate  @words = &quotewords('\s+', 0, q{this   is "a test" of\ quotewords \"for you});
198*0Sstevel@tonic-gate  $i = 0;
199*0Sstevel@tonic-gate  foreach (@words) {
200*0Sstevel@tonic-gate      print "$i: <$_>\n";
201*0Sstevel@tonic-gate      $i++;
202*0Sstevel@tonic-gate  }
203*0Sstevel@tonic-gate
204*0Sstevel@tonic-gateproduces:
205*0Sstevel@tonic-gate
206*0Sstevel@tonic-gate  0: <this>
207*0Sstevel@tonic-gate  1: <is>
208*0Sstevel@tonic-gate  2: <a test>
209*0Sstevel@tonic-gate  3: <of quotewords>
210*0Sstevel@tonic-gate  4: <"for>
211*0Sstevel@tonic-gate  5: <you>
212*0Sstevel@tonic-gate
213*0Sstevel@tonic-gatedemonstrating:
214*0Sstevel@tonic-gate
215*0Sstevel@tonic-gate=over 4
216*0Sstevel@tonic-gate
217*0Sstevel@tonic-gate=item 0
218*0Sstevel@tonic-gate
219*0Sstevel@tonic-gatea simple word
220*0Sstevel@tonic-gate
221*0Sstevel@tonic-gate=item 1
222*0Sstevel@tonic-gate
223*0Sstevel@tonic-gatemultiple spaces are skipped because of our $delim
224*0Sstevel@tonic-gate
225*0Sstevel@tonic-gate=item 2
226*0Sstevel@tonic-gate
227*0Sstevel@tonic-gateuse of quotes to include a space in a word
228*0Sstevel@tonic-gate
229*0Sstevel@tonic-gate=item 3
230*0Sstevel@tonic-gate
231*0Sstevel@tonic-gateuse of a backslash to include a space in a word
232*0Sstevel@tonic-gate
233*0Sstevel@tonic-gate=item 4
234*0Sstevel@tonic-gate
235*0Sstevel@tonic-gateuse of a backslash to remove the special meaning of a double-quote
236*0Sstevel@tonic-gate
237*0Sstevel@tonic-gate=item 5
238*0Sstevel@tonic-gate
239*0Sstevel@tonic-gateanother simple word (note the lack of effect of the
240*0Sstevel@tonic-gatebackslashed double-quote)
241*0Sstevel@tonic-gate
242*0Sstevel@tonic-gate=back
243*0Sstevel@tonic-gate
244*0Sstevel@tonic-gateReplacing C<&quotewords('\s+', 0, q{this   is...})>
245*0Sstevel@tonic-gatewith C<&shellwords(q{this   is...})>
246*0Sstevel@tonic-gateis a simpler way to accomplish the same thing.
247*0Sstevel@tonic-gate
248*0Sstevel@tonic-gate=head1 AUTHORS
249*0Sstevel@tonic-gate
250*0Sstevel@tonic-gateMaintainer is Hal Pomeranz <pomeranz@netcom.com>, 1994-1997 (Original
251*0Sstevel@tonic-gateauthor unknown).  Much of the code for &parse_line() (including the
252*0Sstevel@tonic-gateprimary regexp) from Joerk Behrends <jbehrends@multimediaproduzenten.de>.
253*0Sstevel@tonic-gate
Examples section and other documentation provided by John Heidemann
<johnh@ISI.EDU>
256*0Sstevel@tonic-gate
257*0Sstevel@tonic-gateBug reports, patches, and nagging provided by lots of folks-- thanks
258*0Sstevel@tonic-gateeverybody!  Special thanks to Michael Schwern <schwern@envirolink.org>
259*0Sstevel@tonic-gatefor assuring me that a &nested_quotewords() would be useful, and to
260*0Sstevel@tonic-gateJeff Friedl <jfriedl@yahoo-inc.com> for telling me not to worry about
261*0Sstevel@tonic-gateerror-checking (sort of-- you had to be there).
262*0Sstevel@tonic-gate
263*0Sstevel@tonic-gate=cut
264