1package Text::ParseWords; 2 3use strict; 4require 5.006; 5our $VERSION = "3.30"; 6 7 8use Exporter; 9our @ISA = qw(Exporter); 10our @EXPORT = qw(shellwords quotewords nested_quotewords parse_line); 11our @EXPORT_OK = qw(old_shellwords); 12our $PERL_SINGLE_QUOTE; 13 14 15sub shellwords { 16 my (@lines) = @_; 17 my @allwords; 18 19 foreach my $line (@lines) { 20 $line =~ s/^\s+//; 21 my @words = parse_line('\s+', 0, $line); 22 pop @words if (@words and !defined $words[-1]); 23 return() unless (@words || !length($line)); 24 push(@allwords, @words); 25 } 26 return(@allwords); 27} 28 29 30 31sub quotewords { 32 my($delim, $keep, @lines) = @_; 33 my($line, @words, @allwords); 34 35 foreach $line (@lines) { 36 @words = parse_line($delim, $keep, $line); 37 return() unless (@words || !length($line)); 38 push(@allwords, @words); 39 } 40 return(@allwords); 41} 42 43 44 45sub nested_quotewords { 46 my($delim, $keep, @lines) = @_; 47 my($i, @allwords); 48 49 for ($i = 0; $i < @lines; $i++) { 50 @{$allwords[$i]} = parse_line($delim, $keep, $lines[$i]); 51 return() unless (@{$allwords[$i]} || !length($lines[$i])); 52 } 53 return(@allwords); 54} 55 56 57 58sub parse_line { 59 my($delimiter, $keep, $line) = @_; 60 my($word, @pieces); 61 62 no warnings 'uninitialized'; # we will be testing undef strings 63 64 while (length($line)) { 65 # This pattern is optimised to be stack conservative on older perls. 66 # Do not refactor without being careful and testing it on very long strings. 67 # See Perl bug #42980 for an example of a stack busting input. 68 $line =~ s/^ 69 (?: 70 # double quoted string 71 (") # $quote 72 ((?>[^\\"]*(?:\\.[^\\"]*)*))" # $quoted 73 | # --OR-- 74 # singe quoted string 75 (') # $quote 76 ((?>[^\\']*(?:\\.[^\\']*)*))' # $quoted 77 | # --OR-- 78 # unquoted string 79 ( # $unquoted 80 (?:\\.|[^\\"'])*? 81 ) 82 # followed by 83 ( # $delim 84 \Z(?!\n) # EOL 85 | # --OR-- 86 (?-x:$delimiter) # delimiter 87 | # --OR-- 88 (?!^)(?=["']) # a quote 89 ) 90 )//xs or return; # extended layout 91 my ($quote, $quoted, $unquoted, $delim) = (($1 ? ($1,$2) : ($3,$4)), $5, $6); 92 93 94 return() unless( defined($quote) || length($unquoted) || length($delim)); 95 96 if ($keep) { 97 $quoted = "$quote$quoted$quote"; 98 } 99 else { 100 $unquoted =~ s/\\(.)/$1/sg; 101 if (defined $quote) { 102 $quoted =~ s/\\(.)/$1/sg if ($quote eq '"'); 103 $quoted =~ s/\\([\\'])/$1/g if ( $PERL_SINGLE_QUOTE && $quote eq "'"); 104 } 105 } 106 $word .= substr($line, 0, 0); # leave results tainted 107 $word .= defined $quote ? $quoted : $unquoted; 108 109 if (length($delim)) { 110 push(@pieces, $word); 111 push(@pieces, $delim) if ($keep eq 'delimiters'); 112 undef $word; 113 } 114 if (!length($line)) { 115 push(@pieces, $word); 116 } 117 } 118 return(@pieces); 119} 120 121 122 123sub old_shellwords { 124 125 # Usage: 126 # use ParseWords; 127 # @words = old_shellwords($line); 128 # or 129 # @words = old_shellwords(@lines); 130 # or 131 # @words = old_shellwords(); # defaults to $_ (and clobbers it) 132 133 no warnings 'uninitialized'; # we will be testing undef strings 134 local *_ = \join('', @_) if @_; 135 my (@words, $snippet); 136 137 s/\A\s+//; 138 while ($_ ne '') { 139 my $field = substr($_, 0, 0); # leave results tainted 140 for (;;) { 141 if (s/\A"(([^"\\]|\\.)*)"//s) { 142 ($snippet = $1) =~ s#\\(.)#$1#sg; 143 } 144 elsif (/\A"/) { 145 require Carp; 146 Carp::carp("Unmatched double quote: $_"); 147 return(); 148 } 149 elsif (s/\A'(([^'\\]|\\.)*)'//s) { 150 ($snippet = $1) =~ s#\\(.)#$1#sg; 151 } 152 elsif (/\A'/) { 153 require Carp; 154 Carp::carp("Unmatched single quote: $_"); 155 return(); 156 } 157 elsif (s/\A\\(.?)//s) { 158 $snippet = $1; 159 } 160 elsif (s/\A([^\s\\'"]+)//) { 161 $snippet = $1; 162 } 163 else { 164 s/\A\s+//; 165 last; 166 } 167 $field .= $snippet; 168 } 169 push(@words, $field); 170 } 171 return @words; 172} 173 1741; 175 176__END__ 177 178=head1 NAME 179 180Text::ParseWords - parse text into an array of tokens or array of arrays 181 182=head1 SYNOPSIS 183 184 use Text::ParseWords; 185 @lists = nested_quotewords($delim, $keep, @lines); 186 @words = quotewords($delim, $keep, @lines); 187 @words = shellwords(@lines); 188 @words = parse_line($delim, $keep, $line); 189 @words = old_shellwords(@lines); # DEPRECATED! 190 191=head1 DESCRIPTION 192 193The &nested_quotewords() and "ewords() functions accept a delimiter 194(which can be a regular expression) 195and a list of lines and then breaks those lines up into a list of 196words ignoring delimiters that appear inside quotes. "ewords() 197returns all of the tokens in a single long list, while &nested_quotewords() 198returns a list of token lists corresponding to the elements of @lines. 199&parse_line() does tokenizing on a single string. The &*quotewords() 200functions simply call &parse_line(), so if you're only splitting 201one line you can call &parse_line() directly and save a function 202call. 203 204The $keep argument is a boolean flag. If true, then the tokens are 205split on the specified delimiter, but all other characters (including 206quotes and backslashes) are kept in the tokens. If $keep is false then the 207&*quotewords() functions remove all quotes and backslashes that are 208not themselves backslash-escaped or inside of single quotes (i.e., 209"ewords() tries to interpret these characters just like the Bourne 210shell). NB: these semantics are significantly different from the 211original version of this module shipped with Perl 5.000 through 5.004. 212As an additional feature, $keep may be the keyword "delimiters" which 213causes the functions to preserve the delimiters in each string as 214tokens in the token lists, in addition to preserving quote and 215backslash characters. 216 217&shellwords() is written as a special case of "ewords(), and it 218does token parsing with whitespace as a delimiter-- similar to most 219Unix shells. 220 221=head1 EXAMPLES 222 223The sample program: 224 225 use Text::ParseWords; 226 @words = quotewords('\s+', 0, q{this is "a test" of\ quotewords \"for you}); 227 $i = 0; 228 foreach (@words) { 229 print "$i: <$_>\n"; 230 $i++; 231 } 232 233produces: 234 235 0: <this> 236 1: <is> 237 2: <a test> 238 3: <of quotewords> 239 4: <"for> 240 5: <you> 241 242demonstrating: 243 244=over 4 245 246=item 0Z<> 247 248a simple word 249 250=item 1Z<> 251 252multiple spaces are skipped because of our $delim 253 254=item 2Z<> 255 256use of quotes to include a space in a word 257 258=item 3Z<> 259 260use of a backslash to include a space in a word 261 262=item 4Z<> 263 264use of a backslash to remove the special meaning of a double-quote 265 266=item 5Z<> 267 268another simple word (note the lack of effect of the 269backslashed double-quote) 270 271=back 272 273Replacing C<quotewords('\s+', 0, q{this is...})> 274with C<shellwords(q{this is...})> 275is a simpler way to accomplish the same thing. 276 277=head1 SEE ALSO 278 279L<Text::CSV> - for parsing CSV files 280 281=head1 AUTHORS 282 283Maintainer: Alexandr Ciornii <alexchornyATgmail.com>. 284 285Previous maintainer: Hal Pomeranz <pomeranz@netcom.com>, 1994-1997 (Original 286author unknown). Much of the code for &parse_line() (including the 287primary regexp) from Joerk Behrends <jbehrends@multimediaproduzenten.de>. 288 289Examples section another documentation provided by John Heidemann 290<johnh@ISI.EDU> 291 292Bug reports, patches, and nagging provided by lots of folks-- thanks 293everybody! Special thanks to Michael Schwern <schwern@envirolink.org> 294for assuring me that a &nested_quotewords() would be useful, and to 295Jeff Friedl <jfriedl@yahoo-inc.com> for telling me not to worry about 296error-checking (sort of-- you had to be there). 297 298=head1 COPYRIGHT AND LICENSE 299 300This library is free software; you may redistribute and/or modify it 301under the same terms as Perl itself. 302 303=cut 304