1b39c5158Smillertpackage Text::ParseWords; 2b39c5158Smillert 3b39c5158Smillertuse strict; 4*eac174f2Safresh1use warnings; 5b39c5158Smillertrequire 5.006; 6*eac174f2Safresh1our $VERSION = "3.31"; 7b39c5158Smillert 8b39c5158Smillert 9b39c5158Smillertuse Exporter; 10b39c5158Smillertour @ISA = qw(Exporter); 11b39c5158Smillertour @EXPORT = qw(shellwords quotewords nested_quotewords parse_line); 12b39c5158Smillertour @EXPORT_OK = qw(old_shellwords); 13b39c5158Smillertour $PERL_SINGLE_QUOTE; 14b39c5158Smillert 15b39c5158Smillert 16b39c5158Smillertsub shellwords { 17b39c5158Smillert my (@lines) = @_; 18b39c5158Smillert my @allwords; 19b39c5158Smillert 20b39c5158Smillert foreach my $line (@lines) { 21b39c5158Smillert $line =~ s/^\s+//; 22b39c5158Smillert my @words = parse_line('\s+', 0, $line); 23b39c5158Smillert pop @words if (@words and !defined $words[-1]); 24b39c5158Smillert return() unless (@words || !length($line)); 25b39c5158Smillert push(@allwords, @words); 26b39c5158Smillert } 27b39c5158Smillert return(@allwords); 28b39c5158Smillert} 29b39c5158Smillert 30b39c5158Smillert 31b39c5158Smillert 32b39c5158Smillertsub quotewords { 33b39c5158Smillert my($delim, $keep, @lines) = @_; 34b39c5158Smillert my($line, @words, @allwords); 35b39c5158Smillert 36b39c5158Smillert foreach $line (@lines) { 37b39c5158Smillert @words = parse_line($delim, $keep, $line); 38b39c5158Smillert return() unless (@words || !length($line)); 39b39c5158Smillert push(@allwords, @words); 40b39c5158Smillert } 41b39c5158Smillert return(@allwords); 42b39c5158Smillert} 43b39c5158Smillert 44b39c5158Smillert 45b39c5158Smillert 46b39c5158Smillertsub nested_quotewords { 47b39c5158Smillert my($delim, $keep, @lines) = @_; 48b39c5158Smillert my($i, @allwords); 49b39c5158Smillert 50b39c5158Smillert for ($i = 0; $i < @lines; $i++) { 51b39c5158Smillert @{$allwords[$i]} = parse_line($delim, $keep, $lines[$i]); 52b39c5158Smillert return() unless (@{$allwords[$i]} || !length($lines[$i])); 53b39c5158Smillert } 54b39c5158Smillert return(@allwords); 55b39c5158Smillert} 56b39c5158Smillert 57b39c5158Smillert 58b39c5158Smillert 59b39c5158Smillertsub parse_line { 60b39c5158Smillert my($delimiter, $keep, $line) = @_; 61b39c5158Smillert my($word, @pieces); 62b39c5158Smillert 63b39c5158Smillert no warnings 'uninitialized'; # we will be testing undef strings 64b39c5158Smillert 65b39c5158Smillert while (length($line)) { 66b39c5158Smillert # This pattern is optimised to be stack conservative on older perls. 67b39c5158Smillert # Do not refactor without being careful and testing it on very long strings. 68b39c5158Smillert # See Perl bug #42980 for an example of a stack busting input. 69b39c5158Smillert $line =~ s/^ 70b39c5158Smillert (?: 71b39c5158Smillert # double quoted string 72b39c5158Smillert (") # $quote 73b39c5158Smillert ((?>[^\\"]*(?:\\.[^\\"]*)*))" # $quoted 74b39c5158Smillert | # --OR-- 75b39c5158Smillert # singe quoted string 76b39c5158Smillert (') # $quote 77b39c5158Smillert ((?>[^\\']*(?:\\.[^\\']*)*))' # $quoted 78b39c5158Smillert | # --OR-- 79b39c5158Smillert # unquoted string 80b39c5158Smillert ( # $unquoted 81b39c5158Smillert (?:\\.|[^\\"'])*? 82b39c5158Smillert ) 83b39c5158Smillert # followed by 84b39c5158Smillert ( # $delim 85b39c5158Smillert \Z(?!\n) # EOL 86b39c5158Smillert | # --OR-- 87b39c5158Smillert (?-x:$delimiter) # delimiter 88b39c5158Smillert | # --OR-- 89b39c5158Smillert (?!^)(?=["']) # a quote 90b39c5158Smillert ) 91b39c5158Smillert )//xs or return; # extended layout 92b39c5158Smillert my ($quote, $quoted, $unquoted, $delim) = (($1 ? ($1,$2) : ($3,$4)), $5, $6); 93b39c5158Smillert 94b39c5158Smillert 95b39c5158Smillert return() unless( defined($quote) || length($unquoted) || length($delim)); 96b39c5158Smillert 97b39c5158Smillert if ($keep) { 98b39c5158Smillert $quoted = "$quote$quoted$quote"; 99b39c5158Smillert } 100b39c5158Smillert else { 101b39c5158Smillert $unquoted =~ s/\\(.)/$1/sg; 102b39c5158Smillert if (defined $quote) { 103b39c5158Smillert $quoted =~ s/\\(.)/$1/sg if ($quote eq '"'); 104b39c5158Smillert $quoted =~ s/\\([\\'])/$1/g if ( $PERL_SINGLE_QUOTE && $quote eq "'"); 105b39c5158Smillert } 106b39c5158Smillert } 107b39c5158Smillert $word .= substr($line, 0, 0); # leave results tainted 108b39c5158Smillert $word .= defined $quote ? $quoted : $unquoted; 109b39c5158Smillert 110b39c5158Smillert if (length($delim)) { 111b39c5158Smillert push(@pieces, $word); 112b39c5158Smillert push(@pieces, $delim) if ($keep eq 'delimiters'); 113b39c5158Smillert undef $word; 114b39c5158Smillert } 115b39c5158Smillert if (!length($line)) { 116b39c5158Smillert push(@pieces, $word); 117b39c5158Smillert } 118b39c5158Smillert } 119b39c5158Smillert return(@pieces); 120b39c5158Smillert} 121b39c5158Smillert 122b39c5158Smillert 123b39c5158Smillert 124b39c5158Smillertsub old_shellwords { 125b39c5158Smillert 126b39c5158Smillert # Usage: 127b39c5158Smillert # use ParseWords; 128b39c5158Smillert # @words = old_shellwords($line); 129b39c5158Smillert # or 130b39c5158Smillert # @words = old_shellwords(@lines); 131b39c5158Smillert # or 132b39c5158Smillert # @words = old_shellwords(); # defaults to $_ (and clobbers it) 133b39c5158Smillert 134b39c5158Smillert no warnings 'uninitialized'; # we will be testing undef strings 135b39c5158Smillert local *_ = \join('', @_) if @_; 136b39c5158Smillert my (@words, $snippet); 137b39c5158Smillert 138b39c5158Smillert s/\A\s+//; 139b39c5158Smillert while ($_ ne '') { 140b39c5158Smillert my $field = substr($_, 0, 0); # leave results tainted 141b39c5158Smillert for (;;) { 142b39c5158Smillert if (s/\A"(([^"\\]|\\.)*)"//s) { 143b39c5158Smillert ($snippet = $1) =~ s#\\(.)#$1#sg; 144b39c5158Smillert } 145b39c5158Smillert elsif (/\A"/) { 146b39c5158Smillert require Carp; 147b39c5158Smillert Carp::carp("Unmatched double quote: $_"); 148b39c5158Smillert return(); 149b39c5158Smillert } 150b39c5158Smillert elsif (s/\A'(([^'\\]|\\.)*)'//s) { 151b39c5158Smillert ($snippet = $1) =~ s#\\(.)#$1#sg; 152b39c5158Smillert } 153b39c5158Smillert elsif (/\A'/) { 154b39c5158Smillert require Carp; 155b39c5158Smillert Carp::carp("Unmatched single quote: $_"); 156b39c5158Smillert return(); 157b39c5158Smillert } 158b39c5158Smillert elsif (s/\A\\(.?)//s) { 159b39c5158Smillert $snippet = $1; 160b39c5158Smillert } 161b39c5158Smillert elsif (s/\A([^\s\\'"]+)//) { 162b39c5158Smillert $snippet = $1; 163b39c5158Smillert } 164b39c5158Smillert else { 165b39c5158Smillert s/\A\s+//; 166b39c5158Smillert last; 167b39c5158Smillert } 168b39c5158Smillert $field .= $snippet; 169b39c5158Smillert } 170b39c5158Smillert push(@words, $field); 171b39c5158Smillert } 172b39c5158Smillert return @words; 173b39c5158Smillert} 174b39c5158Smillert 175b39c5158Smillert1; 176b39c5158Smillert 177b39c5158Smillert__END__ 178b39c5158Smillert 179b39c5158Smillert=head1 NAME 180b39c5158Smillert 181b39c5158SmillertText::ParseWords - parse text into an array of tokens or array of arrays 182b39c5158Smillert 183b39c5158Smillert=head1 SYNOPSIS 184b39c5158Smillert 185b39c5158Smillert use Text::ParseWords; 186b39c5158Smillert @lists = nested_quotewords($delim, $keep, @lines); 187b39c5158Smillert @words = quotewords($delim, $keep, @lines); 188b39c5158Smillert @words = shellwords(@lines); 189b39c5158Smillert @words = parse_line($delim, $keep, $line); 190b39c5158Smillert @words = old_shellwords(@lines); # DEPRECATED! 191b39c5158Smillert 192b39c5158Smillert=head1 DESCRIPTION 193b39c5158Smillert 194*eac174f2Safresh1The C<nested_quotewords()> and C<quotewords()> functions accept a delimiter 195b39c5158Smillert(which can be a regular expression) 196b39c5158Smillertand a list of lines and then breaks those lines up into a list of 197*eac174f2Safresh1words ignoring delimiters that appear inside quotes. C<quotewords()> 198*eac174f2Safresh1returns all of the tokens in a single long list, while C<nested_quotewords()> 199*eac174f2Safresh1returns a list of token lists corresponding to the elements of C<@lines>. 200*eac174f2Safresh1C<parse_line()> does tokenizing on a single string. The C<*quotewords()> 201*eac174f2Safresh1functions simply call C<parse_line()>, so if you're only splitting 202*eac174f2Safresh1one line you can call C<parse_line()> directly and save a function 203b39c5158Smillertcall. 204b39c5158Smillert 205*eac174f2Safresh1The C<$keep> controls what happens with delimters and special characters: 206*eac174f2Safresh1 207*eac174f2Safresh1=over 4 208*eac174f2Safresh1 209*eac174f2Safresh1=item true 210*eac174f2Safresh1 211*eac174f2Safresh1If true, then the tokens are split on the specified delimiter, 212*eac174f2Safresh1but all other characters (including quotes and backslashes) 213*eac174f2Safresh1are kept in the tokens. 214*eac174f2Safresh1 215*eac174f2Safresh1=item false 216*eac174f2Safresh1 217*eac174f2Safresh1If $keep is false then the C<*quotewords()> functions 218*eac174f2Safresh1remove all quotes and backslashes that are 219b39c5158Smillertnot themselves backslash-escaped or inside of single quotes (i.e., 220*eac174f2Safresh1C<quotewords()> tries to interpret these characters just like the Bourne 221b39c5158Smillertshell). NB: these semantics are significantly different from the 222b39c5158Smillertoriginal version of this module shipped with Perl 5.000 through 5.004. 223*eac174f2Safresh1 224*eac174f2Safresh1=item C<"delimiters"> 225*eac174f2Safresh1 226b39c5158SmillertAs an additional feature, $keep may be the keyword "delimiters" which 227b39c5158Smillertcauses the functions to preserve the delimiters in each string as 228b39c5158Smillerttokens in the token lists, in addition to preserving quote and 229b39c5158Smillertbackslash characters. 230b39c5158Smillert 231*eac174f2Safresh1=back 232*eac174f2Safresh1 233*eac174f2Safresh1C<shellwords()> is written as a special case of C<quotewords()>, and it 234b39c5158Smillertdoes token parsing with whitespace as a delimiter-- similar to most 235b39c5158SmillertUnix shells. 236b39c5158Smillert 237b39c5158Smillert=head1 EXAMPLES 238b39c5158Smillert 239b39c5158SmillertThe sample program: 240b39c5158Smillert 241b39c5158Smillert use Text::ParseWords; 242b39c5158Smillert @words = quotewords('\s+', 0, q{this is "a test" of\ quotewords \"for you}); 243b39c5158Smillert $i = 0; 244b39c5158Smillert foreach (@words) { 245b39c5158Smillert print "$i: <$_>\n"; 246b39c5158Smillert $i++; 247b39c5158Smillert } 248b39c5158Smillert 249b39c5158Smillertproduces: 250b39c5158Smillert 251b39c5158Smillert 0: <this> 252b39c5158Smillert 1: <is> 253b39c5158Smillert 2: <a test> 254b39c5158Smillert 3: <of quotewords> 255b39c5158Smillert 4: <"for> 256b39c5158Smillert 5: <you> 257b39c5158Smillert 258b39c5158Smillertdemonstrating: 259b39c5158Smillert 260b39c5158Smillert=over 4 261b39c5158Smillert 262b8851fccSafresh1=item 0Z<> 263b39c5158Smillert 264b39c5158Smillerta simple word 265b39c5158Smillert 266b8851fccSafresh1=item 1Z<> 267b39c5158Smillert 268b39c5158Smillertmultiple spaces are skipped because of our $delim 269b39c5158Smillert 270b8851fccSafresh1=item 2Z<> 271b39c5158Smillert 272b39c5158Smillertuse of quotes to include a space in a word 273b39c5158Smillert 274b8851fccSafresh1=item 3Z<> 275b39c5158Smillert 276b39c5158Smillertuse of a backslash to include a space in a word 277b39c5158Smillert 278b8851fccSafresh1=item 4Z<> 279b39c5158Smillert 280b39c5158Smillertuse of a backslash to remove the special meaning of a double-quote 281b39c5158Smillert 282b8851fccSafresh1=item 5Z<> 283b39c5158Smillert 284b39c5158Smillertanother simple word (note the lack of effect of the 285b39c5158Smillertbackslashed double-quote) 286b39c5158Smillert 287b39c5158Smillert=back 288b39c5158Smillert 289b39c5158SmillertReplacing C<quotewords('\s+', 0, q{this is...})> 290b39c5158Smillertwith C<shellwords(q{this is...})> 291b39c5158Smillertis a simpler way to accomplish the same thing. 292b39c5158Smillert 29391f110e0Safresh1=head1 SEE ALSO 29491f110e0Safresh1 29591f110e0Safresh1L<Text::CSV> - for parsing CSV files 29691f110e0Safresh1 297b39c5158Smillert=head1 AUTHORS 298b39c5158Smillert 299*eac174f2Safresh1The original author is unknown, 300*eac174f2Safresh1but presumably this evolved from C<shellwords.pl> in Perl 4. 301b39c5158Smillert 302*eac174f2Safresh1Much of the code for C<parse_line()> 303*eac174f2Safresh1(including the primary regexp) 304*eac174f2Safresh1came from Joerk Behrends E<lt>jbehrends@multimediaproduzenten.deE<gt>. 305b39c5158Smillert 306*eac174f2Safresh1Examples section and other documentation provided by 307*eac174f2Safresh1John Heidemann E<lt>johnh@ISI.EDUE<gt>. 308b39c5158Smillert 309*eac174f2Safresh1Hal Pomeranz E<lt>pomeranz@netcom.comE<gt> 310*eac174f2Safresh1maintained this from 1994 through 1999, 311*eac174f2Safresh1and did the first CPAN release. 312*eac174f2Safresh1 313*eac174f2Safresh1Alexandr Ciornii E<lt>alexchornyATgmail.comE<gt> 314*eac174f2Safresh1maintained this from 2008 to 2015. 315*eac174f2Safresh1 316*eac174f2Safresh1Many other people have contributed, 317*eac174f2Safresh1with special thanks due to 318*eac174f2Safresh1Michael Schwern E<lt>schwern@envirolink.orgE<gt> 319*eac174f2Safresh1and 320*eac174f2Safresh1Jeff Friedl E<lt>jfriedl@yahoo-inc.comE<gt>. 321b39c5158Smillert 322b8851fccSafresh1=head1 COPYRIGHT AND LICENSE 323b8851fccSafresh1 324b8851fccSafresh1This library is free software; you may redistribute and/or modify it 325b8851fccSafresh1under the same terms as Perl itself. 326b8851fccSafresh1 327b39c5158Smillert=cut 328