1package Text::ParseWords; 2 3use vars qw($VERSION @ISA @EXPORT $PERL_SINGLE_QUOTE); 4$VERSION = "3.21"; 5 6require 5.000; 7 8use Exporter; 9@ISA = qw(Exporter); 10@EXPORT = qw(shellwords quotewords nested_quotewords parse_line); 11@EXPORT_OK = qw(old_shellwords); 12 13 14sub shellwords { 15 local(@lines) = @_; 16 $lines[$#lines] =~ s/\s+$//; 17 return(quotewords('\s+', 0, @lines)); 18} 19 20 21 22sub quotewords { 23 my($delim, $keep, @lines) = @_; 24 my($line, @words, @allwords); 25 26 27 foreach $line (@lines) { 28 @words = parse_line($delim, $keep, $line); 29 return() unless (@words || !length($line)); 30 push(@allwords, @words); 31 } 32 return(@allwords); 33} 34 35 36 37sub nested_quotewords { 38 my($delim, $keep, @lines) = @_; 39 my($i, @allwords); 40 41 for ($i = 0; $i < @lines; $i++) { 42 @{$allwords[$i]} = parse_line($delim, $keep, $lines[$i]); 43 return() unless (@{$allwords[$i]} || !length($lines[$i])); 44 } 45 return(@allwords); 46} 47 48 49 50sub parse_line { 51 # We will be testing undef strings 52 no warnings; 53 use re 'taint'; # if it's tainted, leave it as such 54 55 my($delimiter, $keep, $line) = @_; 56 my($quote, $quoted, $unquoted, $delim, $word, @pieces); 57 58 while (length($line)) { 59 60 ($quote, $quoted, undef, $unquoted, $delim, undef) = 61 $line =~ m/^(["']) # a $quote 62 ((?:\\.|(?!\1)[^\\])*) # and $quoted text 63 \1 # followed by the same quote 64 ([\000-\377]*) # and the rest 65 | # --OR-- 66 ^((?:\\.|[^\\"'])*?) # an $unquoted text 67 (\Z(?!\n)|(?-x:$delimiter)|(?!^)(?=["'])) 68 # plus EOL, delimiter, or quote 69 ([\000-\377]*) # the rest 70 /x; # extended layout 71 return() unless( $quote || length($unquoted) || length($delim)); 72 73 $line = $+; 74 75 if ($keep) { 76 $quoted = "$quote$quoted$quote"; 77 } 78 else { 79 $unquoted =~ s/\\(.)/$1/g; 80 if (defined $quote) { 81 $quoted =~ s/\\(.)/$1/g if ($quote eq '"'); 82 $quoted =~ s/\\([\\'])/$1/g if ( $PERL_SINGLE_QUOTE && $quote eq "'"); 83 } 84 } 85 $word .= defined $quote ? $quoted : $unquoted; 86 87 if (length($delim)) { 88 push(@pieces, $word); 89 push(@pieces, $delim) if ($keep eq 'delimiters'); 90 undef $word; 91 } 92 if (!length($line)) { 93 push(@pieces, $word); 94 } 95 } 96 return(@pieces); 97} 98 99 100 101sub old_shellwords { 102 103 # Usage: 104 # use ParseWords; 105 # @words = old_shellwords($line); 106 # or 107 # @words = old_shellwords(@lines); 108 109 local($_) = join('', @_); 110 my(@words,$snippet,$field); 111 112 s/^\s+//; 113 while ($_ ne '') { 114 $field = ''; 115 for (;;) { 116 if (s/^"(([^"\\]|\\.)*)"//) { 117 ($snippet = $1) =~ s#\\(.)#$1#g; 118 } 119 elsif (/^"/) { 120 return(); 121 } 122 elsif (s/^'(([^'\\]|\\.)*)'//) { 123 ($snippet = $1) =~ s#\\(.)#$1#g; 124 } 125 elsif (/^'/) { 126 return(); 127 } 128 elsif (s/^\\(.)//) { 129 $snippet = $1; 130 } 131 elsif (s/^([^\s\\'"]+)//) { 132 $snippet = $1; 133 } 134 else { 135 s/^\s+//; 136 last; 137 } 138 $field .= $snippet; 139 } 140 push(@words, $field); 141 } 142 @words; 143} 144 1451; 146 147__END__ 148 149=head1 NAME 150 151Text::ParseWords - parse text into an array of tokens or array of arrays 152 153=head1 SYNOPSIS 154 155 use Text::ParseWords; 156 @lists = &nested_quotewords($delim, $keep, @lines); 157 @words = "ewords($delim, $keep, @lines); 158 @words = &shellwords(@lines); 159 @words = &parse_line($delim, $keep, $line); 160 @words = &old_shellwords(@lines); # DEPRECATED! 161 162=head1 DESCRIPTION 163 164The &nested_quotewords() and "ewords() functions accept a delimiter 165(which can be a regular expression) 166and a list of lines and then breaks those lines up into a list of 167words ignoring delimiters that appear inside quotes. "ewords() 168returns all of the tokens in a single long list, while &nested_quotewords() 169returns a list of token lists corresponding to the elements of @lines. 170&parse_line() does tokenizing on a single string. The &*quotewords() 171functions simply call &parse_lines(), so if you're only splitting 172one line you can call &parse_lines() directly and save a function 173call. 174 175The $keep argument is a boolean flag. If true, then the tokens are 176split on the specified delimiter, but all other characters (quotes, 177backslashes, etc.) are kept in the tokens. If $keep is false then the 178&*quotewords() functions remove all quotes and backslashes that are 179not themselves backslash-escaped or inside of single quotes (i.e., 180"ewords() tries to interpret these characters just like the Bourne 181shell). NB: these semantics are significantly different from the 182original version of this module shipped with Perl 5.000 through 5.004. 183As an additional feature, $keep may be the keyword "delimiters" which 184causes the functions to preserve the delimiters in each string as 185tokens in the token lists, in addition to preserving quote and 186backslash characters. 187 188&shellwords() is written as a special case of "ewords(), and it 189does token parsing with whitespace as a delimiter-- similar to most 190Unix shells. 191 192=head1 EXAMPLES 193 194The sample program: 195 196 use Text::ParseWords; 197 @words = "ewords('\s+', 0, q{this is "a test" of\ quotewords \"for you}); 198 $i = 0; 199 foreach (@words) { 200 print "$i: <$_>\n"; 201 $i++; 202 } 203 204produces: 205 206 0: <this> 207 1: <is> 208 2: <a test> 209 3: <of quotewords> 210 4: <"for> 211 5: <you> 212 213demonstrating: 214 215=over 4 216 217=item 0 218 219a simple word 220 221=item 1 222 223multiple spaces are skipped because of our $delim 224 225=item 2 226 227use of quotes to include a space in a word 228 229=item 3 230 231use of a backslash to include a space in a word 232 233=item 4 234 235use of a backslash to remove the special meaning of a double-quote 236 237=item 5 238 239another simple word (note the lack of effect of the 240backslashed double-quote) 241 242=back 243 244Replacing C<"ewords('\s+', 0, q{this is...})> 245with C<&shellwords(q{this is...})> 246is a simpler way to accomplish the same thing. 247 248=head1 AUTHORS 249 250Maintainer is Hal Pomeranz <pomeranz@netcom.com>, 1994-1997 (Original 251author unknown). Much of the code for &parse_line() (including the 252primary regexp) from Joerk Behrends <jbehrends@multimediaproduzenten.de>. 253 254Examples section another documentation provided by John Heidemann 255<johnh@ISI.EDU> 256 257Bug reports, patches, and nagging provided by lots of folks-- thanks 258everybody! Special thanks to Michael Schwern <schwern@envirolink.org> 259for assuring me that a &nested_quotewords() would be useful, and to 260Jeff Friedl <jfriedl@yahoo-inc.com> for telling me not to worry about 261error-checking (sort of-- you had to be there). 262 263=cut 264