package Text::ParseWords;

use strict;

# "no warnings" (used in parse_line) already requires perl 5.6.
require 5.006;

our($VERSION, @ISA, @EXPORT, @EXPORT_OK, $PERL_SINGLE_QUOTE);
$VERSION = "3.21";

use Exporter;
@ISA = qw(Exporter);
@EXPORT = qw(shellwords quotewords nested_quotewords parse_line);
@EXPORT_OK = qw(old_shellwords);


# shellwords(@lines)
#
# Split each line on whitespace the way a Bourne-style shell would:
# quotes group words and backslashes escape the following character.
# Returns the flat list of words from all lines, or an empty list if
# any line cannot be parsed (e.g. unbalanced quotes).  Called with no
# arguments it returns an empty list.
sub shellwords {
    my @lines = @_;             # private copy; callers' strings are untouched
    my @allwords;

    foreach my $line (@lines) {
        # Leading whitespace would otherwise produce an empty first word.
        $line =~ s/^\s+//;
        my @words = parse_line('\s+', 0, $line);

        # parse_line() emits a trailing undef piece when the line ends
        # with a delimiter; trailing whitespace must not create a word.
        pop @words if (@words and !defined $words[-1]);

        return() unless (@words || !length($line));
        push(@allwords, @words);
    }
    return(@allwords);
}



# quotewords($delim, $keep, @lines)
#
# Split every line on the regular expression $delim, honouring quotes
# and backslashes, and return all tokens as one flat list.  Returns an
# empty list as soon as a non-empty line fails to parse.  $keep is
# passed through to parse_line().
sub quotewords {
    my($delim, $keep, @lines) = @_;
    my @allwords;

    foreach my $line (@lines) {
        my @words = parse_line($delim, $keep, $line);
        return() unless (@words || !length($line));
        push(@allwords, @words);
    }
    return(@allwords);
}



# nested_quotewords($delim, $keep, @lines)
#
# Like quotewords(), but returns one array reference per input line,
# each holding that line's tokens.  Returns an empty list as soon as a
# non-empty line fails to parse.
sub nested_quotewords {
    my($delim, $keep, @lines) = @_;
    my @allwords;

    for my $i (0 .. $#lines) {
        @{$allwords[$i]} = parse_line($delim, $keep, $lines[$i]);
        return() unless (@{$allwords[$i]} || !length($lines[$i]));
    }
    return(@allwords);
}



# parse_line($delimiter, $keep, $line)
#
# Tokenize a single string.  $delimiter is a regular expression.
# If $keep is false, quotes and escaping backslashes are removed from
# the tokens; if true they are kept; if it is the string 'delimiters'
# the matched delimiters are returned as tokens as well.
# Returns the list of tokens, or an empty list on a parse failure
# (unbalanced quote, dangling backslash, or a delimiter that matches
# the empty string at the current position).
sub parse_line {
    # We will be testing undef strings
    no warnings;
    use re 'taint';     # if it's tainted, leave it as such

    my($delimiter, $keep, $line) = @_;
    my($quote, $quoted, $rest_q, $unquoted, $delim, $rest_u, $word, @pieces);

    while (length($line)) {

        # "(?s:.)" lets a backslash escape a newline too, and "(?s:.*)"
        # captures the whole remainder including wide characters (a
        # [\000-\377]* class would stop at the first code point > 255
        # and silently drop the rest of the line).  The s flag is
        # scoped so that it never leaks into the caller's delimiter.
        ($quote, $quoted, $rest_q, $unquoted, $delim, $rest_u) =
            $line =~ m/^(["'])                          # an opening quote
                        ((?:\\(?s:.)|(?!\1)[^\\])*)     # and the quoted text
                        \1                              # followed by the same quote
                        ((?s:.*))                       # and the rest
                      |                                 # --OR--
                        ^((?:\\(?s:.)|[^\\"'])*?)       # an unquoted text
                        (\Z(?!\n)|(?-x:$delimiter)|(?!^)(?=["']))
                                                        # plus EOL, delimiter, or quote
                        ((?s:.*))                       # the rest
                      /x;                               # extended layout

        # No match at all, or a match that consumed nothing: give up
        # rather than loop forever.
        return() unless (defined($quote) || length($unquoted) || length($delim));

        $line = defined($quote) ? $rest_q : $rest_u;

        if ($keep) {
            $quoted = "$quote$quoted$quote";
        }
        else {
            $unquoted =~ s/\\(.)/$1/sg;
            if (defined $quote) {
                $quoted =~ s/\\(.)/$1/sg if ($quote eq '"');
                $quoted =~ s/\\([\\'])/$1/g if ( $PERL_SINGLE_QUOTE && $quote eq "'");
            }
        }
        # Adjacent quoted and unquoted runs accumulate into one word
        # until a delimiter (or the end of the line) is reached.
        $word .= defined $quote ? $quoted : $unquoted;

        if (length($delim)) {
            push(@pieces, $word);
            push(@pieces, $delim) if ($keep eq 'delimiters');
            undef $word;
        }
        if (!length($line)) {
            push(@pieces, $word);
        }
    }
    return(@pieces);
}



# old_shellwords(@lines)                                  DEPRECATED
#
# Joins its arguments into one string and splits it into shell-like
# words.  Returns the list of words, or an empty list on an unmatched
# quote.
sub old_shellwords {

    # Usage:
    #   use Text::ParseWords qw(old_shellwords);
    #   @words = old_shellwords($line);
    #   or
    #   @words = old_shellwords(@lines);

    local($_) = join('', @_);
    my(@words, $snippet, $field);

    s/\A\s+//;
    while ($_ ne '') {
        $field = '';
        for (;;) {
            if (s/\A"(([^"\\]|\\.)*)"//s) {
                ($snippet = $1) =~ s#\\(.)#$1#sg;
            }
            elsif (/\A"/) {
                return();               # unmatched double quote
            }
            elsif (s/\A'(([^'\\]|\\.)*)'//s) {
                ($snippet = $1) =~ s#\\(.)#$1#sg;
            }
            elsif (/\A'/) {
                return();               # unmatched single quote
            }
            # ".?" with the s flag: a backslash is always consumed, even
            # when it is the last character or precedes a newline.
            # Otherwise nothing below would consume it and the outer
            # loop would never terminate.
            elsif (s/\A\\(.?)//s) {
                $snippet = $1;
            }
            elsif (s/\A([^\s\\'"]+)//) {
                $snippet = $1;
            }
            else {
                s/\A\s+//;
                last;
            }
            $field .= $snippet;
        }
        push(@words, $field);
    }
    return @words;
}

1;

__END__

=head1 NAME

Text::ParseWords - parse text into an array of tokens or array of arrays

=head1 SYNOPSIS

  use Text::ParseWords;
  @lists = &nested_quotewords($delim, $keep, @lines);
  @words = &quotewords($delim, $keep, @lines);
  @words = &shellwords(@lines);
  @words = &parse_line($delim, $keep, $line);
  @words = &old_shellwords(@lines); # DEPRECATED!

=head1 DESCRIPTION

The &nested_quotewords() and &quotewords() functions accept a delimiter
(which can be a regular expression)
and a list of lines and then break those lines up into a list of
words ignoring delimiters that appear inside quotes.  &quotewords()
returns all of the tokens in a single long list, while &nested_quotewords()
returns a list of token lists corresponding to the elements of @lines.
&parse_line() does tokenizing on a single string.  The &*quotewords()
functions simply call &parse_line(), so if you're only splitting
one line you can call &parse_line() directly and save a function
call.

The $keep argument is a boolean flag.
If true, then the tokens are
split on the specified delimiter, but all other characters (quotes,
backslashes, etc.) are kept in the tokens.  If $keep is false then the
&*quotewords() functions remove all quotes and backslashes that are
not themselves backslash-escaped or inside of single quotes (i.e.,
&quotewords() tries to interpret these characters just like the Bourne
shell).  NB: these semantics are significantly different from the
original version of this module shipped with Perl 5.000 through 5.004.
As an additional feature, $keep may be the keyword "delimiters" which
causes the functions to preserve the delimiters in each string as
tokens in the token lists, in addition to preserving quote and
backslash characters.

&shellwords() is written as a special case of &quotewords(), and it
does token parsing with whitespace as a delimiter-- similar to most
Unix shells.

=head1 EXAMPLES

The sample program:

  use Text::ParseWords;
  @words = &quotewords('\s+', 0, q{this   is "a test" of\ quotewords \"for you});
  $i = 0;
  foreach (@words) {
      print "$i: <$_>\n";
      $i++;
  }

produces:

  0: <this>
  1: <is>
  2: <a test>
  3: <of quotewords>
  4: <"for>
  5: <you>

demonstrating:

=over 4

=item 0

a simple word

=item 1

multiple spaces are skipped because of our $delim

=item 2

use of quotes to include a space in a word

=item 3

use of a backslash to include a space in a word

=item 4

use of a backslash to remove the special meaning of a double-quote

=item 5

another simple word (note the lack of effect of the
backslashed double-quote)

=back

Replacing C<&quotewords('\s+', 0, q{this   is...})>
with
C<&shellwords(q{this   is...})>
is a simpler way to accomplish the same thing.

=head1 AUTHORS

Maintainer is Hal Pomeranz <pomeranz@netcom.com>, 1994-1997 (Original
author unknown).  Much of the code for &parse_line() (including the
primary regexp) from Joerk Behrends <jbehrends@multimediaproduzenten.de>.

Examples section and other documentation provided by John Heidemann
<johnh@ISI.EDU>

Bug reports, patches, and nagging provided by lots of folks-- thanks
everybody!  Special thanks to Michael Schwern <schwern@envirolink.org>
for assuring me that a &nested_quotewords() would be useful, and to
Jeff Friedl <jfriedl@yahoo-inc.com> for telling me not to worry about
error-checking (sort of-- you had to be there).

=cut