This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
The new Archive::Tar tests are TODO on VMS for reasons unrelated
[perl5.git] / lib / Text / ParseWords.pm
CommitLineData
a0d0e21e
LW
1package Text::ParseWords;
2
b174585d 3use vars qw($VERSION @ISA @EXPORT $PERL_SINGLE_QUOTE);
f2a30bc9 4$VERSION = "3.26";
a0d0e21e 5
9b599b2a 6require 5.000;
dc848c6f 7
9b599b2a 8use Exporter;
dc848c6f 9@ISA = qw(Exporter);
9b599b2a 10@EXPORT = qw(shellwords quotewords nested_quotewords parse_line);
a0d0e21e
LW
11@EXPORT_OK = qw(old_shellwords);
12
a5f75d66 13
9b599b2a 14sub shellwords {
d5c14ab2
AT
15 my (@lines) = @_;
16 my @allwords;
17
18 foreach my $line (@lines) {
19 $line =~ s/^\s+//;
20 my @words = parse_line('\s+', 0, $line);
21 pop @words if (@words and !defined $words[-1]);
22 return() unless (@words || !length($line));
23 push(@allwords, @words);
24 }
25 return(@allwords);
9b599b2a 26}
a5f75d66 27
a5f75d66 28
a5f75d66 29
9b599b2a
GS
30sub quotewords {
31 my($delim, $keep, @lines) = @_;
32 my($line, @words, @allwords);
9b599b2a
GS
33
34 foreach $line (@lines) {
35 @words = parse_line($delim, $keep, $line);
36 return() unless (@words || !length($line));
37 push(@allwords, @words);
38 }
39 return(@allwords);
40}
a5f75d66 41
a5f75d66 42
a5f75d66 43
9b599b2a
GS
44sub nested_quotewords {
45 my($delim, $keep, @lines) = @_;
46 my($i, @allwords);
6a724e38 47
9b599b2a
GS
48 for ($i = 0; $i < @lines; $i++) {
49 @{$allwords[$i]} = parse_line($delim, $keep, $lines[$i]);
50 return() unless (@{$allwords[$i]} || !length($lines[$i]));
51 }
52 return(@allwords);
2304df62
AD
53}
54
55
a0d0e21e 56
9b599b2a
GS
57sub parse_line {
58 my($delimiter, $keep, $line) = @_;
429b060a 59 my($word, @pieces);
936c8837 60
6a724e38
AT
61 no warnings 'uninitialized'; # we will be testing undef strings
62
9b599b2a 63 while (length($line)) {
f2a30bc9
YO
64 # This pattern is optimised to be stack conservative on older perls.
65 # Do not refactor without being careful and testing it on very long strings.
66 # See Perl bug #42980 for an example of a stack busting input.
67 $line =~ s/^
68 (?:
69 # double quoted string
70 (") # $quote
71 ((?>[^\\"]*(?:\\.[^\\"]*)*))" # $quoted
72 | # --OR--
73 # singe quoted string
74 (') # $quote
75 ((?>[^\\']*(?:\\.[^\\']*)*))' # $quoted
76 | # --OR--
77 # unquoted string
78 ( # $unquoted
79 (?:\\.|[^\\"'])*?
80 )
81 # followed by
82 ( # $delim
83 \Z(?!\n) # EOL
84 | # --OR--
85 (?-x:$delimiter) # delimiter
86 | # --OR--
87 (?!^)(?=["']) # a quote
88 )
89 )//xs or return; # extended layout
90 my ($quote, $quoted, $unquoted, $delim) = (($1 ? ($1,$2) : ($3,$4)), $5, $6);
91
92
429b060a 93 return() unless( defined($quote) || length($unquoted) || length($delim));
936c8837 94
9b599b2a
GS
95 if ($keep) {
96 $quoted = "$quote$quoted$quote";
97 }
98 else {
429b060a 99 $unquoted =~ s/\\(.)/$1/sg;
167b9ebc 100 if (defined $quote) {
429b060a 101 $quoted =~ s/\\(.)/$1/sg if ($quote eq '"');
167b9ebc
JH
102 $quoted =~ s/\\([\\'])/$1/g if ( $PERL_SINGLE_QUOTE && $quote eq "'");
103 }
9b599b2a 104 }
6a724e38 105 $word .= substr($line, 0, 0); # leave results tainted
167b9ebc 106 $word .= defined $quote ? $quoted : $unquoted;
9b599b2a
GS
107
108 if (length($delim)) {
109 push(@pieces, $word);
110 push(@pieces, $delim) if ($keep eq 'delimiters');
111 undef $word;
112 }
113 if (!length($line)) {
114 push(@pieces, $word);
2304df62 115 }
2304df62 116 }
9b599b2a 117 return(@pieces);
2304df62 118}
2304df62
AD
119
120
9b599b2a 121
a0d0e21e
LW
122sub old_shellwords {
123
124 # Usage:
125 # use ParseWords;
126 # @words = old_shellwords($line);
127 # or
128 # @words = old_shellwords(@lines);
6a724e38
AT
129 # or
130 # @words = old_shellwords(); # defaults to $_ (and clobbers it)
a0d0e21e 131
6a724e38
AT
132 no warnings 'uninitialized'; # we will be testing undef strings
133 local *_ = \join('', @_) if @_;
134 my (@words, $snippet);
a0d0e21e 135
6a724e38 136 s/\A\s+//;
a0d0e21e 137 while ($_ ne '') {
6a724e38 138 my $field = substr($_, 0, 0); # leave results tainted
a0d0e21e 139 for (;;) {
6a724e38
AT
140 if (s/\A"(([^"\\]|\\.)*)"//s) {
141 ($snippet = $1) =~ s#\\(.)#$1#sg;
a0d0e21e 142 }
6a724e38
AT
143 elsif (/\A"/) {
144 require Carp;
145 Carp::carp("Unmatched double quote: $_");
9b599b2a 146 return();
a0d0e21e 147 }
6a724e38
AT
148 elsif (s/\A'(([^'\\]|\\.)*)'//s) {
149 ($snippet = $1) =~ s#\\(.)#$1#sg;
a0d0e21e 150 }
6a724e38
AT
151 elsif (/\A'/) {
152 require Carp;
153 Carp::carp("Unmatched single quote: $_");
9b599b2a 154 return();
a0d0e21e 155 }
9983eac8 156 elsif (s/\A\\(.?)//s) {
a0d0e21e
LW
157 $snippet = $1;
158 }
6a724e38 159 elsif (s/\A([^\s\\'"]+)//) {
a0d0e21e
LW
160 $snippet = $1;
161 }
162 else {
6a724e38 163 s/\A\s+//;
a0d0e21e
LW
164 last;
165 }
166 $field .= $snippet;
167 }
168 push(@words, $field);
169 }
6a724e38 170 return @words;
a0d0e21e 171}
9b599b2a
GS
172
1731;
174
175__END__
176
177=head1 NAME
178
179Text::ParseWords - parse text into an array of tokens or array of arrays
180
181=head1 SYNOPSIS
182
183 use Text::ParseWords;
184 @lists = &nested_quotewords($delim, $keep, @lines);
185 @words = &quotewords($delim, $keep, @lines);
186 @words = &shellwords(@lines);
187 @words = &parse_line($delim, $keep, $line);
188 @words = &old_shellwords(@lines); # DEPRECATED!
189
190=head1 DESCRIPTION
191
192The &nested_quotewords() and &quotewords() functions accept a delimiter
193(which can be a regular expression)
194and a list of lines and then breaks those lines up into a list of
195words ignoring delimiters that appear inside quotes. &quotewords()
196returns all of the tokens in a single long list, while &nested_quotewords()
197returns a list of token lists corresponding to the elements of @lines.
198&parse_line() does tokenizing on a single string. The &*quotewords()
cf18bebb
B
199functions simply call &parse_line(), so if you're only splitting
200one line you can call &parse_line() directly and save a function
9b599b2a
GS
201call.
202
203The $keep argument is a boolean flag. If true, then the tokens are
204split on the specified delimiter, but all other characters (quotes,
205backslashes, etc.) are kept in the tokens. If $keep is false then the
206&*quotewords() functions remove all quotes and backslashes that are
207not themselves backslash-escaped or inside of single quotes (i.e.,
208&quotewords() tries to interpret these characters just like the Bourne
209shell). NB: these semantics are significantly different from the
210original version of this module shipped with Perl 5.000 through 5.004.
211As an additional feature, $keep may be the keyword "delimiters" which
212causes the functions to preserve the delimiters in each string as
213tokens in the token lists, in addition to preserving quote and
214backslash characters.
215
216&shellwords() is written as a special case of &quotewords(), and it
217does token parsing with whitespace as a delimiter-- similar to most
218Unix shells.
219
220=head1 EXAMPLES
221
222The sample program:
223
224 use Text::ParseWords;
225 @words = &quotewords('\s+', 0, q{this is "a test" of\ quotewords \"for you});
226 $i = 0;
227 foreach (@words) {
228 print "$i: <$_>\n";
229 $i++;
230 }
231
232produces:
233
234 0: <this>
235 1: <is>
236 2: <a test>
237 3: <of quotewords>
238 4: <"for>
239 5: <you>
240
241demonstrating:
242
243=over 4
244
245=item 0
551e1d92 246
9b599b2a
GS
247a simple word
248
249=item 1
551e1d92 250
9b599b2a
GS
251multiple spaces are skipped because of our $delim
252
253=item 2
551e1d92 254
9b599b2a
GS
255use of quotes to include a space in a word
256
257=item 3
551e1d92 258
9b599b2a
GS
259use of a backslash to include a space in a word
260
261=item 4
551e1d92 262
9b599b2a
GS
263use of a backslash to remove the special meaning of a double-quote
264
265=item 5
551e1d92 266
9b599b2a
GS
267another simple word (note the lack of effect of the
268backslashed double-quote)
269
270=back
271
272Replacing C<&quotewords('\s+', 0, q{this is...})>
273with C<&shellwords(q{this is...})>
274is a simpler way to accomplish the same thing.
275
276=head1 AUTHORS
277
278Maintainer is Hal Pomeranz <pomeranz@netcom.com>, 1994-1997 (Original
279author unknown). Much of the code for &parse_line() (including the
280primary regexp) from Joerk Behrends <jbehrends@multimediaproduzenten.de>.
281
282Examples section another documentation provided by John Heidemann
283<johnh@ISI.EDU>
284
285Bug reports, patches, and nagging provided by lots of folks-- thanks
286everybody! Special thanks to Michael Schwern <schwern@envirolink.org>
287for assuring me that a &nested_quotewords() would be useful, and to
288Jeff Friedl <jfriedl@yahoo-inc.com> for telling me not to worry about
289error-checking (sort of-- you had to be there).
290
291=cut