This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Update Text-ParseWords to CPAN version 3.29
[perl5.git] / cpan / Text-ParseWords / lib / Text / ParseWords.pm
CommitLineData
a0d0e21e
LW
1package Text::ParseWords;
2
9480d411
NC
3use strict;
4require 5.006;
33954ec3 5our $VERSION = "3.29";
a0d0e21e 6
dc848c6f 7
9b599b2a 8use Exporter;
9480d411
NC
9our @ISA = qw(Exporter);
10our @EXPORT = qw(shellwords quotewords nested_quotewords parse_line);
11our @EXPORT_OK = qw(old_shellwords);
12our $PERL_SINGLE_QUOTE;
a0d0e21e 13
a5f75d66 14
# shellwords(@lines)
#
# Split every line into whitespace-delimited tokens, honouring quotes and
# backslashes (via parse_line() with a '\s+' delimiter and $keep false), and
# return all tokens as one flat list.
#
# Returns the empty list as soon as parse_line() yields nothing for a line
# that is not empty after its leading whitespace has been removed.
sub shellwords {
    my @lines = @_;    # work on copies; leading whitespace is stripped below
    my @tokens;

    for my $text (@lines) {
        $text =~ s/^\s+//;
        my @found = parse_line('\s+', 0, $text);

        # parse_line() emits a trailing undef word when the line ends with
        # a delimiter; drop it so callers never see it.
        pop @found if @found && !defined $found[-1];

        # Nothing came back for a non-empty line: abandon the whole call.
        return() if !@found && length($text);

        push @tokens, @found;
    }

    return @tokens;
}
a5f75d66 28
a5f75d66 29
a5f75d66 30
9b599b2a
GS
# quotewords($delim, $keep, @lines)
#
# Run parse_line($delim, $keep, ...) over each line in turn and return all
# resulting tokens as a single flat list.
#
# Returns the empty list as soon as parse_line() yields nothing for a
# non-empty line.
sub quotewords {
    my ($delim, $keep, @lines) = @_;
    my @tokens;

    for my $text (@lines) {
        my @found = parse_line($delim, $keep, $text);

        # An empty result is only acceptable for an empty input line.
        return() if !@found && length($text);

        push @tokens, @found;
    }

    return @tokens;
}
a5f75d66 42
a5f75d66 43
a5f75d66 44
9b599b2a
GS
# nested_quotewords($delim, $keep, @lines)
#
# Like quotewords(), but keeps the tokens of each line separate: the return
# value is a list of array references, one per input line, in input order.
#
# Returns the empty list as soon as parse_line() yields nothing for a
# non-empty line.
sub nested_quotewords {
    my ($delim, $keep, @lines) = @_;
    my @token_lists;

    for my $text (@lines) {
        my @found = parse_line($delim, $keep, $text);

        # An empty result is only acceptable for an empty input line.
        return() if !@found && length($text);

        # Always store a fresh array reference, even when it is empty.
        push @token_lists, [@found];
    }

    return @token_lists;
}
55
56
a0d0e21e 57
9b599b2a
GS
# parse_line($delimiter, $keep, $line)
#
# Tokenise a single string.
#
#   $delimiter  regular expression (as a string) that separates tokens
#   $keep       false:        strip quotes and unescape backslashes
#               true:         keep quotes and backslashes in the tokens
#               'delimiters': as true, and also return each matched
#                             delimiter as its own token
#   $line       the text to split
#
# Returns the list of tokens.  A line that ends with a delimiter produces a
# trailing undef token.  Returns the empty list when the line cannot be
# consumed (for example an unterminated quote) or when an iteration matches
# nothing at all.
sub parse_line {
    my ($delimiter, $keep, $line) = @_;
    my ($word, @pieces);

    no warnings 'uninitialized';    # we will be testing undef strings

    while (length($line)) {
        # This pattern is optimised to be stack conservative on older perls.
        # Do not refactor without being careful and testing it on very long strings.
        # See Perl bug #42980 for an example of a stack busting input.
        $line =~ s/^
                    (?:
                        # double quoted string
                        (")                             # $quote
                        ((?>[^\\"]*(?:\\.[^\\"]*)*))"   # $quoted
                    |   # --OR--
                        # single quoted string
                        (')                             # $quote
                        ((?>[^\\']*(?:\\.[^\\']*)*))'   # $quoted
                    |   # --OR--
                        # unquoted string
                        (                               # $unquoted
                            (?:\\.|[^\\"'])*?
                        )
                        # followed by
                        (                               # $delim
                            \Z(?!\n)                    # EOL
                        |   # --OR--
                            (?-x:$delimiter)            # delimiter
                        |   # --OR--
                            (?!^)(?=["'])               # a quote
                        )
                    )//xs or return;                    # extended layout

        # Captures 1-2 belong to the double-quote branch, 3-4 to the
        # single-quote branch, 5-6 to the unquoted branch.
        my ($quote, $quoted, $unquoted, $delim) = (($1 ? ($1,$2) : ($3,$4)), $5, $6);

        # The pattern matched but consumed nothing useful: bail out rather
        # than loop on the same input.
        return() if !defined($quote) && !length($unquoted) && !length($delim);

        if ($keep) {
            # Put the surrounding quote characters back.
            $quoted = "$quote$quoted$quote";
        }
        else {
            $unquoted =~ s/\\(.)/$1/sg;
            if (defined $quote) {
                if ($quote eq '"') {
                    $quoted =~ s/\\(.)/$1/sg;
                }
                elsif ($PERL_SINGLE_QUOTE && $quote eq "'") {
                    # Perl-style single quotes: only \\ and \' are escapes.
                    $quoted =~ s/\\([\\'])/$1/g;
                }
            }
        }

        $word .= substr($line, 0, 0);    # leave results tainted
        $word .= defined $quote ? $quoted : $unquoted;

        # A delimiter closes the current word.
        if (length($delim)) {
            push @pieces, $word;
            push @pieces, $delim if $keep eq 'delimiters';
            undef $word;
        }

        # End of input: flush whatever is pending (undef if the line ended
        # on a delimiter).
        push @pieces, $word if !length($line);
    }

    return @pieces;
}
2304df62
AD
120
121
9b599b2a 122
a0d0e21e
LW
# old_shellwords(@lines)
#
# Legacy whitespace tokeniser.
#
# Usage:
#   use Text::ParseWords qw(old_shellwords);
#   @words = old_shellwords($line);
# or
#   @words = old_shellwords(@lines);
# or
#   @words = old_shellwords();    # defaults to $_ (and clobbers it)
#
# When arguments are given they are concatenated (no separator) and parsed
# as one string.  Double- and single-quoted sections and backslash escapes
# are honoured.  On an unmatched quote a warning is issued via Carp::carp
# and the empty list is returned.
sub old_shellwords {
    no warnings 'uninitialized';    # we will be testing undef strings

    # With arguments, parse a private copy through $_; without, consume the
    # caller's $_ directly.
    local *_ = \join('', @_) if @_;
    my @words;

    s/\A\s+//;
    while ($_ ne '') {
        my $field = substr($_, 0, 0);    # leave results tainted

        # Accumulate adjacent pieces until whitespace or end of input.
        while (1) {
            my $piece;

            if (s/\A"(([^"\\]|\\.)*)"//s) {
                $piece = $1;
                $piece =~ s#\\(.)#$1#sg;
            }
            elsif (/\A"/) {
                require Carp;
                Carp::carp("Unmatched double quote: $_");
                return();
            }
            elsif (s/\A'(([^'\\]|\\.)*)'//s) {
                $piece = $1;
                $piece =~ s#\\(.)#$1#sg;
            }
            elsif (/\A'/) {
                require Carp;
                Carp::carp("Unmatched single quote: $_");
                return();
            }
            elsif (s/\A\\(.?)//s) {
                # Backslash outside quotes: take the next character literally.
                $piece = $1;
            }
            elsif (s/\A([^\s\\'"]+)//) {
                $piece = $1;
            }
            else {
                # Whitespace or end of string terminates the current word.
                s/\A\s+//;
                last;
            }

            $field .= $piece;
        }

        push @words, $field;
    }

    return @words;
}
9b599b2a
GS
173
1741;
175
176__END__
177
178=head1 NAME
179
180Text::ParseWords - parse text into an array of tokens or array of arrays
181
182=head1 SYNOPSIS
183
184 use Text::ParseWords;
9480d411
NC
185 @lists = nested_quotewords($delim, $keep, @lines);
186 @words = quotewords($delim, $keep, @lines);
187 @words = shellwords(@lines);
188 @words = parse_line($delim, $keep, $line);
189 @words = old_shellwords(@lines); # DEPRECATED!
9b599b2a
GS
190
191=head1 DESCRIPTION
192
193The &nested_quotewords() and &quotewords() functions accept a delimiter
194(which can be a regular expression)
195and a list of lines, and then break those lines up into a list of
196words ignoring delimiters that appear inside quotes. &quotewords()
197returns all of the tokens in a single long list, while &nested_quotewords()
198returns a list of token lists corresponding to the elements of @lines.
199&parse_line() does tokenizing on a single string. The &*quotewords()
cf18bebb 200functions simply call &parse_line(), so if you're only splitting
201one line you can call &parse_line() directly and save a function
9b599b2a
GS
202call.
203
204The $keep argument is a boolean flag. If true, then the tokens are
205split on the specified delimiter, but all other characters (quotes,
206backslashes, etc.) are kept in the tokens. If $keep is false then the
207&*quotewords() functions remove all quotes and backslashes that are
208not themselves backslash-escaped or inside of single quotes (i.e.,
209&quotewords() tries to interpret these characters just like the Bourne
210shell). NB: these semantics are significantly different from the
211original version of this module shipped with Perl 5.000 through 5.004.
212As an additional feature, $keep may be the keyword "delimiters" which
213causes the functions to preserve the delimiters in each string as
214tokens in the token lists, in addition to preserving quote and
215backslash characters.
216
217&shellwords() is written as a special case of &quotewords(), and it
218does token parsing with whitespace as a delimiter-- similar to most
219Unix shells.
220
221=head1 EXAMPLES
222
223The sample program:
224
225 use Text::ParseWords;
9480d411 226 @words = quotewords('\s+', 0, q{this is "a test" of\ quotewords \"for you});
9b599b2a
GS
227 $i = 0;
228 foreach (@words) {
229 print "$i: <$_>\n";
230 $i++;
231 }
232
233produces:
234
235 0: <this>
236 1: <is>
237 2: <a test>
238 3: <of quotewords>
239 4: <"for>
240 5: <you>
241
242demonstrating:
243
244=over 4
245
246=item 0
551e1d92 247
9b599b2a
GS
248a simple word
249
250=item 1
551e1d92 251
9b599b2a
GS
252multiple spaces are skipped because of our $delim
253
254=item 2
551e1d92 255
9b599b2a
GS
256use of quotes to include a space in a word
257
258=item 3
551e1d92 259
9b599b2a
GS
260use of a backslash to include a space in a word
261
262=item 4
551e1d92 263
9b599b2a
GS
264use of a backslash to remove the special meaning of a double-quote
265
266=item 5
551e1d92 267
9b599b2a
GS
268another simple word (note the lack of effect of the
269backslashed double-quote)
270
271=back
272
9480d411
NC
273Replacing C<quotewords('\s+', 0, q{this is...})>
274with C<shellwords(q{this is...})>
9b599b2a
GS
275is a simpler way to accomplish the same thing.
276
41191e55
CBW
277=head1 SEE ALSO
278
279L<Text::CSV> - for parsing CSV files
280
9b599b2a
GS
281=head1 AUTHORS
282
9480d411
NC
283Maintainer: Alexandr Ciornii <alexchornyATgmail.com>.
284
285Previous maintainer: Hal Pomeranz <pomeranz@netcom.com>, 1994-1997 (Original
9b599b2a
GS
286author unknown). Much of the code for &parse_line() (including the
287primary regexp) from Joerk Behrends <jbehrends@multimediaproduzenten.de>.
288
289Examples section and other documentation provided by John Heidemann
290<johnh@ISI.EDU>
291
292Bug reports, patches, and nagging provided by lots of folks-- thanks
293everybody! Special thanks to Michael Schwern <schwern@envirolink.org>
294for assuring me that a &nested_quotewords() would be useful, and to
295Jeff Friedl <jfriedl@yahoo-inc.com> for telling me not to worry about
296error-checking (sort of-- you had to be there).
297
298=cut