This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
VMS updates from Dan Sugalski <sugalskd@osshe.edu>
[perl5.git] / lib / Text / ParseWords.pm
CommitLineData
a0d0e21e
LW
1package Text::ParseWords;
2
b174585d
MH
3use vars qw($VERSION @ISA @EXPORT $PERL_SINGLE_QUOTE);
4$VERSION = "3.1";
a0d0e21e 5
9b599b2a 6require 5.000;
dc848c6f 7
9b599b2a 8use Exporter;
dc848c6f 9@ISA = qw(Exporter);
9b599b2a 10@EXPORT = qw(shellwords quotewords nested_quotewords parse_line);
a0d0e21e
LW
11@EXPORT_OK = qw(old_shellwords);
12
a5f75d66 13
9b599b2a
GS
# shellwords(@lines)
#
# Split @lines into words on runs of whitespace, honouring quotes and
# backslash escapes, by delegating to quotewords('\s+', 0, ...).
# Trailing whitespace is stripped from the last line before parsing.
# Returns the list of words (whatever quotewords() returns).
sub shellwords {
    # Lexical copy: the caller's strings are not modified and no package
    # global named @lines is clobbered (the old code used local()).
    my(@lines) = @_;

    # Only touch the last element when there is one; $lines[-1] on an
    # empty array is not a modifiable value.
    $lines[$#lines] =~ s/\s+$// if @lines;

    return(quotewords('\s+', 0, @lines));
}
a5f75d66 19
a5f75d66 20
a5f75d66 21
9b599b2a
GS
# quotewords($delim, $keep, @lines)
#
# Tokenize every element of @lines with parse_line() and return all of
# the resulting tokens as one flat list.  If any non-empty line yields
# no tokens, the whole call gives up and returns an empty list.
sub quotewords {
    my($delim, $keep, @lines) = @_;
    my @collected;

    foreach my $text (@lines) {
        my @tokens = parse_line($delim, $keep, $text);

        # No tokens from a line that has content means the parse failed.
        if (!@tokens && length($text)) {
            return();
        }

        push(@collected, @tokens);
    }

    return(@collected);
}
a5f75d66 34
a5f75d66 35
a5f75d66 36
9b599b2a
GS
# nested_quotewords($delim, $keep, @lines)
#
# Tokenize every element of @lines with parse_line() and return a list
# of array references, one per input line, in input order.  If any
# non-empty line yields no tokens, an empty list is returned instead.
sub nested_quotewords {
    my($delim, $keep, @lines) = @_;
    my @token_lists;

    foreach my $text (@lines) {
        my @tokens = parse_line($delim, $keep, $text);

        # No tokens from a line that has content means the parse failed.
        return() if (!@tokens && length($text));

        push(@token_lists, [ @tokens ]);
    }

    return(@token_lists);
}
47
48
a0d0e21e 49
# parse_line($delimiter, $keep, $line)
#
# Split the single string $line into tokens.
#
#   $delimiter  regular expression (as a string) that separates tokens
#   $keep       false:        quotes are removed and backslash escapes are
#                             resolved in unquoted and double-quoted text;
#                             single-quoted text is only unescaped (\\ and
#                             \') when $Text::ParseWords::PERL_SINGLE_QUOTE
#                             is true
#               true:         quotes and backslashes are left in the tokens
#               'delimiters': as true, and each matched delimiter is also
#                             returned as a token of its own
#
# Returns the list of tokens, or an empty list if the line cannot be
# parsed (for example an unbalanced quote).  A line that ends in a
# delimiter produces a trailing undef element.
sub parse_line {
    # We will be testing undef strings
    local($^W) = 0;

    my($delimiter, $keep, $line) = @_;
    my($quote, $quoted, $unquoted, $delim, $word, @pieces);

    while (length($line)) {

        # The delimiter is wrapped in (?-x:...) so that whitespace or a
        # hash sign supplied by the caller stays literal instead of being
        # swallowed by the extended layout of the surrounding pattern.
        ($quote, $quoted, undef, $unquoted, $delim, undef) =
            $line =~ m/^(["'])                  # a quote character
                        ((?:\\.|(?!\1)[^\\])*)  # and the quoted text
                        \1                      # followed by the same quote
                        ([\000-\377]*)          # and the rest
                       |                        # --OR--
                       ^((?:\\.|[^\\"'])*?)     # an unquoted text
                        (\Z(?!\n)|(?-x:$delimiter)|(?!^)(?=["']))
                                                # plus EOL, delimiter, or quote
                        ([\000-\377]*)          # the rest
                       /x;                      # extended layout

        # Nothing matched, or nothing was consumed: give up on the line.
        return() unless( $quote || length($unquoted) || length($delim));

        # The last group that matched is "the rest" of whichever
        # alternative succeeded; continue parsing from there.
        $line = $+;

        if ($keep) {
            $quoted = "$quote$quoted$quote";
        }
        else {
            $unquoted =~ s/\\(.)/$1/g;
            $quoted =~ s/\\(.)/$1/g if ($quote eq '"');
            $quoted =~ s/\\([\\'])/$1/g
                if ( $Text::ParseWords::PERL_SINGLE_QUOTE && $quote eq "'");
        }

        # Adjacent quoted and unquoted pieces accumulate into one token.
        $word .= ($quote) ? $quoted : $unquoted;

        if (length($delim)) {
            push(@pieces, $word);
            push(@pieces, $delim) if ($keep eq 'delimiters');
            undef $word;
        }
        if (!length($line)) {
            push(@pieces, $word);
        }
    }
    return(@pieces);
}
2304df62
AD
95
96
9b599b2a 97
a0d0e21e
LW
# old_shellwords(@lines)
#
# Usage:
#   use ParseWords;
#   @words = old_shellwords($line);
#   or
#   @words = old_shellwords(@lines);
#
# Joins its arguments into one string and splits it into words on
# whitespace.  Double- and single-quoted sections and backslash-escaped
# characters become part of the current word, with the quotes and
# escaping backslashes removed.  Returns the list of words, or an empty
# list when a quote is left unterminated.
sub old_shellwords {
    local($_) = join('', @_);
    my(@words,$snippet,$field);

    s/^\s+//;
    while ($_ ne '') {
        $field = '';
        for (;;) {
            if (s/^"(([^"\\]|\\.)*)"//) {
                ($snippet = $1) =~ s#\\(.)#$1#g;
            }
            elsif (/^"/) {
                # opening double quote without a partner
                return();
            }
            elsif (s/^'(([^'\\]|\\.)*)'//) {
                ($snippet = $1) =~ s#\\(.)#$1#g;
            }
            elsif (/^'/) {
                # opening single quote without a partner
                return();
            }
            elsif (s/^\\(.?)//s) {
                # A backslash escapes the next character, whatever it is
                # (including a newline).  A backslash at the very end of
                # the input is consumed on its own; otherwise nothing
                # would ever remove it and the outer loop would never
                # terminate.
                $snippet = $1;
            }
            elsif (s/^([^\s\\'"]+)//) {
                $snippet = $1;
            }
            else {
                # whitespace (or end of input): the current word is done
                s/^\s+//;
                last;
            }
            $field .= $snippet;
        }
        push(@words, $field);
    }
    @words;
}
9b599b2a
GS
141
1421;
143
144__END__
145
146=head1 NAME
147
148Text::ParseWords - parse text into an array of tokens or array of arrays
149
150=head1 SYNOPSIS
151
152 use Text::ParseWords;
153 @lists = &nested_quotewords($delim, $keep, @lines);
154 @words = &quotewords($delim, $keep, @lines);
155 @words = &shellwords(@lines);
156 @words = &parse_line($delim, $keep, $line);
157 @words = &old_shellwords(@lines); # DEPRECATED!
158
159=head1 DESCRIPTION
160
The &nested_quotewords() and &quotewords() functions accept a delimiter
(which can be a regular expression)
and a list of lines and then break those lines up into a list of
words ignoring delimiters that appear inside quotes.  &quotewords()
returns all of the tokens in a single long list, while &nested_quotewords()
returns a list of token lists corresponding to the elements of @lines.
&parse_line() does tokenizing on a single string.  The &*quotewords()
functions simply call &parse_line(), so if you're only splitting
one line you can call &parse_line() directly and save a function
call.
171
172The $keep argument is a boolean flag. If true, then the tokens are
173split on the specified delimiter, but all other characters (quotes,
174backslashes, etc.) are kept in the tokens. If $keep is false then the
175&*quotewords() functions remove all quotes and backslashes that are
176not themselves backslash-escaped or inside of single quotes (i.e.,
177&quotewords() tries to interpret these characters just like the Bourne
178shell). NB: these semantics are significantly different from the
179original version of this module shipped with Perl 5.000 through 5.004.
180As an additional feature, $keep may be the keyword "delimiters" which
181causes the functions to preserve the delimiters in each string as
182tokens in the token lists, in addition to preserving quote and
183backslash characters.
184
185&shellwords() is written as a special case of &quotewords(), and it
186does token parsing with whitespace as a delimiter-- similar to most
187Unix shells.
188
189=head1 EXAMPLES
190
191The sample program:
192
193 use Text::ParseWords;
194 @words = &quotewords('\s+', 0, q{this is "a test" of\ quotewords \"for you});
195 $i = 0;
196 foreach (@words) {
197 print "$i: <$_>\n";
198 $i++;
199 }
200
201produces:
202
203 0: <this>
204 1: <is>
205 2: <a test>
206 3: <of quotewords>
207 4: <"for>
208 5: <you>
209
210demonstrating:
211
212=over 4
213
214=item 0
215a simple word
216
217=item 1
218multiple spaces are skipped because of our $delim
219
220=item 2
221use of quotes to include a space in a word
222
223=item 3
224use of a backslash to include a space in a word
225
226=item 4
227use of a backslash to remove the special meaning of a double-quote
228
229=item 5
230another simple word (note the lack of effect of the
231backslashed double-quote)
232
233=back
234
235Replacing C<&quotewords('\s+', 0, q{this is...})>
236with C<&shellwords(q{this is...})>
237is a simpler way to accomplish the same thing.
238
239=head1 AUTHORS
240
241Maintainer is Hal Pomeranz <pomeranz@netcom.com>, 1994-1997 (Original
242author unknown). Much of the code for &parse_line() (including the
243primary regexp) from Joerk Behrends <jbehrends@multimediaproduzenten.de>.
244
Examples section and other documentation provided by John Heidemann
<johnh@ISI.EDU>
247
248Bug reports, patches, and nagging provided by lots of folks-- thanks
249everybody! Special thanks to Michael Schwern <schwern@envirolink.org>
250for assuring me that a &nested_quotewords() would be useful, and to
251Jeff Friedl <jfriedl@yahoo-inc.com> for telling me not to worry about
252error-checking (sort of-- you had to be there).
253
254=cut