Commit | Line | Data |
---|---|---|
a0d0e21e LW |
1 | package Text::ParseWords; |
2 | ||
3 | require 5.000; | |
a0d0e21e LW |
4 | use Carp; |
5 | ||
dc848c6f | 6 | require AutoLoader; |
7 | *AUTOLOAD = \&AutoLoader::AUTOLOAD; | |
8 | ||
9 | require Exporter; | |
10 | @ISA = qw(Exporter); | |
a0d0e21e LW |
11 | @EXPORT = qw(shellwords quotewords); |
12 | @EXPORT_OK = qw(old_shellwords); | |
13 | ||
a5f75d66 | 14 | =head1 NAME |
2304df62 | 15 | |
a5f75d66 | 16 | Text::ParseWords - parse text into an array of tokens |
a0d0e21e | 17 | |
a5f75d66 | 18 | =head1 SYNOPSIS |
a0d0e21e | 19 | |
a5f75d66 AD |
20 | use Text::ParseWords; |
21 | @words = "ewords($delim, $keep, @lines); | |
22 | @words = &shellwords(@lines); | |
23 | @words = &old_shellwords(@lines); | |
24 | ||
25 | =head1 DESCRIPTION | |
26 | ||
27 | "ewords() accepts a delimiter (which can be a regular expression) | |
28 | and a list of lines and then breaks those lines up into a list of | |
29 | words ignoring delimiters that appear inside quotes. | |
30 | ||
31 | The $keep argument is a boolean flag. If true, the quotes are kept | |
32 | with each word, otherwise quotes are stripped in the splitting process. | |
33 | $keep also defines whether unprotected backslashes are retained. | |
34 | ||
35 | A &shellwords() replacement is included to demonstrate the new package. | |
36 | This version differs from the original in that it will _NOT_ default | |
37 | to using $_ if no arguments are given. I personally find the old behavior | |
38 | to be a mis-feature. | |
39 | ||
a5f75d66 AD |
40 | "ewords() works by simply jamming all of @lines into a single |
41 | string in $_ and then pulling off words a bit at a time until $_ | |
42 | is exhausted. | |
43 | ||
a5f75d66 AD |
44 | =head1 AUTHORS |
45 | ||
46 | Hal Pomeranz (pomeranz@netcom.com), 23 March 1994 | |
47 | ||
48 | Basically an update and generalization of the old shellwords.pl. | |
49 | Much code shamelessly stolen from the old version (author unknown). | |
50 | ||
51 | =cut | |
52 | ||
53 | 1; | |
54 | __END__ | |
55 | ||
56 | sub shellwords { | |
2304df62 AD |
57 | local(@lines) = @_; |
58 | $lines[$#lines] =~ s/\s+$//; | |
a0d0e21e | 59 | "ewords('\s+', 0, @lines); |
2304df62 AD |
60 | } |
61 | ||
62 | ||
a0d0e21e LW |
63 | |
64 | sub quotewords { | |
0dbfbeef | 65 | |
66 | # The inner "for" loop builds up each word (or $field) one $snippet | |
67 | # at a time. A $snippet is a quoted string, a backslashed character, | |
68 | # or an unquoted string. We fall out of the "for" loop when we reach | |
69 | # the end of $_ or when we hit a delimiter. Falling out of the "for" | |
70 | # loop, we push the $field we've been building up onto the list of | |
71 | # @words we'll be returning, and then loop back and pull another word | |
72 | # off of $_. | |
73 | # | |
74 | # The first two cases inside the "for" loop deal with quoted strings. | |
75 | # The first case matches a double quoted string, removes it from $_, | |
76 | # and assigns the double quoted string to $snippet in the body of the | |
77 | # conditional. The second case handles single quoted strings. In | |
78 | # the third case we've found a quote at the current beginning of $_, | |
79 | # but it didn't match the quoted string regexps in the first two cases, | |
80 | # so it must be an unbalanced quote and we croak with an error (which can | |
81 | # be caught by eval()). | |
82 | # | |
83 | # The next case handles backslashed characters, and the next case is the | |
84 | # exit case on reaching the end of the string or finding a delimiter. | |
85 | # | |
86 | # Otherwise, we've found an unquoted thing and we pull of characters one | |
87 | # at a time until we reach something that could start another $snippet-- | |
88 | # a quote of some sort, a backslash, or the delimiter. This one character | |
89 | # at a time behavior was necessary if the delimiter was going to be a | |
90 | # regexp (love to hear it if you can figure out a better way). | |
91 | ||
936c8837 SG |
92 | my ($delim, $keep, @lines) = @_; |
93 | my (@words, $snippet, $field); | |
94 | ||
95 | local $_ = join ('', @lines); | |
2304df62 | 96 | |
936c8837 | 97 | while (length) { |
2304df62 | 98 | $field = ''; |
936c8837 | 99 | |
2304df62 | 100 | for (;;) { |
456e8aa7 | 101 | $snippet = ''; |
936c8837 SG |
102 | |
103 | if (s/^"([^"\\]*(\\.[^"\\]*)*)"//) { | |
2304df62 | 104 | $snippet = $1; |
936c8837 | 105 | $snippet = qq|"$snippet"| if $keep; |
2304df62 | 106 | } |
936c8837 | 107 | elsif (s/^'([^'\\]*(\\.[^'\\]*)*)'//) { |
2304df62 | 108 | $snippet = $1; |
936c8837 | 109 | $snippet = "'$snippet'" if $keep; |
2304df62 AD |
110 | } |
111 | elsif (/^["']/) { | |
936c8837 SG |
112 | croak 'Unmatched quote'; |
113 | } | |
114 | elsif (s/^\\(.)//) { | |
115 | $snippet = $1; | |
116 | $snippet = "\\$snippet" if $keep; | |
2304df62 | 117 | } |
936c8837 SG |
118 | elsif (!length || s/^$delim//) { |
119 | last; | |
2304df62 AD |
120 | } |
121 | else { | |
936c8837 SG |
122 | while (length && !(/^$delim/ || /^['"\\]/)) { |
123 | $snippet .= substr ($_, 0, 1); | |
124 | substr($_, 0, 1) = ''; | |
125 | } | |
2304df62 | 126 | } |
936c8837 | 127 | |
2304df62 AD |
128 | $field .= $snippet; |
129 | } | |
936c8837 SG |
130 | |
131 | push @words, $field; | |
2304df62 | 132 | } |
936c8837 SG |
133 | |
134 | return @words; | |
2304df62 | 135 | } |
2304df62 AD |
136 | |
137 | ||
a0d0e21e LW |
138 | sub old_shellwords { |
139 | ||
140 | # Usage: | |
141 | # use ParseWords; | |
142 | # @words = old_shellwords($line); | |
143 | # or | |
144 | # @words = old_shellwords(@lines); | |
145 | ||
146 | local($_) = join('', @_); | |
147 | my(@words,$snippet,$field); | |
148 | ||
149 | s/^\s+//; | |
150 | while ($_ ne '') { | |
151 | $field = ''; | |
152 | for (;;) { | |
153 | if (s/^"(([^"\\]|\\.)*)"//) { | |
154 | ($snippet = $1) =~ s#\\(.)#$1#g; | |
155 | } | |
156 | elsif (/^"/) { | |
157 | croak "Unmatched double quote: $_"; | |
158 | } | |
159 | elsif (s/^'(([^'\\]|\\.)*)'//) { | |
160 | ($snippet = $1) =~ s#\\(.)#$1#g; | |
161 | } | |
162 | elsif (/^'/) { | |
163 | croak "Unmatched single quote: $_"; | |
164 | } | |
165 | elsif (s/^\\(.)//) { | |
166 | $snippet = $1; | |
167 | } | |
168 | elsif (s/^([^\s\\'"]+)//) { | |
169 | $snippet = $1; | |
170 | } | |
171 | else { | |
172 | s/^\s+//; | |
173 | last; | |
174 | } | |
175 | $field .= $snippet; | |
176 | } | |
177 | push(@words, $field); | |
178 | } | |
179 | @words; | |
180 | } |