Commit | Line | Data |
---|---|---|
b3eb6a9b GS |
1 | package re; |
2 | ||
de8c5301 YO |
3 | # pragma for controlling the regex engine |
4 | use strict; | |
5 | use warnings; | |
6 | ||
44a2ac75 | 7 | our $VERSION = "0.08"; |
de8c5301 | 8 | our @ISA = qw(Exporter); |
44a2ac75 YO |
9 | our @EXPORT_OK = qw(is_regexp regexp_pattern regmust |
10 | regname regnames | |
11 | regnames_count regnames_iterinit regnames_iternext); | |
de8c5301 YO |
12 | our %EXPORT_OK = map { $_ => 1 } @EXPORT_OK; |
13 | ||
14 | # *** WARNING *** WARNING *** WARNING *** WARNING *** WARNING *** | |
15 | # | |
16 | # If you modify these values see comment below! | |
17 | ||
18 | my %bitmask = ( | |
19 | taint => 0x00100000, # HINT_RE_TAINT | |
20 | eval => 0x00200000, # HINT_RE_EVAL | |
21 | ); | |
22 | ||
23 | # - File::Basename contains a literal for 'taint' as a fallback. If | |
24 | # taint is changed here, File::Basename must be updated as well. | |
25 | # | |
26 | # - ExtUtils::ParseXS uses a hardcoded | |
27 | # BEGIN { $^H |= 0x00200000 } | |
28 | # in it to allow re.xs to be built. So if 'eval' is changed here then | |
29 | # ExtUtils::ParseXS must be changed as well. | |
30 | # | |
31 | # *** WARNING *** WARNING *** WARNING *** WARNING *** WARNING *** | |
32 | ||
33 | sub setcolor { | |
34 | eval { # Ignore errors | |
35 | require Term::Cap; | |
36 | ||
37 | my $terminal = Tgetent Term::Cap ({OSPEED => 9600}); # Avoid warning. | |
38 | my $props = $ENV{PERL_RE_TC} || 'md,me,so,se,us,ue'; | |
39 | my @props = split /,/, $props; | |
40 | my $colors = join "\t", map {$terminal->Tputs($_,1)} @props; | |
41 | ||
42 | $colors =~ s/\0//g; | |
43 | $ENV{PERL_RE_COLORS} = $colors; | |
44 | }; | |
45 | if ($@) { | |
46 | $ENV{PERL_RE_COLORS} ||= qq'\t\t> <\t> <\t\t'; | |
47 | } | |
48 | ||
49 | } | |
50 | ||
51 | my %flags = ( | |
52 | COMPILE => 0x0000FF, | |
53 | PARSE => 0x000001, | |
54 | OPTIMISE => 0x000002, | |
55 | TRIEC => 0x000004, | |
56 | DUMP => 0x000008, | |
57 | ||
58 | EXECUTE => 0x00FF00, | |
59 | INTUIT => 0x000100, | |
60 | MATCH => 0x000200, | |
61 | TRIEE => 0x000400, | |
62 | ||
63 | EXTRA => 0xFF0000, | |
64 | TRIEM => 0x010000, | |
65 | OFFSETS => 0x020000, | |
66 | OFFSETSDBG => 0x040000, | |
67 | STATE => 0x080000, | |
68 | OPTIMISEM => 0x100000, | |
69 | STACK => 0x280000, | |
70 | ); | |
71 | $flags{ALL} = -1; | |
72 | $flags{All} = $flags{all} = $flags{DUMP} | $flags{EXECUTE}; | |
73 | $flags{Extra} = $flags{EXECUTE} | $flags{COMPILE}; | |
74 | $flags{More} = $flags{MORE} = $flags{All} | $flags{TRIEC} | $flags{TRIEM} | $flags{STATE}; | |
75 | $flags{State} = $flags{DUMP} | $flags{EXECUTE} | $flags{STATE}; | |
76 | $flags{TRIE} = $flags{DUMP} | $flags{EXECUTE} | $flags{TRIEC}; | |
77 | ||
78 | my $installed; | |
79 | my $installed_error; | |
80 | ||
81 | sub _do_install { | |
82 | if ( ! defined($installed) ) { | |
83 | require XSLoader; | |
84 | $installed = eval { XSLoader::load('re', $VERSION) } || 0; | |
85 | $installed_error = $@; | |
86 | } | |
87 | } | |
88 | ||
89 | sub _load_unload { | |
90 | my ($on)= @_; | |
91 | if ($on) { | |
92 | _do_install(); | |
93 | if ( ! $installed ) { | |
94 | die "'re' not installed!? ($installed_error)"; | |
95 | } else { | |
96 | # We call install() every time, as if we didn't, we wouldn't | |
97 | # "see" any changes to the color environment var since | |
98 | # the last time it was called. | |
99 | ||
100 | # install() returns an integer, which if casted properly | |
101 | # in C resolves to a structure containing the regex | |
102 | # hooks. Setting it to a random integer will guarantee | |
103 | # segfaults. | |
104 | $^H{regcomp} = install(); | |
105 | } | |
106 | } else { | |
107 | delete $^H{regcomp}; | |
108 | } | |
109 | } | |
110 | ||
111 | sub bits { | |
112 | my $on = shift; | |
113 | my $bits = 0; | |
114 | unless (@_) { | |
115 | require Carp; | |
116 | Carp::carp("Useless use of \"re\" pragma"); | |
117 | } | |
118 | foreach my $idx (0..$#_){ | |
119 | my $s=$_[$idx]; | |
120 | if ($s eq 'Debug' or $s eq 'Debugcolor') { | |
121 | setcolor() if $s =~/color/i; | |
122 | ${^RE_DEBUG_FLAGS} = 0 unless defined ${^RE_DEBUG_FLAGS}; | |
123 | for my $idx ($idx+1..$#_) { | |
124 | if ($flags{$_[$idx]}) { | |
125 | if ($on) { | |
126 | ${^RE_DEBUG_FLAGS} |= $flags{$_[$idx]}; | |
127 | } else { | |
128 | ${^RE_DEBUG_FLAGS} &= ~ $flags{$_[$idx]}; | |
129 | } | |
130 | } else { | |
131 | require Carp; | |
132 | Carp::carp("Unknown \"re\" Debug flag '$_[$idx]', possible flags: ", | |
133 | join(", ",sort keys %flags ) ); | |
134 | } | |
135 | } | |
136 | _load_unload($on ? 1 : ${^RE_DEBUG_FLAGS}); | |
137 | last; | |
138 | } elsif ($s eq 'debug' or $s eq 'debugcolor') { | |
139 | setcolor() if $s =~/color/i; | |
140 | _load_unload($on); | |
66e6b4c5 | 141 | last; |
de8c5301 YO |
142 | } elsif (exists $bitmask{$s}) { |
143 | $bits |= $bitmask{$s}; | |
144 | } elsif ($EXPORT_OK{$s}) { | |
145 | _do_install(); | |
146 | require Exporter; | |
147 | re->export_to_level(2, 're', $s); | |
148 | } else { | |
149 | require Carp; | |
150 | Carp::carp("Unknown \"re\" subpragma '$s' (known ones are: ", | |
151 | join(', ', map {qq('$_')} 'debug', 'debugcolor', sort keys %bitmask), | |
152 | ")"); | |
153 | } | |
154 | } | |
155 | $bits; | |
156 | } | |
157 | ||
158 | sub import { | |
159 | shift; | |
160 | $^H |= bits(1, @_); | |
161 | } | |
162 | ||
163 | sub unimport { | |
164 | shift; | |
165 | $^H &= ~ bits(0, @_); | |
166 | } | |
167 | ||
168 | 1; | |
169 | ||
170 | __END__ | |
56953603 | 171 | |
b3eb6a9b GS |
172 | =head1 NAME |
173 | ||
174 | re - Perl pragma to alter regular expression behaviour | |
175 | ||
176 | =head1 SYNOPSIS | |
177 | ||
e4d48cc9 GS |
178 | use re 'taint'; |
179 | ($x) = ($^X =~ /^(.*)$/s); # $x is tainted here | |
b3eb6a9b | 180 | |
2cd61cdb | 181 | $pat = '(?{ $foo = 1 })'; |
e4d48cc9 | 182 | use re 'eval'; |
2cd61cdb | 183 | /foo${pat}bar/; # won't fail (when not under -T switch) |
e4d48cc9 GS |
184 | |
185 | { | |
186 | no re 'taint'; # the default | |
187 | ($x) = ($^X =~ /^(.*)$/s); # $x is not tainted here | |
188 | ||
189 | no re 'eval'; # the default | |
2cd61cdb | 190 | /foo${pat}bar/; # disallowed (with or without -T switch) |
e4d48cc9 | 191 | } |
b3eb6a9b | 192 | |
1e2e3d02 YO |
193 | use re 'debug'; # output debugging info during |
194 | /^(.*)$/s; # compile and run time | |
195 | ||
2cd61cdb | 196 | |
02ea72ae IZ |
197 | use re 'debugcolor'; # same as 'debug', but with colored output |
198 | ... | |
199 | ||
a3621e74 | 200 | use re qw(Debug All); # Finer tuned debugging options. |
4ee9a43f | 201 | use re qw(Debug More); |
fe759410 | 202 | no re qw(Debug ALL); # Turn of all re debugging in this scope |
4ee9a43f | 203 | |
de8c5301 YO |
204 | use re qw(is_regexp regexp_pattern); # import utility functions |
205 | my ($pat,$mods)=regexp_pattern(qr/foo/i); | |
206 | if (is_regexp($obj)) { | |
207 | print "Got regexp: ", | |
208 | scalar regexp_pattern($obj); # just as perl would stringify it | |
209 | } # but no hassle with blessed re's. | |
a3621e74 | 210 | |
3ffabb8c GS |
211 | (We use $^X in these examples because it's tainted by default.) |
212 | ||
b3eb6a9b GS |
213 | =head1 DESCRIPTION |
214 | ||
de8c5301 YO |
215 | =head2 'taint' mode |
216 | ||
b3eb6a9b GS |
217 | When C<use re 'taint'> is in effect, and a tainted string is the target |
218 | of a regex, the regex memories (or values returned by the m// operator | |
e4d48cc9 GS |
219 | in list context) are tainted. This feature is useful when regex operations |
220 | on tainted data aren't meant to extract safe substrings, but to perform | |
221 | other transformations. | |
b3eb6a9b | 222 | |
de8c5301 YO |
223 | =head2 'eval' mode |
224 | ||
e4d48cc9 | 225 | When C<use re 'eval'> is in effect, a regex is allowed to contain |
2cd61cdb | 226 | C<(?{ ... })> zero-width assertions even if regular expression contains |
ffbc6a93 | 227 | variable interpolation. That is normally disallowed, since it is a |
2cd61cdb IZ |
228 | potential security risk. Note that this pragma is ignored when the regular |
229 | expression is obtained from tainted data, i.e. evaluation is always | |
3c4b39be | 230 | disallowed with tainted regular expressions. See L<perlre/(?{ code })>. |
2cd61cdb | 231 | |
ffbc6a93 | 232 | For the purpose of this pragma, interpolation of precompiled regular |
0a92e3a8 GS |
233 | expressions (i.e., the result of C<qr//>) is I<not> considered variable |
234 | interpolation. Thus: | |
2cd61cdb IZ |
235 | |
236 | /foo${pat}bar/ | |
237 | ||
ffbc6a93 | 238 | I<is> allowed if $pat is a precompiled regular expression, even |
2cd61cdb IZ |
239 | if $pat contains C<(?{ ... })> assertions. |
240 | ||
de8c5301 YO |
241 | =head2 'debug' mode |
242 | ||
ffbc6a93 | 243 | When C<use re 'debug'> is in effect, perl emits debugging messages when |
2cd61cdb IZ |
244 | compiling and using regular expressions. The output is the same as that |
245 | obtained by running a C<-DDEBUGGING>-enabled perl interpreter with the | |
246 | B<-Dr> switch. It may be quite voluminous depending on the complexity | |
02ea72ae IZ |
247 | of the match. Using C<debugcolor> instead of C<debug> enables a |
248 | form of output that can be used to get a colorful display on terminals | |
249 | that understand termcap color sequences. Set C<$ENV{PERL_RE_TC}> to a | |
250 | comma-separated list of C<termcap> properties to use for highlighting | |
ffbc6a93 | 251 | strings on/off, pre-point part on/off. |
2cd61cdb IZ |
252 | See L<perldebug/"Debugging regular expressions"> for additional info. |
253 | ||
de8c5301 YO |
254 | As of 5.9.5 the directive C<use re 'debug'> and its equivalents are |
255 | lexically scoped, as the other directives are. However they have both | |
256 | compile-time and run-time effects. | |
257 | ||
258 | See L<perlmodlib/Pragmatic Modules>. | |
259 | ||
260 | =head2 'Debug' mode | |
261 | ||
a3621e74 YO |
262 | Similarly C<use re 'Debug'> produces debugging output, the difference |
263 | being that it allows the fine tuning of what debugging output will be | |
be8e71aa YO |
264 | emitted. Options are divided into three groups, those related to |
265 | compilation, those related to execution and those related to special | |
266 | purposes. The options are as follows: | |
267 | ||
268 | =over 4 | |
269 | ||
270 | =item Compile related options | |
271 | ||
272 | =over 4 | |
273 | ||
274 | =item COMPILE | |
275 | ||
276 | Turns on all compile related debug options. | |
277 | ||
278 | =item PARSE | |
279 | ||
280 | Turns on debug output related to the process of parsing the pattern. | |
281 | ||
282 | =item OPTIMISE | |
283 | ||
284 | Enables output related to the optimisation phase of compilation. | |
285 | ||
24b23f37 | 286 | =item TRIEC |
be8e71aa YO |
287 | |
288 | Detailed info about trie compilation. | |
289 | ||
290 | =item DUMP | |
291 | ||
292 | Dump the final program out after it is compiled and optimised. | |
293 | ||
be8e71aa YO |
294 | =back |
295 | ||
296 | =item Execute related options | |
297 | ||
298 | =over 4 | |
299 | ||
300 | =item EXECUTE | |
301 | ||
302 | Turns on all execute related debug options. | |
303 | ||
304 | =item MATCH | |
305 | ||
306 | Turns on debugging of the main matching loop. | |
307 | ||
24b23f37 | 308 | =item TRIEE |
be8e71aa YO |
309 | |
310 | Extra debugging of how tries execute. | |
311 | ||
312 | =item INTUIT | |
313 | ||
314 | Enable debugging of start point optimisations. | |
315 | ||
316 | =back | |
317 | ||
318 | =item Extra debugging options | |
319 | ||
320 | =over 4 | |
321 | ||
322 | =item EXTRA | |
323 | ||
324 | Turns on all "extra" debugging options. | |
325 | ||
24b23f37 YO |
326 | =item TRIEM |
327 | ||
328 | Enable enhanced TRIE debugging. Enhances both TRIEE | |
329 | and TRIEC. | |
330 | ||
331 | =item STATE | |
332 | ||
4ee9a43f | 333 | Enable debugging of states in the engine. |
24b23f37 YO |
334 | |
335 | =item STACK | |
be8e71aa | 336 | |
24b23f37 YO |
337 | Enable debugging of the recursion stack in the engine. Enabling |
338 | or disabling this option automatically does the same for debugging | |
339 | states as well. This output from this can be quite large. | |
340 | ||
341 | =item OPTIMISEM | |
342 | ||
343 | Enable enhanced optimisation debugging and start point optimisations. | |
344 | Probably not useful except when debugging the regex engine itself. | |
345 | ||
346 | =item OFFSETS | |
347 | ||
348 | Dump offset information. This can be used to see how regops correlate | |
349 | to the pattern. Output format is | |
350 | ||
351 | NODENUM:POSITION[LENGTH] | |
352 | ||
353 | Where 1 is the position of the first char in the string. Note that position | |
354 | can be 0, or larger than the actual length of the pattern, likewise length | |
355 | can be zero. | |
be8e71aa | 356 | |
24b23f37 | 357 | =item OFFSETSDBG |
be8e71aa YO |
358 | |
359 | Enable debugging of offsets information. This emits copious | |
fe759410 | 360 | amounts of trace information and doesn't mesh well with other |
be8e71aa YO |
361 | debug options. |
362 | ||
fe759410 | 363 | Almost definitely only useful to people hacking |
be8e71aa YO |
364 | on the offsets part of the debug engine. |
365 | ||
366 | =back | |
367 | ||
368 | =item Other useful flags | |
369 | ||
370 | These are useful shortcuts to save on the typing. | |
371 | ||
372 | =over 4 | |
373 | ||
374 | =item ALL | |
375 | ||
376 | Enable all compile and execute options at once. | |
377 | ||
378 | =item All | |
379 | ||
fe759410 | 380 | Enable DUMP and all execute options. Equivalent to: |
be8e71aa YO |
381 | |
382 | use re 'debug'; | |
383 | ||
384 | =item MORE | |
385 | ||
386 | =item More | |
387 | ||
24b23f37 | 388 | Enable TRIEM and all execute compile and execute options. |
be8e71aa | 389 | |
dba3f186 | 390 | =back |
be8e71aa | 391 | |
dba3f186 | 392 | =back |
a3621e74 | 393 | |
1e2e3d02 | 394 | As of 5.9.5 the directive C<use re 'debug'> and its equivalents are |
4ee9a43f | 395 | lexically scoped, as the other directives are. However they have both |
1e2e3d02 | 396 | compile-time and run-time effects. |
b3eb6a9b | 397 | |
de8c5301 | 398 | =head2 Exportable Functions |
b3eb6a9b | 399 | |
de8c5301 | 400 | As of perl 5.9.5 're' debug contains a number of utility functions that |
4ee9a43f | 401 | may be optionally exported into the caller's namespace. They are listed |
de8c5301 | 402 | below. |
b3eb6a9b | 403 | |
de8c5301 | 404 | =over 4 |
b3eb6a9b | 405 | |
de8c5301 | 406 | =item is_regexp($ref) |
02ea72ae | 407 | |
de8c5301 | 408 | Returns true if the argument is a compiled regular expression as returned |
4ee9a43f | 409 | by C<qr//>, false if it is not. |
02ea72ae | 410 | |
4ee9a43f RGS |
411 | This function will not be confused by overloading or blessing. In |
412 | internals terms, this extracts the regexp pointer out of the | |
de8c5301 | 413 | PERL_MAGIC_qr structure so it it cannot be fooled. |
894be9b7 | 414 | |
de8c5301 | 415 | =item regexp_pattern($ref) |
02ea72ae | 416 | |
4ee9a43f RGS |
417 | If the argument is a compiled regular expression as returned by C<qr//>, |
418 | then this function returns the pattern. | |
be8e71aa | 419 | |
4ee9a43f RGS |
420 | In list context it returns a two element list, the first element |
421 | containing the pattern and the second containing the modifiers used when | |
422 | the pattern was compiled. | |
be8e71aa | 423 | |
4ee9a43f | 424 | my ($pat, $mods) = regexp_pattern($ref); |
a3621e74 | 425 | |
4ee9a43f RGS |
426 | In scalar context it returns the same as perl would when strigifying a raw |
427 | C<qr//> with the same pattern inside. If the argument is not a compiled | |
428 | reference then this routine returns false but defined in scalar context, | |
429 | and the empty list in list context. Thus the following | |
f9f4320a | 430 | |
de8c5301 | 431 | if (regexp_pattern($ref) eq '(?i-xsm:foo)') |
dba3f186 | 432 | |
de8c5301 | 433 | will be warning free regardless of what $ref actually is. |
380e0b81 | 434 | |
4ee9a43f RGS |
435 | Like C<is_regexp> this function will not be confused by overloading |
436 | or blessing of the object. | |
b3eb6a9b | 437 | |
256ddcd0 YO |
438 | =item regmust($ref) |
439 | ||
432acd5f RGS |
440 | If the argument is a compiled regular expression as returned by C<qr//>, |
441 | then this function returns what the optimiser consiers to be the longest | |
442 | anchored fixed string and longest floating fixed string in the pattern. | |
443 | ||
444 | A I<fixed string> is defined as being a substring that must appear for the | |
445 | pattern to match. An I<anchored fixed string> is a fixed string that must | |
446 | appear at a particular offset from the beginning of the match. A I<floating | |
447 | fixed string> is defined as a fixed string that can appear at any point in | |
448 | a range of positions relative to the start of the match. For example, | |
449 | ||
450 | my $qr = qr/here .* there/x; | |
451 | my ($anchored, $floating) = regmust($qr); | |
256ddcd0 | 452 | print "anchored:'$anchored'\nfloating:'$floating'\n"; |
432acd5f | 453 | |
256ddcd0 YO |
454 | results in |
455 | ||
456 | anchored:'here' | |
457 | floating:'there' | |
458 | ||
432acd5f RGS |
459 | Because the C<here> is before the C<.*> in the pattern, its position |
460 | can be determined exactly. That's not true, however, for the C<there>; | |
461 | it could appear at any point after where the anchored string appeared. | |
256ddcd0 YO |
462 | Perl uses both for its optimisations, prefering the longer, or, if they are |
463 | equal, the floating. | |
464 | ||
465 | B<NOTE:> This may not necessarily be the definitive longest anchored and | |
432acd5f | 466 | floating string. This will be what the optimiser of the Perl that you |
256ddcd0 YO |
467 | are using thinks is the longest. If you believe that the result is wrong |
468 | please report it via the L<perlbug> utility. | |
469 | ||
44a2ac75 YO |
470 | =item regname($name,$qr,$all) |
471 | ||
472 | Returns the contents of a named buffer. If $qr is missing, or is not the | |
473 | result of a qr// then returns the result of the last successful match. If | |
474 | $all is true then returns an array ref containing one entry per buffer, | |
475 | otherwise returns the first defined buffer. | |
476 | ||
477 | =item regnames($qr,$all) | |
478 | ||
479 | Returns a list of all of the named buffers defined in a pattern. If | |
480 | $all is true then it returns all names defined, if not returns only | |
481 | names which were involved in the last successful match. If $qr is omitted | |
482 | or is not the result of a qr// then returns the details for the last | |
483 | successful match. | |
484 | ||
485 | =item regnames_iterinit($qr) | |
486 | ||
487 | Initializes the internal hash iterator associated to a regexps named capture | |
488 | buffers. If $qr is omitted resets the iterator associated with the regexp used | |
489 | in the last successful match. | |
490 | ||
491 | =item regnames_iternext($qr,$all) | |
492 | ||
493 | Gets the next key from the hash associated with a regexp. If $qr | |
494 | is omitted resets the iterator associated with the regexp used in the | |
495 | last successful match. If $all is true returns the keys of all of the | |
496 | distinct named buffers in the pattern, if not returns only those names | |
497 | used in the last successful match. | |
498 | ||
499 | =item regnames_count($qr) | |
500 | ||
501 | Returns the number of distinct names defined in the regexp $qr. If | |
502 | $qr is omitted or not a regexp returns the count of names in the | |
503 | last successful match. | |
504 | ||
505 | B<Note:> that this result is always the actual number of distinct | |
506 | named buffers defined, it may not actually match that which is | |
507 | returned by C<regnames()> and related routines when those routines | |
508 | have not been called with the $all parameter set.. | |
509 | ||
de8c5301 | 510 | =back |
b3eb6a9b | 511 | |
de8c5301 | 512 | =head1 SEE ALSO |
b3eb6a9b | 513 | |
de8c5301 YO |
514 | L<perlmodlib/Pragmatic Modules>. |
515 | ||
516 | =cut |