This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Improve -Dr output of bracketed char classes
[perl5.git] / ext / re / re.pm
CommitLineData
b3eb6a9b
GS
1package re;
2
99cc5cc6 3# pragma for controlling the regexp engine
de8c5301
YO
4use strict;
5use warnings;
6
dd4728cd 7our $VERSION = "0.26";
de8c5301 8our @ISA = qw(Exporter);
ec781434 9our @EXPORT_OK = ('regmust',
192c1e27
JH
10 qw(is_regexp regexp_pattern
11 regname regnames regnames_count));
de8c5301
YO
12our %EXPORT_OK = map { $_ => 1 } @EXPORT_OK;
13
de8c5301
YO
14my %bitmask = (
15 taint => 0x00100000, # HINT_RE_TAINT
16 eval => 0x00200000, # HINT_RE_EVAL
17);
18
1e215989
FC
19my $flags_hint = 0x02000000; # HINT_RE_FLAGS
20my $PMMOD_SHIFT = 0;
21my %reflags = (
22 m => 1 << ($PMMOD_SHIFT + 0),
23 s => 1 << ($PMMOD_SHIFT + 1),
24 i => 1 << ($PMMOD_SHIFT + 2),
25 x => 1 << ($PMMOD_SHIFT + 3),
26 p => 1 << ($PMMOD_SHIFT + 4),
27# special cases:
1e215989 28 d => 0,
a62b1201
KW
29 l => 1,
30 u => 2,
cfaf538b 31 a => 3,
b4ab316d 32 aa => 4,
1e215989
FC
33);
34
de8c5301
YO
35sub setcolor {
36 eval { # Ignore errors
37 require Term::Cap;
38
39 my $terminal = Tgetent Term::Cap ({OSPEED => 9600}); # Avoid warning.
40 my $props = $ENV{PERL_RE_TC} || 'md,me,so,se,us,ue';
41 my @props = split /,/, $props;
42 my $colors = join "\t", map {$terminal->Tputs($_,1)} @props;
43
44 $colors =~ s/\0//g;
45 $ENV{PERL_RE_COLORS} = $colors;
46 };
47 if ($@) {
48 $ENV{PERL_RE_COLORS} ||= qq'\t\t> <\t> <\t\t';
49 }
50
51}
52
53my %flags = (
54 COMPILE => 0x0000FF,
55 PARSE => 0x000001,
56 OPTIMISE => 0x000002,
57 TRIEC => 0x000004,
58 DUMP => 0x000008,
f7819f85 59 FLAGS => 0x000010,
de8c5301
YO
60
61 EXECUTE => 0x00FF00,
62 INTUIT => 0x000100,
63 MATCH => 0x000200,
64 TRIEE => 0x000400,
65
66 EXTRA => 0xFF0000,
67 TRIEM => 0x010000,
68 OFFSETS => 0x020000,
69 OFFSETSDBG => 0x040000,
70 STATE => 0x080000,
71 OPTIMISEM => 0x100000,
72 STACK => 0x280000,
e7707071 73 BUFFERS => 0x400000,
2c296965 74 GPOS => 0x800000,
de8c5301 75);
e7707071 76$flags{ALL} = -1 & ~($flags{OFFSETS}|$flags{OFFSETSDBG}|$flags{BUFFERS});
de8c5301 77$flags{All} = $flags{all} = $flags{DUMP} | $flags{EXECUTE};
2c296965 78$flags{Extra} = $flags{EXECUTE} | $flags{COMPILE} | $flags{GPOS};
de8c5301
YO
79$flags{More} = $flags{MORE} = $flags{All} | $flags{TRIEC} | $flags{TRIEM} | $flags{STATE};
80$flags{State} = $flags{DUMP} | $flags{EXECUTE} | $flags{STATE};
81$flags{TRIE} = $flags{DUMP} | $flags{EXECUTE} | $flags{TRIEC};
82
ec781434
NC
83if (defined &DynaLoader::boot_DynaLoader) {
84 require XSLoader;
da4061d3 85 XSLoader::load();
de8c5301 86}
ec781434
NC
87# else we're miniperl
88# We need to work for miniperl, because the XS toolchain uses Text::Wrap, which
89# uses re 'taint'.
de8c5301
YO
90
91sub _load_unload {
92 my ($on)= @_;
93 if ($on) {
ec781434
NC
94 # We call install() every time, as if we didn't, we wouldn't
95 # "see" any changes to the color environment var since
96 # the last time it was called.
97
98 # install() returns an integer, which if casted properly
99cc5cc6 99 # in C resolves to a structure containing the regexp
ec781434
NC
100 # hooks. Setting it to a random integer will guarantee
101 # segfaults.
102 $^H{regcomp} = install();
de8c5301
YO
103 } else {
104 delete $^H{regcomp};
105 }
106}
107
108sub bits {
109 my $on = shift;
110 my $bits = 0;
1e215989 111 ARG:
de8c5301
YO
112 foreach my $idx (0..$#_){
113 my $s=$_[$idx];
114 if ($s eq 'Debug' or $s eq 'Debugcolor') {
115 setcolor() if $s =~/color/i;
116 ${^RE_DEBUG_FLAGS} = 0 unless defined ${^RE_DEBUG_FLAGS};
117 for my $idx ($idx+1..$#_) {
118 if ($flags{$_[$idx]}) {
119 if ($on) {
120 ${^RE_DEBUG_FLAGS} |= $flags{$_[$idx]};
121 } else {
122 ${^RE_DEBUG_FLAGS} &= ~ $flags{$_[$idx]};
123 }
124 } else {
125 require Carp;
126 Carp::carp("Unknown \"re\" Debug flag '$_[$idx]', possible flags: ",
127 join(", ",sort keys %flags ) );
128 }
129 }
130 _load_unload($on ? 1 : ${^RE_DEBUG_FLAGS});
131 last;
132 } elsif ($s eq 'debug' or $s eq 'debugcolor') {
133 setcolor() if $s =~/color/i;
134 _load_unload($on);
66e6b4c5 135 last;
de8c5301
YO
136 } elsif (exists $bitmask{$s}) {
137 $bits |= $bitmask{$s};
138 } elsif ($EXPORT_OK{$s}) {
de8c5301
YO
139 require Exporter;
140 re->export_to_level(2, 're', $s);
1e215989
FC
141 } elsif ($s =~ s/^\///) {
142 my $reflags = $^H{reflags} || 0;
6320bfaf 143 my $seen_charset;
342c8524 144 while ($s =~ m/( . )/gx) {
48895a0d 145 local $_ = $1;
cfaf538b 146 if (/[adul]/) {
342c8524
KW
147 # The 'a' may be repeated; hide this from the rest of the
148 # code by counting and getting rid of all of them, then
149 # changing to 'aa' if there is a repeat.
150 if ($_ eq 'a') {
151 my $sav_pos = pos $s;
152 my $a_count = $s =~ s/a//g;
153 pos $s = $sav_pos - 1; # -1 because got rid of the 'a'
154 if ($a_count > 2) {
155 require Carp;
156 Carp::carp(
157 qq 'The "a" flag may only appear a maximum of twice'
158 );
159 }
160 elsif ($a_count == 2) {
161 $_ = 'aa';
162 }
163 }
1e215989 164 if ($on) {
45a507fa 165 if ($seen_charset) {
96ef02be 166 require Carp;
45a507fa
KW
167 if ($seen_charset ne $_) {
168 Carp::carp(
169 qq 'The "$seen_charset" and "$_" flags '
170 .qq 'are exclusive'
171 );
172 }
173 else {
174 Carp::carp(
175 qq 'The "$seen_charset" flag may not appear '
176 .qq 'twice'
177 );
178 }
96ef02be 179 }
6320bfaf
KW
180 $^H{reflags_charset} = $reflags{$_};
181 $seen_charset = $_;
1e215989
FC
182 }
183 else {
6320bfaf
KW
184 delete $^H{reflags_charset}
185 if defined $^H{reflags_charset}
186 && $^H{reflags_charset} == $reflags{$_};
1e215989
FC
187 }
188 } elsif (exists $reflags{$_}) {
189 $on
190 ? $reflags |= $reflags{$_}
191 : ($reflags &= ~$reflags{$_});
192 } else {
193 require Carp;
194 Carp::carp(
195 qq'Unknown regular expression flag "$_"'
196 );
197 next ARG;
198 }
199 }
6320bfaf 200 ($^H{reflags} = $reflags or defined $^H{reflags_charset})
1e215989
FC
201 ? $^H |= $flags_hint
202 : ($^H &= ~$flags_hint);
de8c5301
YO
203 } else {
204 require Carp;
205 Carp::carp("Unknown \"re\" subpragma '$s' (known ones are: ",
206 join(', ', map {qq('$_')} 'debug', 'debugcolor', sort keys %bitmask),
207 ")");
208 }
209 }
210 $bits;
211}
212
213sub import {
214 shift;
215 $^H |= bits(1, @_);
216}
217
218sub unimport {
219 shift;
220 $^H &= ~ bits(0, @_);
221}
222
2231;
224
225__END__
56953603 226
b3eb6a9b
GS
227=head1 NAME
228
229re - Perl pragma to alter regular expression behaviour
230
231=head1 SYNOPSIS
232
e4d48cc9
GS
233 use re 'taint';
234 ($x) = ($^X =~ /^(.*)$/s); # $x is tainted here
b3eb6a9b 235
2cd61cdb 236 $pat = '(?{ $foo = 1 })';
e4d48cc9 237 use re 'eval';
48fe68f5
KW
238 /foo${pat}bar/; # won't fail (when not under -T
239 # switch)
e4d48cc9
GS
240
241 {
242 no re 'taint'; # the default
243 ($x) = ($^X =~ /^(.*)$/s); # $x is not tainted here
244
245 no re 'eval'; # the default
48fe68f5
KW
246 /foo${pat}bar/; # disallowed (with or without -T
247 # switch)
e4d48cc9 248 }
b3eb6a9b 249
1e215989
FC
250 use re '/ix';
251 "FOO" =~ / foo /; # /ix implied
252 no re '/x';
253 "FOO" =~ /foo/; # just /i implied
254
1e2e3d02 255 use re 'debug'; # output debugging info during
48fe68f5 256 /^(.*)$/s; # compile and run time
1e2e3d02 257
2cd61cdb 258
48fe68f5
KW
259 use re 'debugcolor'; # same as 'debug', but with colored
260 # output
02ea72ae
IZ
261 ...
262
48fe68f5
KW
263 use re qw(Debug All); # Same as "use re 'debug'", but you
264 # can use "Debug" with things other
265 # than 'All'
266 use re qw(Debug More); # 'All' plus output more details
267 no re qw(Debug ALL); # Turn on (almost) all re debugging
268 # in this scope
4ee9a43f 269
de8c5301
YO
270 use re qw(is_regexp regexp_pattern); # import utility functions
271 my ($pat,$mods)=regexp_pattern(qr/foo/i);
272 if (is_regexp($obj)) {
273 print "Got regexp: ",
48fe68f5
KW
274 scalar regexp_pattern($obj); # just as perl would stringify
275 } # it but no hassle with blessed
276 # re's.
a3621e74 277
3ffabb8c
GS
278(We use $^X in these examples because it's tainted by default.)
279
b3eb6a9b
GS
280=head1 DESCRIPTION
281
de8c5301
YO
282=head2 'taint' mode
283
b3eb6a9b 284When C<use re 'taint'> is in effect, and a tainted string is the target
99cc5cc6
A
285of a regexp, the regexp memories (or values returned by the m// operator
286in list context) are tainted. This feature is useful when regexp operations
e4d48cc9
GS
287on tainted data aren't meant to extract safe substrings, but to perform
288other transformations.
b3eb6a9b 289
de8c5301
YO
290=head2 'eval' mode
291
99cc5cc6 292When C<use re 'eval'> is in effect, a regexp is allowed to contain
0b370c0a 293C<(?{ ... })> zero-width assertions and C<(??{ ... })> postponed
e128ab2c
DM
294subexpressions that are derived from variable interpolation, rather than
295appearing literally within the regexp. That is normally disallowed, since
296it is a
2cd61cdb
IZ
297potential security risk. Note that this pragma is ignored when the regular
298expression is obtained from tainted data, i.e. evaluation is always
0b370c0a 299disallowed with tainted regular expressions. See L<perlre/(?{ code })>
bb1773de 300and L<perlre/(??{ code })>.
2cd61cdb 301
ffbc6a93 302For the purpose of this pragma, interpolation of precompiled regular
0a92e3a8
GS
303expressions (i.e., the result of C<qr//>) is I<not> considered variable
304interpolation. Thus:
2cd61cdb
IZ
305
306 /foo${pat}bar/
307
ffbc6a93 308I<is> allowed if $pat is a precompiled regular expression, even
0b370c0a 309if $pat contains C<(?{ ... })> assertions or C<(??{ ... })> subexpressions.
2cd61cdb 310
1e215989
FC
311=head2 '/flags' mode
312
313When C<use re '/flags'> is specified, the given flags are automatically
314added to every regular expression till the end of the lexical scope.
315
316C<no re '/flags'> will turn off the effect of C<use re '/flags'> for the
317given flags.
318
319For example, if you want all your regular expressions to have /msx on by
320default, simply put
321
322 use re '/msx';
323
324at the top of your code.
325
cfaf538b 326The character set /adul flags cancel each other out. So, in this example,
1e215989
FC
327
328 use re "/u";
329 "ss" =~ /\xdf/;
330 use re "/d";
331 "ss" =~ /\xdf/;
332
4d220a7d 333the second C<use re> does an implicit C<no re '/u'>.
1e215989 334
59640339 335Turning on one of the character set flags with C<use re> takes precedence over the
1e215989
FC
336C<locale> pragma and the 'unicode_strings' C<feature>, for regular
337expressions. Turning off one of these flags when it is active reverts to
338the behaviour specified by whatever other pragmata are in scope. For
339example:
340
341 use feature "unicode_strings";
342 no re "/u"; # does nothing
343 use re "/l";
344 no re "/l"; # reverts to unicode_strings behaviour
345
de8c5301
YO
346=head2 'debug' mode
347
ffbc6a93 348When C<use re 'debug'> is in effect, perl emits debugging messages when
2cd61cdb
IZ
349compiling and using regular expressions. The output is the same as that
350obtained by running a C<-DDEBUGGING>-enabled perl interpreter with the
351B<-Dr> switch. It may be quite voluminous depending on the complexity
02ea72ae
IZ
352of the match. Using C<debugcolor> instead of C<debug> enables a
353form of output that can be used to get a colorful display on terminals
354that understand termcap color sequences. Set C<$ENV{PERL_RE_TC}> to a
355comma-separated list of C<termcap> properties to use for highlighting
ffbc6a93 356strings on/off, pre-point part on/off.
57e8c15d 357See L<perldebug/"Debugging Regular Expressions"> for additional info.
2cd61cdb 358
de8c5301
YO
359As of 5.9.5 the directive C<use re 'debug'> and its equivalents are
360lexically scoped, as the other directives are. However they have both
361compile-time and run-time effects.
362
363See L<perlmodlib/Pragmatic Modules>.
364
365=head2 'Debug' mode
366
a3621e74
YO
367Similarly C<use re 'Debug'> produces debugging output, the difference
368being that it allows the fine tuning of what debugging output will be
be8e71aa
YO
369emitted. Options are divided into three groups, those related to
370compilation, those related to execution and those related to special
371purposes. The options are as follows:
372
373=over 4
374
375=item Compile related options
376
377=over 4
378
379=item COMPILE
380
381Turns on all compile related debug options.
382
383=item PARSE
384
385Turns on debug output related to the process of parsing the pattern.
386
387=item OPTIMISE
388
389Enables output related to the optimisation phase of compilation.
390
24b23f37 391=item TRIEC
be8e71aa
YO
392
393Detailed info about trie compilation.
394
395=item DUMP
396
397Dump the final program out after it is compiled and optimised.
398
be8e71aa
YO
399=back
400
401=item Execute related options
402
403=over 4
404
405=item EXECUTE
406
407Turns on all execute related debug options.
408
409=item MATCH
410
411Turns on debugging of the main matching loop.
412
24b23f37 413=item TRIEE
be8e71aa
YO
414
415Extra debugging of how tries execute.
416
417=item INTUIT
418
48fe68f5 419Enable debugging of start-point optimisations.
be8e71aa
YO
420
421=back
422
423=item Extra debugging options
424
425=over 4
426
427=item EXTRA
428
429Turns on all "extra" debugging options.
430
e7707071
YO
431=item BUFFERS
432
c27a5cfe 433Enable debugging the capture group storage during match. Warning,
e7707071
YO
434this can potentially produce extremely large output.
435
24b23f37
YO
436=item TRIEM
437
438Enable enhanced TRIE debugging. Enhances both TRIEE
439and TRIEC.
440
441=item STATE
442
4ee9a43f 443Enable debugging of states in the engine.
24b23f37
YO
444
445=item STACK
be8e71aa 446
24b23f37
YO
447Enable debugging of the recursion stack in the engine. Enabling
448or disabling this option automatically does the same for debugging
449states as well. This output from this can be quite large.
450
451=item OPTIMISEM
452
48fe68f5 453Enable enhanced optimisation debugging and start-point optimisations.
99cc5cc6 454Probably not useful except when debugging the regexp engine itself.
24b23f37
YO
455
456=item OFFSETS
457
458Dump offset information. This can be used to see how regops correlate
459to the pattern. Output format is
460
461 NODENUM:POSITION[LENGTH]
462
463Where 1 is the position of the first char in the string. Note that position
464can be 0, or larger than the actual length of the pattern, likewise length
465can be zero.
be8e71aa 466
24b23f37 467=item OFFSETSDBG
be8e71aa
YO
468
469Enable debugging of offsets information. This emits copious
fe759410 470amounts of trace information and doesn't mesh well with other
be8e71aa
YO
471debug options.
472
fe759410 473Almost definitely only useful to people hacking
be8e71aa
YO
474on the offsets part of the debug engine.
475
476=back
477
478=item Other useful flags
479
480These are useful shortcuts to save on the typing.
481
482=over 4
483
484=item ALL
485
48fe68f5
KW
486Enable all options at once except OFFSETS, OFFSETSDBG and BUFFERS.
487(To get every single option without exception, use both ALL and EXTRA.)
be8e71aa
YO
488
489=item All
490
fe759410 491Enable DUMP and all execute options. Equivalent to:
be8e71aa
YO
492
493 use re 'debug';
494
495=item MORE
496
497=item More
498
48fe68f5 499Enable the options enabled by "All", plus STATE, TRIEC, and TRIEM.
be8e71aa 500
dba3f186 501=back
be8e71aa 502
dba3f186 503=back
a3621e74 504
1e2e3d02 505As of 5.9.5 the directive C<use re 'debug'> and its equivalents are
48fe68f5 506lexically scoped, as are the other directives. However they have both
1e2e3d02 507compile-time and run-time effects.
b3eb6a9b 508
de8c5301 509=head2 Exportable Functions
b3eb6a9b 510
de8c5301 511As of perl 5.9.5 're' debug contains a number of utility functions that
4ee9a43f 512may be optionally exported into the caller's namespace. They are listed
de8c5301 513below.
b3eb6a9b 514
de8c5301 515=over 4
b3eb6a9b 516
de8c5301 517=item is_regexp($ref)
02ea72ae 518
de8c5301 519Returns true if the argument is a compiled regular expression as returned
4ee9a43f 520by C<qr//>, false if it is not.
02ea72ae 521
4ee9a43f
RGS
522This function will not be confused by overloading or blessing. In
523internals terms, this extracts the regexp pointer out of the
3a5e0888 524PERL_MAGIC_qr structure so it cannot be fooled.
894be9b7 525
de8c5301 526=item regexp_pattern($ref)
02ea72ae 527
4ee9a43f
RGS
528If the argument is a compiled regular expression as returned by C<qr//>,
529then this function returns the pattern.
be8e71aa 530
4ee9a43f
RGS
531In list context it returns a two element list, the first element
532containing the pattern and the second containing the modifiers used when
533the pattern was compiled.
be8e71aa 534
4ee9a43f 535 my ($pat, $mods) = regexp_pattern($ref);
a3621e74 536
99cc5cc6 537In scalar context it returns the same as perl would when stringifying a raw
4ee9a43f
RGS
538C<qr//> with the same pattern inside. If the argument is not a compiled
539reference then this routine returns false but defined in scalar context,
540and the empty list in list context. Thus the following
f9f4320a 541
dff5e0c4 542 if (regexp_pattern($ref) eq '(?^i:foo)')
dba3f186 543
de8c5301 544will be warning free regardless of what $ref actually is.
380e0b81 545
4ee9a43f
RGS
546Like C<is_regexp> this function will not be confused by overloading
547or blessing of the object.
b3eb6a9b 548
256ddcd0
YO
549=item regmust($ref)
550
432acd5f 551If the argument is a compiled regular expression as returned by C<qr//>,
99cc5cc6 552then this function returns what the optimiser considers to be the longest
432acd5f
RGS
553anchored fixed string and longest floating fixed string in the pattern.
554
555A I<fixed string> is defined as being a substring that must appear for the
556pattern to match. An I<anchored fixed string> is a fixed string that must
557appear at a particular offset from the beginning of the match. A I<floating
558fixed string> is defined as a fixed string that can appear at any point in
559a range of positions relative to the start of the match. For example,
560
561 my $qr = qr/here .* there/x;
562 my ($anchored, $floating) = regmust($qr);
256ddcd0 563 print "anchored:'$anchored'\nfloating:'$floating'\n";
432acd5f 564
256ddcd0
YO
565results in
566
567 anchored:'here'
568 floating:'there'
569
432acd5f
RGS
570Because the C<here> is before the C<.*> in the pattern, its position
571can be determined exactly. That's not true, however, for the C<there>;
572it could appear at any point after where the anchored string appeared.
d952710b 573Perl uses both for its optimisations, preferring the longer, or, if they are
256ddcd0
YO
574equal, the floating.
575
576B<NOTE:> This may not necessarily be the definitive longest anchored and
432acd5f 577floating string. This will be what the optimiser of the Perl that you
256ddcd0
YO
578are using thinks is the longest. If you believe that the result is wrong
579please report it via the L<perlbug> utility.
580
28d8d7f4 581=item regname($name,$all)
44a2ac75 582
28d8d7f4
YO
583Returns the contents of a named buffer of the last successful match. If
584$all is true, then returns an array ref containing one entry per buffer,
44a2ac75
YO
585otherwise returns the first defined buffer.
586
28d8d7f4 587=item regnames($all)
44a2ac75 588
28d8d7f4
YO
589Returns a list of all of the named buffers defined in the last successful
590match. If $all is true, then it returns all names defined, if not it returns
591only names which were involved in the match.
44a2ac75 592
28d8d7f4 593=item regnames_count()
44a2ac75 594
28d8d7f4
YO
595Returns the number of distinct names defined in the pattern used
596for the last successful match.
44a2ac75 597
28d8d7f4
YO
598B<Note:> this result is always the actual number of distinct
599named buffers defined, it may not actually match that which is
600returned by C<regnames()> and related routines when those routines
601have not been called with the $all parameter set.
44a2ac75 602
de8c5301 603=back
b3eb6a9b 604
de8c5301 605=head1 SEE ALSO
b3eb6a9b 606
de8c5301
YO
607L<perlmodlib/Pragmatic Modules>.
608
609=cut