This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: Add 'strict' parameter to S_regclass()
[perl5.git] / ext / re / re.pm
CommitLineData
b3eb6a9b
GS
1package re;
2
99cc5cc6 3# pragma for controlling the regexp engine
de8c5301
YO
4use strict;
5use warnings;
6
cf4f23c7 7our $VERSION = "0.29";
de8c5301 8our @ISA = qw(Exporter);
ec781434 9our @EXPORT_OK = ('regmust',
192c1e27
JH
10 qw(is_regexp regexp_pattern
11 regname regnames regnames_count));
de8c5301
YO
12our %EXPORT_OK = map { $_ => 1 } @EXPORT_OK;
13
de8c5301
YO
14my %bitmask = (
15 taint => 0x00100000, # HINT_RE_TAINT
16 eval => 0x00200000, # HINT_RE_EVAL
17);
18
1e215989
FC
19my $flags_hint = 0x02000000; # HINT_RE_FLAGS
20my $PMMOD_SHIFT = 0;
21my %reflags = (
22 m => 1 << ($PMMOD_SHIFT + 0),
23 s => 1 << ($PMMOD_SHIFT + 1),
24 i => 1 << ($PMMOD_SHIFT + 2),
25 x => 1 << ($PMMOD_SHIFT + 3),
41d7c59e
MH
26 n => 1 << ($PMMOD_SHIFT + 5),
27 p => 1 << ($PMMOD_SHIFT + 6),
1e215989 28# special cases:
1e215989 29 d => 0,
a62b1201
KW
30 l => 1,
31 u => 2,
cfaf538b 32 a => 3,
b4ab316d 33 aa => 4,
1e215989
FC
34);
35
de8c5301
YO
36sub setcolor {
37 eval { # Ignore errors
38 require Term::Cap;
39
40 my $terminal = Tgetent Term::Cap ({OSPEED => 9600}); # Avoid warning.
41 my $props = $ENV{PERL_RE_TC} || 'md,me,so,se,us,ue';
42 my @props = split /,/, $props;
43 my $colors = join "\t", map {$terminal->Tputs($_,1)} @props;
44
45 $colors =~ s/\0//g;
46 $ENV{PERL_RE_COLORS} = $colors;
47 };
48 if ($@) {
49 $ENV{PERL_RE_COLORS} ||= qq'\t\t> <\t> <\t\t';
50 }
51
52}
53
54my %flags = (
55 COMPILE => 0x0000FF,
56 PARSE => 0x000001,
57 OPTIMISE => 0x000002,
58 TRIEC => 0x000004,
59 DUMP => 0x000008,
f7819f85 60 FLAGS => 0x000010,
d9a72fcc 61 TEST => 0x000020,
de8c5301
YO
62
63 EXECUTE => 0x00FF00,
64 INTUIT => 0x000100,
65 MATCH => 0x000200,
66 TRIEE => 0x000400,
67
68 EXTRA => 0xFF0000,
69 TRIEM => 0x010000,
70 OFFSETS => 0x020000,
71 OFFSETSDBG => 0x040000,
72 STATE => 0x080000,
73 OPTIMISEM => 0x100000,
74 STACK => 0x280000,
e7707071 75 BUFFERS => 0x400000,
2c296965 76 GPOS => 0x800000,
de8c5301 77);
e7707071 78$flags{ALL} = -1 & ~($flags{OFFSETS}|$flags{OFFSETSDBG}|$flags{BUFFERS});
de8c5301 79$flags{All} = $flags{all} = $flags{DUMP} | $flags{EXECUTE};
2c296965 80$flags{Extra} = $flags{EXECUTE} | $flags{COMPILE} | $flags{GPOS};
de8c5301
YO
81$flags{More} = $flags{MORE} = $flags{All} | $flags{TRIEC} | $flags{TRIEM} | $flags{STATE};
82$flags{State} = $flags{DUMP} | $flags{EXECUTE} | $flags{STATE};
83$flags{TRIE} = $flags{DUMP} | $flags{EXECUTE} | $flags{TRIEC};
84
ec781434
NC
85if (defined &DynaLoader::boot_DynaLoader) {
86 require XSLoader;
da4061d3 87 XSLoader::load();
de8c5301 88}
ec781434
NC
89# else we're miniperl
90# We need to work for miniperl, because the XS toolchain uses Text::Wrap, which
91# uses re 'taint'.
de8c5301
YO
92
93sub _load_unload {
94 my ($on)= @_;
95 if ($on) {
ec781434
NC
96 # We call install() every time, as if we didn't, we wouldn't
97 # "see" any changes to the color environment var since
98 # the last time it was called.
99
100 # install() returns an integer, which if casted properly
99cc5cc6 101 # in C resolves to a structure containing the regexp
ec781434
NC
102 # hooks. Setting it to a random integer will guarantee
103 # segfaults.
104 $^H{regcomp} = install();
de8c5301
YO
105 } else {
106 delete $^H{regcomp};
107 }
108}
109
110sub bits {
111 my $on = shift;
112 my $bits = 0;
cc4d09e1 113 my %seen; # Has flag already been seen?
1e215989 114 ARG:
de8c5301
YO
115 foreach my $idx (0..$#_){
116 my $s=$_[$idx];
117 if ($s eq 'Debug' or $s eq 'Debugcolor') {
118 setcolor() if $s =~/color/i;
119 ${^RE_DEBUG_FLAGS} = 0 unless defined ${^RE_DEBUG_FLAGS};
120 for my $idx ($idx+1..$#_) {
121 if ($flags{$_[$idx]}) {
122 if ($on) {
123 ${^RE_DEBUG_FLAGS} |= $flags{$_[$idx]};
124 } else {
125 ${^RE_DEBUG_FLAGS} &= ~ $flags{$_[$idx]};
126 }
127 } else {
128 require Carp;
129 Carp::carp("Unknown \"re\" Debug flag '$_[$idx]', possible flags: ",
130 join(", ",sort keys %flags ) );
131 }
132 }
133 _load_unload($on ? 1 : ${^RE_DEBUG_FLAGS});
134 last;
135 } elsif ($s eq 'debug' or $s eq 'debugcolor') {
136 setcolor() if $s =~/color/i;
137 _load_unload($on);
66e6b4c5 138 last;
de8c5301
YO
139 } elsif (exists $bitmask{$s}) {
140 $bits |= $bitmask{$s};
141 } elsif ($EXPORT_OK{$s}) {
de8c5301
YO
142 require Exporter;
143 re->export_to_level(2, 're', $s);
1e215989
FC
144 } elsif ($s =~ s/^\///) {
145 my $reflags = $^H{reflags} || 0;
6320bfaf 146 my $seen_charset;
342c8524 147 while ($s =~ m/( . )/gx) {
48895a0d 148 local $_ = $1;
cfaf538b 149 if (/[adul]/) {
342c8524
KW
150 # The 'a' may be repeated; hide this from the rest of the
151 # code by counting and getting rid of all of them, then
152 # changing to 'aa' if there is a repeat.
153 if ($_ eq 'a') {
154 my $sav_pos = pos $s;
155 my $a_count = $s =~ s/a//g;
156 pos $s = $sav_pos - 1; # -1 because got rid of the 'a'
157 if ($a_count > 2) {
158 require Carp;
159 Carp::carp(
160 qq 'The "a" flag may only appear a maximum of twice'
161 );
162 }
163 elsif ($a_count == 2) {
164 $_ = 'aa';
165 }
166 }
1e215989 167 if ($on) {
45a507fa 168 if ($seen_charset) {
96ef02be 169 require Carp;
45a507fa
KW
170 if ($seen_charset ne $_) {
171 Carp::carp(
172 qq 'The "$seen_charset" and "$_" flags '
173 .qq 'are exclusive'
174 );
175 }
176 else {
177 Carp::carp(
178 qq 'The "$seen_charset" flag may not appear '
179 .qq 'twice'
180 );
181 }
96ef02be 182 }
6320bfaf
KW
183 $^H{reflags_charset} = $reflags{$_};
184 $seen_charset = $_;
1e215989
FC
185 }
186 else {
6320bfaf
KW
187 delete $^H{reflags_charset}
188 if defined $^H{reflags_charset}
189 && $^H{reflags_charset} == $reflags{$_};
1e215989
FC
190 }
191 } elsif (exists $reflags{$_}) {
cc4d09e1
KW
192 $seen{$_}++;
193 $on
1e215989
FC
194 ? $reflags |= $reflags{$_}
195 : ($reflags &= ~$reflags{$_});
196 } else {
197 require Carp;
198 Carp::carp(
199 qq'Unknown regular expression flag "$_"'
200 );
201 next ARG;
202 }
203 }
6320bfaf 204 ($^H{reflags} = $reflags or defined $^H{reflags_charset})
1e215989
FC
205 ? $^H |= $flags_hint
206 : ($^H &= ~$flags_hint);
de8c5301
YO
207 } else {
208 require Carp;
209 Carp::carp("Unknown \"re\" subpragma '$s' (known ones are: ",
210 join(', ', map {qq('$_')} 'debug', 'debugcolor', sort keys %bitmask),
211 ")");
212 }
213 }
cc4d09e1
KW
214 if (exists $seen{'x'} && $seen{'x'} > 1
215 && (warnings::enabled("deprecated")
216 || warnings::enabled("regexp")))
217 {
218 my $message = "Having more than one /x regexp modifier is deprecated";
219 if (warnings::enabled("deprecated")) {
220 warnings::warn("deprecated", $message);
221 }
222 else {
223 warnings::warn("regexp", $message);
224 }
225 }
de8c5301
YO
226 $bits;
227}
228
229sub import {
230 shift;
231 $^H |= bits(1, @_);
232}
233
234sub unimport {
235 shift;
236 $^H &= ~ bits(0, @_);
237}
238
2391;
240
241__END__
56953603 242
b3eb6a9b
GS
243=head1 NAME
244
245re - Perl pragma to alter regular expression behaviour
246
247=head1 SYNOPSIS
248
e4d48cc9
GS
249 use re 'taint';
250 ($x) = ($^X =~ /^(.*)$/s); # $x is tainted here
b3eb6a9b 251
2cd61cdb 252 $pat = '(?{ $foo = 1 })';
e4d48cc9 253 use re 'eval';
48fe68f5
KW
254 /foo${pat}bar/; # won't fail (when not under -T
255 # switch)
e4d48cc9
GS
256
257 {
258 no re 'taint'; # the default
259 ($x) = ($^X =~ /^(.*)$/s); # $x is not tainted here
260
261 no re 'eval'; # the default
48fe68f5
KW
262 /foo${pat}bar/; # disallowed (with or without -T
263 # switch)
e4d48cc9 264 }
b3eb6a9b 265
1e215989
FC
266 use re '/ix';
267 "FOO" =~ / foo /; # /ix implied
268 no re '/x';
269 "FOO" =~ /foo/; # just /i implied
270
1e2e3d02 271 use re 'debug'; # output debugging info during
48fe68f5 272 /^(.*)$/s; # compile and run time
1e2e3d02 273
2cd61cdb 274
48fe68f5
KW
275 use re 'debugcolor'; # same as 'debug', but with colored
276 # output
02ea72ae
IZ
277 ...
278
48fe68f5
KW
279 use re qw(Debug All); # Same as "use re 'debug'", but you
280 # can use "Debug" with things other
281 # than 'All'
282 use re qw(Debug More); # 'All' plus output more details
283 no re qw(Debug ALL); # Turn on (almost) all re debugging
284 # in this scope
4ee9a43f 285
de8c5301
YO
286 use re qw(is_regexp regexp_pattern); # import utility functions
287 my ($pat,$mods)=regexp_pattern(qr/foo/i);
288 if (is_regexp($obj)) {
289 print "Got regexp: ",
48fe68f5
KW
290 scalar regexp_pattern($obj); # just as perl would stringify
291 } # it but no hassle with blessed
292 # re's.
a3621e74 293
3ffabb8c
GS
294(We use $^X in these examples because it's tainted by default.)
295
b3eb6a9b
GS
296=head1 DESCRIPTION
297
de8c5301
YO
298=head2 'taint' mode
299
b3eb6a9b 300When C<use re 'taint'> is in effect, and a tainted string is the target
99cc5cc6
A
301of a regexp, the regexp memories (or values returned by the m// operator
302in list context) are tainted. This feature is useful when regexp operations
e4d48cc9
GS
303on tainted data aren't meant to extract safe substrings, but to perform
304other transformations.
b3eb6a9b 305
de8c5301
YO
306=head2 'eval' mode
307
99cc5cc6 308When C<use re 'eval'> is in effect, a regexp is allowed to contain
0b370c0a 309C<(?{ ... })> zero-width assertions and C<(??{ ... })> postponed
e128ab2c
DM
310subexpressions that are derived from variable interpolation, rather than
311appearing literally within the regexp. That is normally disallowed, since
312it is a
2cd61cdb
IZ
313potential security risk. Note that this pragma is ignored when the regular
314expression is obtained from tainted data, i.e. evaluation is always
0b370c0a 315disallowed with tainted regular expressions. See L<perlre/(?{ code })>
bb1773de 316and L<perlre/(??{ code })>.
2cd61cdb 317
ffbc6a93 318For the purpose of this pragma, interpolation of precompiled regular
0a92e3a8
GS
319expressions (i.e., the result of C<qr//>) is I<not> considered variable
320interpolation. Thus:
2cd61cdb
IZ
321
322 /foo${pat}bar/
323
ffbc6a93 324I<is> allowed if $pat is a precompiled regular expression, even
0b370c0a 325if $pat contains C<(?{ ... })> assertions or C<(??{ ... })> subexpressions.
2cd61cdb 326
1e215989
FC
327=head2 '/flags' mode
328
329When C<use re '/flags'> is specified, the given flags are automatically
330added to every regular expression till the end of the lexical scope.
331
332C<no re '/flags'> will turn off the effect of C<use re '/flags'> for the
333given flags.
334
335For example, if you want all your regular expressions to have /msx on by
336default, simply put
337
338 use re '/msx';
339
340at the top of your code.
341
cfaf538b 342The character set /adul flags cancel each other out. So, in this example,
1e215989
FC
343
344 use re "/u";
345 "ss" =~ /\xdf/;
346 use re "/d";
347 "ss" =~ /\xdf/;
348
4d220a7d 349the second C<use re> does an implicit C<no re '/u'>.
1e215989 350
59640339 351Turning on one of the character set flags with C<use re> takes precedence over the
1e215989
FC
352C<locale> pragma and the 'unicode_strings' C<feature>, for regular
353expressions. Turning off one of these flags when it is active reverts to
354the behaviour specified by whatever other pragmata are in scope. For
355example:
356
357 use feature "unicode_strings";
358 no re "/u"; # does nothing
359 use re "/l";
360 no re "/l"; # reverts to unicode_strings behaviour
361
de8c5301
YO
362=head2 'debug' mode
363
ffbc6a93 364When C<use re 'debug'> is in effect, perl emits debugging messages when
2cd61cdb
IZ
365compiling and using regular expressions. The output is the same as that
366obtained by running a C<-DDEBUGGING>-enabled perl interpreter with the
367B<-Dr> switch. It may be quite voluminous depending on the complexity
02ea72ae
IZ
368of the match. Using C<debugcolor> instead of C<debug> enables a
369form of output that can be used to get a colorful display on terminals
370that understand termcap color sequences. Set C<$ENV{PERL_RE_TC}> to a
371comma-separated list of C<termcap> properties to use for highlighting
ffbc6a93 372strings on/off, pre-point part on/off.
57e8c15d 373See L<perldebug/"Debugging Regular Expressions"> for additional info.
2cd61cdb 374
de8c5301
YO
375As of 5.9.5 the directive C<use re 'debug'> and its equivalents are
376lexically scoped, as the other directives are. However they have both
377compile-time and run-time effects.
378
379See L<perlmodlib/Pragmatic Modules>.
380
381=head2 'Debug' mode
382
a3621e74
YO
383Similarly C<use re 'Debug'> produces debugging output, the difference
384being that it allows the fine tuning of what debugging output will be
be8e71aa
YO
385emitted. Options are divided into three groups, those related to
386compilation, those related to execution and those related to special
387purposes. The options are as follows:
388
389=over 4
390
391=item Compile related options
392
393=over 4
394
395=item COMPILE
396
397Turns on all compile related debug options.
398
399=item PARSE
400
401Turns on debug output related to the process of parsing the pattern.
402
403=item OPTIMISE
404
405Enables output related to the optimisation phase of compilation.
406
24b23f37 407=item TRIEC
be8e71aa
YO
408
409Detailed info about trie compilation.
410
411=item DUMP
412
413Dump the final program out after it is compiled and optimised.
414
d9a72fcc
YO
415=item FLAGS
416
417Dump the flags associated with the program
418
419=item TEST
420
421Print output intended for testing the internals of the compile process
422
be8e71aa
YO
423=back
424
425=item Execute related options
426
427=over 4
428
429=item EXECUTE
430
431Turns on all execute related debug options.
432
433=item MATCH
434
435Turns on debugging of the main matching loop.
436
24b23f37 437=item TRIEE
be8e71aa
YO
438
439Extra debugging of how tries execute.
440
441=item INTUIT
442
48fe68f5 443Enable debugging of start-point optimisations.
be8e71aa
YO
444
445=back
446
447=item Extra debugging options
448
449=over 4
450
451=item EXTRA
452
453Turns on all "extra" debugging options.
454
e7707071
YO
455=item BUFFERS
456
c27a5cfe 457Enable debugging the capture group storage during match. Warning,
e7707071
YO
458this can potentially produce extremely large output.
459
24b23f37
YO
460=item TRIEM
461
462Enable enhanced TRIE debugging. Enhances both TRIEE
463and TRIEC.
464
465=item STATE
466
4ee9a43f 467Enable debugging of states in the engine.
24b23f37
YO
468
469=item STACK
be8e71aa 470
24b23f37
YO
471Enable debugging of the recursion stack in the engine. Enabling
472or disabling this option automatically does the same for debugging
473states as well. This output from this can be quite large.
474
d9a72fcc
YO
475=item GPOS
476
477Enable debugging of the \G modifier.
478
24b23f37
YO
479=item OPTIMISEM
480
48fe68f5 481Enable enhanced optimisation debugging and start-point optimisations.
99cc5cc6 482Probably not useful except when debugging the regexp engine itself.
24b23f37
YO
483
484=item OFFSETS
485
486Dump offset information. This can be used to see how regops correlate
487to the pattern. Output format is
488
489 NODENUM:POSITION[LENGTH]
490
491Where 1 is the position of the first char in the string. Note that position
492can be 0, or larger than the actual length of the pattern, likewise length
493can be zero.
be8e71aa 494
24b23f37 495=item OFFSETSDBG
be8e71aa
YO
496
497Enable debugging of offsets information. This emits copious
fe759410 498amounts of trace information and doesn't mesh well with other
be8e71aa
YO
499debug options.
500
fe759410 501Almost definitely only useful to people hacking
be8e71aa
YO
502on the offsets part of the debug engine.
503
d9a72fcc 504
be8e71aa
YO
505=back
506
507=item Other useful flags
508
509These are useful shortcuts to save on the typing.
510
511=over 4
512
513=item ALL
514
48fe68f5
KW
515Enable all options at once except OFFSETS, OFFSETSDBG and BUFFERS.
516(To get every single option without exception, use both ALL and EXTRA.)
be8e71aa
YO
517
518=item All
519
fe759410 520Enable DUMP and all execute options. Equivalent to:
be8e71aa
YO
521
522 use re 'debug';
523
524=item MORE
525
526=item More
527
48fe68f5 528Enable the options enabled by "All", plus STATE, TRIEC, and TRIEM.
be8e71aa 529
dba3f186 530=back
be8e71aa 531
dba3f186 532=back
a3621e74 533
1e2e3d02 534As of 5.9.5 the directive C<use re 'debug'> and its equivalents are
48fe68f5 535lexically scoped, as are the other directives. However they have both
1e2e3d02 536compile-time and run-time effects.
b3eb6a9b 537
de8c5301 538=head2 Exportable Functions
b3eb6a9b 539
de8c5301 540As of perl 5.9.5 're' debug contains a number of utility functions that
4ee9a43f 541may be optionally exported into the caller's namespace. They are listed
de8c5301 542below.
b3eb6a9b 543
de8c5301 544=over 4
b3eb6a9b 545
de8c5301 546=item is_regexp($ref)
02ea72ae 547
de8c5301 548Returns true if the argument is a compiled regular expression as returned
4ee9a43f 549by C<qr//>, false if it is not.
02ea72ae 550
4ee9a43f
RGS
551This function will not be confused by overloading or blessing. In
552internals terms, this extracts the regexp pointer out of the
3a5e0888 553PERL_MAGIC_qr structure so it cannot be fooled.
894be9b7 554
de8c5301 555=item regexp_pattern($ref)
02ea72ae 556
4ee9a43f
RGS
557If the argument is a compiled regular expression as returned by C<qr//>,
558then this function returns the pattern.
be8e71aa 559
4ee9a43f
RGS
560In list context it returns a two element list, the first element
561containing the pattern and the second containing the modifiers used when
562the pattern was compiled.
be8e71aa 563
4ee9a43f 564 my ($pat, $mods) = regexp_pattern($ref);
a3621e74 565
99cc5cc6 566In scalar context it returns the same as perl would when stringifying a raw
4ee9a43f
RGS
567C<qr//> with the same pattern inside. If the argument is not a compiled
568reference then this routine returns false but defined in scalar context,
569and the empty list in list context. Thus the following
f9f4320a 570
dff5e0c4 571 if (regexp_pattern($ref) eq '(?^i:foo)')
dba3f186 572
de8c5301 573will be warning free regardless of what $ref actually is.
380e0b81 574
4ee9a43f
RGS
575Like C<is_regexp> this function will not be confused by overloading
576or blessing of the object.
b3eb6a9b 577
256ddcd0
YO
578=item regmust($ref)
579
432acd5f 580If the argument is a compiled regular expression as returned by C<qr//>,
99cc5cc6 581then this function returns what the optimiser considers to be the longest
432acd5f
RGS
582anchored fixed string and longest floating fixed string in the pattern.
583
584A I<fixed string> is defined as being a substring that must appear for the
585pattern to match. An I<anchored fixed string> is a fixed string that must
586appear at a particular offset from the beginning of the match. A I<floating
587fixed string> is defined as a fixed string that can appear at any point in
588a range of positions relative to the start of the match. For example,
589
590 my $qr = qr/here .* there/x;
591 my ($anchored, $floating) = regmust($qr);
256ddcd0 592 print "anchored:'$anchored'\nfloating:'$floating'\n";
432acd5f 593
256ddcd0
YO
594results in
595
596 anchored:'here'
597 floating:'there'
598
432acd5f
RGS
599Because the C<here> is before the C<.*> in the pattern, its position
600can be determined exactly. That's not true, however, for the C<there>;
601it could appear at any point after where the anchored string appeared.
d952710b 602Perl uses both for its optimisations, preferring the longer, or, if they are
256ddcd0
YO
603equal, the floating.
604
605B<NOTE:> This may not necessarily be the definitive longest anchored and
432acd5f 606floating string. This will be what the optimiser of the Perl that you
256ddcd0
YO
607are using thinks is the longest. If you believe that the result is wrong
608please report it via the L<perlbug> utility.
609
28d8d7f4 610=item regname($name,$all)
44a2ac75 611
28d8d7f4
YO
612Returns the contents of a named buffer of the last successful match. If
613$all is true, then returns an array ref containing one entry per buffer,
44a2ac75
YO
614otherwise returns the first defined buffer.
615
28d8d7f4 616=item regnames($all)
44a2ac75 617
28d8d7f4
YO
618Returns a list of all of the named buffers defined in the last successful
619match. If $all is true, then it returns all names defined, if not it returns
620only names which were involved in the match.
44a2ac75 621
28d8d7f4 622=item regnames_count()
44a2ac75 623
28d8d7f4
YO
624Returns the number of distinct names defined in the pattern used
625for the last successful match.
44a2ac75 626
28d8d7f4
YO
627B<Note:> this result is always the actual number of distinct
628named buffers defined, it may not actually match that which is
629returned by C<regnames()> and related routines when those routines
630have not been called with the $all parameter set.
44a2ac75 631
de8c5301 632=back
b3eb6a9b 633
de8c5301 634=head1 SEE ALSO
b3eb6a9b 635
de8c5301
YO
636L<perlmodlib/Pragmatic Modules>.
637
638=cut