This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
re.pm: correct typo
[perl5.git] / ext / re / re.pm
CommitLineData
b3eb6a9b
GS
1package re;
2
99cc5cc6 3# pragma for controlling the regexp engine
de8c5301
YO
4use strict;
5use warnings;
6
94a75ecf 7our $VERSION = "0.15";
de8c5301 8our @ISA = qw(Exporter);
ec781434 9our @EXPORT_OK = ('regmust',
192c1e27
JH
10 qw(is_regexp regexp_pattern
11 regname regnames regnames_count));
de8c5301
YO
12our %EXPORT_OK = map { $_ => 1 } @EXPORT_OK;
13
de8c5301
YO
14my %bitmask = (
15 taint => 0x00100000, # HINT_RE_TAINT
16 eval => 0x00200000, # HINT_RE_EVAL
17);
18
1e215989
FC
19my $flags_hint = 0x02000000; # HINT_RE_FLAGS
20my $PMMOD_SHIFT = 0;
21my %reflags = (
22 m => 1 << ($PMMOD_SHIFT + 0),
23 s => 1 << ($PMMOD_SHIFT + 1),
24 i => 1 << ($PMMOD_SHIFT + 2),
25 x => 1 << ($PMMOD_SHIFT + 3),
26 p => 1 << ($PMMOD_SHIFT + 4),
27# special cases:
28 l => 1 << ($PMMOD_SHIFT + 5),
29 u => 1 << ($PMMOD_SHIFT + 6),
30 d => 0,
31);
32
de8c5301
YO
33sub setcolor {
34 eval { # Ignore errors
35 require Term::Cap;
36
37 my $terminal = Tgetent Term::Cap ({OSPEED => 9600}); # Avoid warning.
38 my $props = $ENV{PERL_RE_TC} || 'md,me,so,se,us,ue';
39 my @props = split /,/, $props;
40 my $colors = join "\t", map {$terminal->Tputs($_,1)} @props;
41
42 $colors =~ s/\0//g;
43 $ENV{PERL_RE_COLORS} = $colors;
44 };
45 if ($@) {
46 $ENV{PERL_RE_COLORS} ||= qq'\t\t> <\t> <\t\t';
47 }
48
49}
50
51my %flags = (
52 COMPILE => 0x0000FF,
53 PARSE => 0x000001,
54 OPTIMISE => 0x000002,
55 TRIEC => 0x000004,
56 DUMP => 0x000008,
f7819f85 57 FLAGS => 0x000010,
de8c5301
YO
58
59 EXECUTE => 0x00FF00,
60 INTUIT => 0x000100,
61 MATCH => 0x000200,
62 TRIEE => 0x000400,
63
64 EXTRA => 0xFF0000,
65 TRIEM => 0x010000,
66 OFFSETS => 0x020000,
67 OFFSETSDBG => 0x040000,
68 STATE => 0x080000,
69 OPTIMISEM => 0x100000,
70 STACK => 0x280000,
e7707071 71 BUFFERS => 0x400000,
2c296965 72 GPOS => 0x800000,
de8c5301 73);
e7707071 74$flags{ALL} = -1 & ~($flags{OFFSETS}|$flags{OFFSETSDBG}|$flags{BUFFERS});
de8c5301 75$flags{All} = $flags{all} = $flags{DUMP} | $flags{EXECUTE};
2c296965 76$flags{Extra} = $flags{EXECUTE} | $flags{COMPILE} | $flags{GPOS};
de8c5301
YO
77$flags{More} = $flags{MORE} = $flags{All} | $flags{TRIEC} | $flags{TRIEM} | $flags{STATE};
78$flags{State} = $flags{DUMP} | $flags{EXECUTE} | $flags{STATE};
79$flags{TRIE} = $flags{DUMP} | $flags{EXECUTE} | $flags{TRIEC};
80
ec781434
NC
81if (defined &DynaLoader::boot_DynaLoader) {
82 require XSLoader;
da4061d3 83 XSLoader::load();
de8c5301 84}
ec781434
NC
85# else we're miniperl
86# We need to work for miniperl, because the XS toolchain uses Text::Wrap, which
87# uses re 'taint'.
de8c5301
YO
88
89sub _load_unload {
90 my ($on)= @_;
91 if ($on) {
ec781434
NC
92 # We call install() every time, as if we didn't, we wouldn't
93 # "see" any changes to the color environment var since
94 # the last time it was called.
95
96 # install() returns an integer, which if casted properly
99cc5cc6 97 # in C resolves to a structure containing the regexp
ec781434
NC
98 # hooks. Setting it to a random integer will guarantee
99 # segfaults.
100 $^H{regcomp} = install();
de8c5301
YO
101 } else {
102 delete $^H{regcomp};
103 }
104}
105
106sub bits {
107 my $on = shift;
108 my $bits = 0;
109 unless (@_) {
110 require Carp;
111 Carp::carp("Useless use of \"re\" pragma");
112 }
1e215989 113 ARG:
de8c5301
YO
114 foreach my $idx (0..$#_){
115 my $s=$_[$idx];
116 if ($s eq 'Debug' or $s eq 'Debugcolor') {
117 setcolor() if $s =~/color/i;
118 ${^RE_DEBUG_FLAGS} = 0 unless defined ${^RE_DEBUG_FLAGS};
119 for my $idx ($idx+1..$#_) {
120 if ($flags{$_[$idx]}) {
121 if ($on) {
122 ${^RE_DEBUG_FLAGS} |= $flags{$_[$idx]};
123 } else {
124 ${^RE_DEBUG_FLAGS} &= ~ $flags{$_[$idx]};
125 }
126 } else {
127 require Carp;
128 Carp::carp("Unknown \"re\" Debug flag '$_[$idx]', possible flags: ",
129 join(", ",sort keys %flags ) );
130 }
131 }
132 _load_unload($on ? 1 : ${^RE_DEBUG_FLAGS});
133 last;
134 } elsif ($s eq 'debug' or $s eq 'debugcolor') {
135 setcolor() if $s =~/color/i;
136 _load_unload($on);
66e6b4c5 137 last;
de8c5301
YO
138 } elsif (exists $bitmask{$s}) {
139 $bits |= $bitmask{$s};
140 } elsif ($EXPORT_OK{$s}) {
de8c5301
YO
141 require Exporter;
142 re->export_to_level(2, 're', $s);
1e215989
FC
143 } elsif ($s =~ s/^\///) {
144 my $reflags = $^H{reflags} || 0;
96ef02be 145 my $seen_dul;
1e215989
FC
146 for(split//, $s) {
147 if (/[dul]/) {
148 if ($on) {
96ef02be
FC
149 if ($seen_dul && $seen_dul ne $_) {
150 require Carp;
151 Carp::carp(
152 qq 'The "$seen_dul" and "$_" flags '
153 .qq 'are exclusive'
154 );
155 }
1e215989 156 $^H{reflags_dul} = $reflags{$_};
96ef02be 157 $seen_dul = $_;
1e215989
FC
158 }
159 else {
160 delete $^H{reflags_dul}
161 if defined $^H{reflags_dul}
162 && $^H{reflags_dul} == $reflags{$_};
163 }
164 } elsif (exists $reflags{$_}) {
165 $on
166 ? $reflags |= $reflags{$_}
167 : ($reflags &= ~$reflags{$_});
168 } else {
169 require Carp;
170 Carp::carp(
171 qq'Unknown regular expression flag "$_"'
172 );
173 next ARG;
174 }
175 }
176 ($^H{reflags} = $reflags or defined $^H{reflags_dul})
177 ? $^H |= $flags_hint
178 : ($^H &= ~$flags_hint);
de8c5301
YO
179 } else {
180 require Carp;
181 Carp::carp("Unknown \"re\" subpragma '$s' (known ones are: ",
182 join(', ', map {qq('$_')} 'debug', 'debugcolor', sort keys %bitmask),
183 ")");
184 }
185 }
186 $bits;
187}
188
189sub import {
190 shift;
191 $^H |= bits(1, @_);
192}
193
194sub unimport {
195 shift;
196 $^H &= ~ bits(0, @_);
197}
198
1991;
200
201__END__
56953603 202
b3eb6a9b
GS
203=head1 NAME
204
205re - Perl pragma to alter regular expression behaviour
206
207=head1 SYNOPSIS
208
e4d48cc9
GS
209 use re 'taint';
210 ($x) = ($^X =~ /^(.*)$/s); # $x is tainted here
b3eb6a9b 211
2cd61cdb 212 $pat = '(?{ $foo = 1 })';
e4d48cc9 213 use re 'eval';
2cd61cdb 214 /foo${pat}bar/; # won't fail (when not under -T switch)
e4d48cc9
GS
215
216 {
217 no re 'taint'; # the default
218 ($x) = ($^X =~ /^(.*)$/s); # $x is not tainted here
219
220 no re 'eval'; # the default
2cd61cdb 221 /foo${pat}bar/; # disallowed (with or without -T switch)
e4d48cc9 222 }
b3eb6a9b 223
1e215989
FC
224 use re '/ix';
225 "FOO" =~ / foo /; # /ix implied
226 no re '/x';
227 "FOO" =~ /foo/; # just /i implied
228
1e2e3d02
YO
229 use re 'debug'; # output debugging info during
230 /^(.*)$/s; # compile and run time
231
2cd61cdb 232
02ea72ae
IZ
233 use re 'debugcolor'; # same as 'debug', but with colored output
234 ...
235
a3621e74 236 use re qw(Debug All); # Finer tuned debugging options.
4ee9a43f 237 use re qw(Debug More);
fe759410 238 no re qw(Debug ALL); # Turn of all re debugging in this scope
4ee9a43f 239
de8c5301
YO
240 use re qw(is_regexp regexp_pattern); # import utility functions
241 my ($pat,$mods)=regexp_pattern(qr/foo/i);
242 if (is_regexp($obj)) {
243 print "Got regexp: ",
244 scalar regexp_pattern($obj); # just as perl would stringify it
245 } # but no hassle with blessed re's.
a3621e74 246
3ffabb8c
GS
247(We use $^X in these examples because it's tainted by default.)
248
b3eb6a9b
GS
249=head1 DESCRIPTION
250
de8c5301
YO
251=head2 'taint' mode
252
b3eb6a9b 253When C<use re 'taint'> is in effect, and a tainted string is the target
99cc5cc6
A
254of a regexp, the regexp memories (or values returned by the m// operator
255in list context) are tainted. This feature is useful when regexp operations
e4d48cc9
GS
256on tainted data aren't meant to extract safe substrings, but to perform
257other transformations.
b3eb6a9b 258
de8c5301
YO
259=head2 'eval' mode
260
99cc5cc6 261When C<use re 'eval'> is in effect, a regexp is allowed to contain
0b370c0a
A
262C<(?{ ... })> zero-width assertions and C<(??{ ... })> postponed
263subexpressions, even if the regular expression contains
ffbc6a93 264variable interpolation. That is normally disallowed, since it is a
2cd61cdb
IZ
265potential security risk. Note that this pragma is ignored when the regular
266expression is obtained from tainted data, i.e. evaluation is always
0b370c0a 267disallowed with tainted regular expressions. See L<perlre/(?{ code })>
bb1773de 268and L<perlre/(??{ code })>.
2cd61cdb 269
ffbc6a93 270For the purpose of this pragma, interpolation of precompiled regular
0a92e3a8
GS
271expressions (i.e., the result of C<qr//>) is I<not> considered variable
272interpolation. Thus:
2cd61cdb
IZ
273
274 /foo${pat}bar/
275
ffbc6a93 276I<is> allowed if $pat is a precompiled regular expression, even
0b370c0a 277if $pat contains C<(?{ ... })> assertions or C<(??{ ... })> subexpressions.
2cd61cdb 278
1e215989
FC
279=head2 '/flags' mode
280
281When C<use re '/flags'> is specified, the given flags are automatically
282added to every regular expression till the end of the lexical scope.
283
284C<no re '/flags'> will turn off the effect of C<use re '/flags'> for the
285given flags.
286
287For example, if you want all your regular expressions to have /msx on by
288default, simply put
289
290 use re '/msx';
291
292at the top of your code.
293
294The /dul flags cancel each other out. So, in this example,
295
296 use re "/u";
297 "ss" =~ /\xdf/;
298 use re "/d";
299 "ss" =~ /\xdf/;
300
4d220a7d 301the second C<use re> does an implicit C<no re '/u'>.
1e215989
FC
302
303Turning on the /l and /u flags with C<use re> takes precedence over the
304C<locale> pragma and the 'unicode_strings' C<feature>, for regular
305expressions. Turning off one of these flags when it is active reverts to
306the behaviour specified by whatever other pragmata are in scope. For
307example:
308
309 use feature "unicode_strings";
310 no re "/u"; # does nothing
311 use re "/l";
312 no re "/l"; # reverts to unicode_strings behaviour
313
de8c5301
YO
314=head2 'debug' mode
315
ffbc6a93 316When C<use re 'debug'> is in effect, perl emits debugging messages when
2cd61cdb
IZ
317compiling and using regular expressions. The output is the same as that
318obtained by running a C<-DDEBUGGING>-enabled perl interpreter with the
319B<-Dr> switch. It may be quite voluminous depending on the complexity
02ea72ae
IZ
320of the match. Using C<debugcolor> instead of C<debug> enables a
321form of output that can be used to get a colorful display on terminals
322that understand termcap color sequences. Set C<$ENV{PERL_RE_TC}> to a
323comma-separated list of C<termcap> properties to use for highlighting
ffbc6a93 324strings on/off, pre-point part on/off.
2cd61cdb
IZ
325See L<perldebug/"Debugging regular expressions"> for additional info.
326
de8c5301
YO
327As of 5.9.5 the directive C<use re 'debug'> and its equivalents are
328lexically scoped, as the other directives are. However they have both
329compile-time and run-time effects.
330
331See L<perlmodlib/Pragmatic Modules>.
332
333=head2 'Debug' mode
334
a3621e74
YO
335Similarly C<use re 'Debug'> produces debugging output, the difference
336being that it allows the fine tuning of what debugging output will be
be8e71aa
YO
337emitted. Options are divided into three groups, those related to
338compilation, those related to execution and those related to special
339purposes. The options are as follows:
340
341=over 4
342
343=item Compile related options
344
345=over 4
346
347=item COMPILE
348
349Turns on all compile related debug options.
350
351=item PARSE
352
353Turns on debug output related to the process of parsing the pattern.
354
355=item OPTIMISE
356
357Enables output related to the optimisation phase of compilation.
358
24b23f37 359=item TRIEC
be8e71aa
YO
360
361Detailed info about trie compilation.
362
363=item DUMP
364
365Dump the final program out after it is compiled and optimised.
366
be8e71aa
YO
367=back
368
369=item Execute related options
370
371=over 4
372
373=item EXECUTE
374
375Turns on all execute related debug options.
376
377=item MATCH
378
379Turns on debugging of the main matching loop.
380
24b23f37 381=item TRIEE
be8e71aa
YO
382
383Extra debugging of how tries execute.
384
385=item INTUIT
386
387Enable debugging of start point optimisations.
388
389=back
390
391=item Extra debugging options
392
393=over 4
394
395=item EXTRA
396
397Turns on all "extra" debugging options.
398
e7707071
YO
399=item BUFFERS
400
c27a5cfe 401Enable debugging the capture group storage during match. Warning,
e7707071
YO
402this can potentially produce extremely large output.
403
24b23f37
YO
404=item TRIEM
405
406Enable enhanced TRIE debugging. Enhances both TRIEE
407and TRIEC.
408
409=item STATE
410
4ee9a43f 411Enable debugging of states in the engine.
24b23f37
YO
412
413=item STACK
be8e71aa 414
24b23f37
YO
415Enable debugging of the recursion stack in the engine. Enabling
416or disabling this option automatically does the same for debugging
417states as well. This output from this can be quite large.
418
419=item OPTIMISEM
420
421Enable enhanced optimisation debugging and start point optimisations.
99cc5cc6 422Probably not useful except when debugging the regexp engine itself.
24b23f37
YO
423
424=item OFFSETS
425
426Dump offset information. This can be used to see how regops correlate
427to the pattern. Output format is
428
429 NODENUM:POSITION[LENGTH]
430
431Where 1 is the position of the first char in the string. Note that position
432can be 0, or larger than the actual length of the pattern, likewise length
433can be zero.
be8e71aa 434
24b23f37 435=item OFFSETSDBG
be8e71aa
YO
436
437Enable debugging of offsets information. This emits copious
fe759410 438amounts of trace information and doesn't mesh well with other
be8e71aa
YO
439debug options.
440
fe759410 441Almost definitely only useful to people hacking
be8e71aa
YO
442on the offsets part of the debug engine.
443
444=back
445
446=item Other useful flags
447
448These are useful shortcuts to save on the typing.
449
450=over 4
451
452=item ALL
453
e7707071 454Enable all options at once except OFFSETS, OFFSETSDBG and BUFFERS
be8e71aa
YO
455
456=item All
457
fe759410 458Enable DUMP and all execute options. Equivalent to:
be8e71aa
YO
459
460 use re 'debug';
461
462=item MORE
463
464=item More
465
24b23f37 466Enable TRIEM and all execute compile and execute options.
be8e71aa 467
dba3f186 468=back
be8e71aa 469
dba3f186 470=back
a3621e74 471
1e2e3d02 472As of 5.9.5 the directive C<use re 'debug'> and its equivalents are
4ee9a43f 473lexically scoped, as the other directives are. However they have both
1e2e3d02 474compile-time and run-time effects.
b3eb6a9b 475
de8c5301 476=head2 Exportable Functions
b3eb6a9b 477
de8c5301 478As of perl 5.9.5 're' debug contains a number of utility functions that
4ee9a43f 479may be optionally exported into the caller's namespace. They are listed
de8c5301 480below.
b3eb6a9b 481
de8c5301 482=over 4
b3eb6a9b 483
de8c5301 484=item is_regexp($ref)
02ea72ae 485
de8c5301 486Returns true if the argument is a compiled regular expression as returned
4ee9a43f 487by C<qr//>, false if it is not.
02ea72ae 488
4ee9a43f
RGS
489This function will not be confused by overloading or blessing. In
490internals terms, this extracts the regexp pointer out of the
3a5e0888 491PERL_MAGIC_qr structure so it cannot be fooled.
894be9b7 492
de8c5301 493=item regexp_pattern($ref)
02ea72ae 494
4ee9a43f
RGS
495If the argument is a compiled regular expression as returned by C<qr//>,
496then this function returns the pattern.
be8e71aa 497
4ee9a43f
RGS
498In list context it returns a two element list, the first element
499containing the pattern and the second containing the modifiers used when
500the pattern was compiled.
be8e71aa 501
4ee9a43f 502 my ($pat, $mods) = regexp_pattern($ref);
a3621e74 503
99cc5cc6 504In scalar context it returns the same as perl would when stringifying a raw
4ee9a43f
RGS
505C<qr//> with the same pattern inside. If the argument is not a compiled
506reference then this routine returns false but defined in scalar context,
507and the empty list in list context. Thus the following
f9f4320a 508
dff5e0c4 509 if (regexp_pattern($ref) eq '(?^i:foo)')
dba3f186 510
de8c5301 511will be warning free regardless of what $ref actually is.
380e0b81 512
4ee9a43f
RGS
513Like C<is_regexp> this function will not be confused by overloading
514or blessing of the object.
b3eb6a9b 515
256ddcd0
YO
516=item regmust($ref)
517
432acd5f 518If the argument is a compiled regular expression as returned by C<qr//>,
99cc5cc6 519then this function returns what the optimiser considers to be the longest
432acd5f
RGS
520anchored fixed string and longest floating fixed string in the pattern.
521
522A I<fixed string> is defined as being a substring that must appear for the
523pattern to match. An I<anchored fixed string> is a fixed string that must
524appear at a particular offset from the beginning of the match. A I<floating
525fixed string> is defined as a fixed string that can appear at any point in
526a range of positions relative to the start of the match. For example,
527
528 my $qr = qr/here .* there/x;
529 my ($anchored, $floating) = regmust($qr);
256ddcd0 530 print "anchored:'$anchored'\nfloating:'$floating'\n";
432acd5f 531
256ddcd0
YO
532results in
533
534 anchored:'here'
535 floating:'there'
536
432acd5f
RGS
537Because the C<here> is before the C<.*> in the pattern, its position
538can be determined exactly. That's not true, however, for the C<there>;
539it could appear at any point after where the anchored string appeared.
256ddcd0
YO
540Perl uses both for its optimisations, prefering the longer, or, if they are
541equal, the floating.
542
543B<NOTE:> This may not necessarily be the definitive longest anchored and
432acd5f 544floating string. This will be what the optimiser of the Perl that you
256ddcd0
YO
545are using thinks is the longest. If you believe that the result is wrong
546please report it via the L<perlbug> utility.
547
28d8d7f4 548=item regname($name,$all)
44a2ac75 549
28d8d7f4
YO
550Returns the contents of a named buffer of the last successful match. If
551$all is true, then returns an array ref containing one entry per buffer,
44a2ac75
YO
552otherwise returns the first defined buffer.
553
28d8d7f4 554=item regnames($all)
44a2ac75 555
28d8d7f4
YO
556Returns a list of all of the named buffers defined in the last successful
557match. If $all is true, then it returns all names defined, if not it returns
558only names which were involved in the match.
44a2ac75 559
28d8d7f4 560=item regnames_count()
44a2ac75 561
28d8d7f4
YO
562Returns the number of distinct names defined in the pattern used
563for the last successful match.
44a2ac75 564
28d8d7f4
YO
565B<Note:> this result is always the actual number of distinct
566named buffers defined, it may not actually match that which is
567returned by C<regnames()> and related routines when those routines
568have not been called with the $all parameter set.
44a2ac75 569
de8c5301 570=back
b3eb6a9b 571
de8c5301 572=head1 SEE ALSO
b3eb6a9b 573
de8c5301
YO
574L<perlmodlib/Pragmatic Modules>.
575
576=cut