This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Bump re.pm’s version
[perl5.git] / ext / re / re.pm
CommitLineData
b3eb6a9b
GS
1package re;
2
99cc5cc6 3# pragma for controlling the regexp engine
de8c5301
YO
4use strict;
5use warnings;
6
ffedb8ca 7our $VERSION = "0.17";
de8c5301 8our @ISA = qw(Exporter);
ec781434 9our @EXPORT_OK = ('regmust',
192c1e27
JH
10 qw(is_regexp regexp_pattern
11 regname regnames regnames_count));
de8c5301
YO
12our %EXPORT_OK = map { $_ => 1 } @EXPORT_OK;
13
de8c5301
YO
14my %bitmask = (
15 taint => 0x00100000, # HINT_RE_TAINT
16 eval => 0x00200000, # HINT_RE_EVAL
17);
18
1e215989
FC
19my $flags_hint = 0x02000000; # HINT_RE_FLAGS
20my $PMMOD_SHIFT = 0;
21my %reflags = (
22 m => 1 << ($PMMOD_SHIFT + 0),
23 s => 1 << ($PMMOD_SHIFT + 1),
24 i => 1 << ($PMMOD_SHIFT + 2),
25 x => 1 << ($PMMOD_SHIFT + 3),
26 p => 1 << ($PMMOD_SHIFT + 4),
27# special cases:
1e215989 28 d => 0,
a62b1201
KW
29 l => 1,
30 u => 2,
cfaf538b 31 a => 3,
1e215989
FC
32);
33
de8c5301
YO
34sub setcolor {
35 eval { # Ignore errors
36 require Term::Cap;
37
38 my $terminal = Tgetent Term::Cap ({OSPEED => 9600}); # Avoid warning.
39 my $props = $ENV{PERL_RE_TC} || 'md,me,so,se,us,ue';
40 my @props = split /,/, $props;
41 my $colors = join "\t", map {$terminal->Tputs($_,1)} @props;
42
43 $colors =~ s/\0//g;
44 $ENV{PERL_RE_COLORS} = $colors;
45 };
46 if ($@) {
47 $ENV{PERL_RE_COLORS} ||= qq'\t\t> <\t> <\t\t';
48 }
49
50}
51
52my %flags = (
53 COMPILE => 0x0000FF,
54 PARSE => 0x000001,
55 OPTIMISE => 0x000002,
56 TRIEC => 0x000004,
57 DUMP => 0x000008,
f7819f85 58 FLAGS => 0x000010,
de8c5301
YO
59
60 EXECUTE => 0x00FF00,
61 INTUIT => 0x000100,
62 MATCH => 0x000200,
63 TRIEE => 0x000400,
64
65 EXTRA => 0xFF0000,
66 TRIEM => 0x010000,
67 OFFSETS => 0x020000,
68 OFFSETSDBG => 0x040000,
69 STATE => 0x080000,
70 OPTIMISEM => 0x100000,
71 STACK => 0x280000,
e7707071 72 BUFFERS => 0x400000,
2c296965 73 GPOS => 0x800000,
de8c5301 74);
e7707071 75$flags{ALL} = -1 & ~($flags{OFFSETS}|$flags{OFFSETSDBG}|$flags{BUFFERS});
de8c5301 76$flags{All} = $flags{all} = $flags{DUMP} | $flags{EXECUTE};
2c296965 77$flags{Extra} = $flags{EXECUTE} | $flags{COMPILE} | $flags{GPOS};
de8c5301
YO
78$flags{More} = $flags{MORE} = $flags{All} | $flags{TRIEC} | $flags{TRIEM} | $flags{STATE};
79$flags{State} = $flags{DUMP} | $flags{EXECUTE} | $flags{STATE};
80$flags{TRIE} = $flags{DUMP} | $flags{EXECUTE} | $flags{TRIEC};
81
ec781434
NC
82if (defined &DynaLoader::boot_DynaLoader) {
83 require XSLoader;
da4061d3 84 XSLoader::load();
de8c5301 85}
ec781434
NC
86# else we're miniperl
87# We need to work for miniperl, because the XS toolchain uses Text::Wrap, which
88# uses re 'taint'.
de8c5301
YO
89
90sub _load_unload {
91 my ($on)= @_;
92 if ($on) {
ec781434
NC
93 # We call install() every time, as if we didn't, we wouldn't
94 # "see" any changes to the color environment var since
95 # the last time it was called.
96
97 # install() returns an integer, which if casted properly
99cc5cc6 98 # in C resolves to a structure containing the regexp
ec781434
NC
99 # hooks. Setting it to a random integer will guarantee
100 # segfaults.
101 $^H{regcomp} = install();
de8c5301
YO
102 } else {
103 delete $^H{regcomp};
104 }
105}
106
107sub bits {
108 my $on = shift;
109 my $bits = 0;
110 unless (@_) {
111 require Carp;
112 Carp::carp("Useless use of \"re\" pragma");
113 }
1e215989 114 ARG:
de8c5301
YO
115 foreach my $idx (0..$#_){
116 my $s=$_[$idx];
117 if ($s eq 'Debug' or $s eq 'Debugcolor') {
118 setcolor() if $s =~/color/i;
119 ${^RE_DEBUG_FLAGS} = 0 unless defined ${^RE_DEBUG_FLAGS};
120 for my $idx ($idx+1..$#_) {
121 if ($flags{$_[$idx]}) {
122 if ($on) {
123 ${^RE_DEBUG_FLAGS} |= $flags{$_[$idx]};
124 } else {
125 ${^RE_DEBUG_FLAGS} &= ~ $flags{$_[$idx]};
126 }
127 } else {
128 require Carp;
129 Carp::carp("Unknown \"re\" Debug flag '$_[$idx]', possible flags: ",
130 join(", ",sort keys %flags ) );
131 }
132 }
133 _load_unload($on ? 1 : ${^RE_DEBUG_FLAGS});
134 last;
135 } elsif ($s eq 'debug' or $s eq 'debugcolor') {
136 setcolor() if $s =~/color/i;
137 _load_unload($on);
66e6b4c5 138 last;
de8c5301
YO
139 } elsif (exists $bitmask{$s}) {
140 $bits |= $bitmask{$s};
141 } elsif ($EXPORT_OK{$s}) {
de8c5301
YO
142 require Exporter;
143 re->export_to_level(2, 're', $s);
1e215989
FC
144 } elsif ($s =~ s/^\///) {
145 my $reflags = $^H{reflags} || 0;
6320bfaf 146 my $seen_charset;
1e215989 147 for(split//, $s) {
cfaf538b 148 if (/[adul]/) {
1e215989 149 if ($on) {
6320bfaf 150 if ($seen_charset && $seen_charset ne $_) {
96ef02be
FC
151 require Carp;
152 Carp::carp(
6320bfaf 153 qq 'The "$seen_charset" and "$_" flags '
96ef02be
FC
154 .qq 'are exclusive'
155 );
156 }
6320bfaf
KW
157 $^H{reflags_charset} = $reflags{$_};
158 $seen_charset = $_;
1e215989
FC
159 }
160 else {
6320bfaf
KW
161 delete $^H{reflags_charset}
162 if defined $^H{reflags_charset}
163 && $^H{reflags_charset} == $reflags{$_};
1e215989
FC
164 }
165 } elsif (exists $reflags{$_}) {
166 $on
167 ? $reflags |= $reflags{$_}
168 : ($reflags &= ~$reflags{$_});
169 } else {
170 require Carp;
171 Carp::carp(
172 qq'Unknown regular expression flag "$_"'
173 );
174 next ARG;
175 }
176 }
6320bfaf 177 ($^H{reflags} = $reflags or defined $^H{reflags_charset})
1e215989
FC
178 ? $^H |= $flags_hint
179 : ($^H &= ~$flags_hint);
de8c5301
YO
180 } else {
181 require Carp;
182 Carp::carp("Unknown \"re\" subpragma '$s' (known ones are: ",
183 join(', ', map {qq('$_')} 'debug', 'debugcolor', sort keys %bitmask),
184 ")");
185 }
186 }
187 $bits;
188}
189
190sub import {
191 shift;
192 $^H |= bits(1, @_);
193}
194
195sub unimport {
196 shift;
197 $^H &= ~ bits(0, @_);
198}
199
2001;
201
202__END__
56953603 203
b3eb6a9b
GS
204=head1 NAME
205
206re - Perl pragma to alter regular expression behaviour
207
208=head1 SYNOPSIS
209
e4d48cc9
GS
210 use re 'taint';
211 ($x) = ($^X =~ /^(.*)$/s); # $x is tainted here
b3eb6a9b 212
2cd61cdb 213 $pat = '(?{ $foo = 1 })';
e4d48cc9 214 use re 'eval';
2cd61cdb 215 /foo${pat}bar/; # won't fail (when not under -T switch)
e4d48cc9
GS
216
217 {
218 no re 'taint'; # the default
219 ($x) = ($^X =~ /^(.*)$/s); # $x is not tainted here
220
221 no re 'eval'; # the default
2cd61cdb 222 /foo${pat}bar/; # disallowed (with or without -T switch)
e4d48cc9 223 }
b3eb6a9b 224
1e215989
FC
225 use re '/ix';
226 "FOO" =~ / foo /; # /ix implied
227 no re '/x';
228 "FOO" =~ /foo/; # just /i implied
229
1e2e3d02
YO
230 use re 'debug'; # output debugging info during
231 /^(.*)$/s; # compile and run time
232
2cd61cdb 233
02ea72ae
IZ
234 use re 'debugcolor'; # same as 'debug', but with colored output
235 ...
236
a3621e74 237 use re qw(Debug All); # Finer tuned debugging options.
4ee9a43f 238 use re qw(Debug More);
fe759410 239 no re qw(Debug ALL); # Turn of all re debugging in this scope
4ee9a43f 240
de8c5301
YO
241 use re qw(is_regexp regexp_pattern); # import utility functions
242 my ($pat,$mods)=regexp_pattern(qr/foo/i);
243 if (is_regexp($obj)) {
244 print "Got regexp: ",
245 scalar regexp_pattern($obj); # just as perl would stringify it
246 } # but no hassle with blessed re's.
a3621e74 247
3ffabb8c
GS
248(We use $^X in these examples because it's tainted by default.)
249
b3eb6a9b
GS
250=head1 DESCRIPTION
251
de8c5301
YO
252=head2 'taint' mode
253
b3eb6a9b 254When C<use re 'taint'> is in effect, and a tainted string is the target
99cc5cc6
A
255of a regexp, the regexp memories (or values returned by the m// operator
256in list context) are tainted. This feature is useful when regexp operations
e4d48cc9
GS
257on tainted data aren't meant to extract safe substrings, but to perform
258other transformations.
b3eb6a9b 259
de8c5301
YO
260=head2 'eval' mode
261
99cc5cc6 262When C<use re 'eval'> is in effect, a regexp is allowed to contain
0b370c0a
A
263C<(?{ ... })> zero-width assertions and C<(??{ ... })> postponed
264subexpressions, even if the regular expression contains
ffbc6a93 265variable interpolation. That is normally disallowed, since it is a
2cd61cdb
IZ
266potential security risk. Note that this pragma is ignored when the regular
267expression is obtained from tainted data, i.e. evaluation is always
0b370c0a 268disallowed with tainted regular expressions. See L<perlre/(?{ code })>
bb1773de 269and L<perlre/(??{ code })>.
2cd61cdb 270
ffbc6a93 271For the purpose of this pragma, interpolation of precompiled regular
0a92e3a8
GS
272expressions (i.e., the result of C<qr//>) is I<not> considered variable
273interpolation. Thus:
2cd61cdb
IZ
274
275 /foo${pat}bar/
276
ffbc6a93 277I<is> allowed if $pat is a precompiled regular expression, even
0b370c0a 278if $pat contains C<(?{ ... })> assertions or C<(??{ ... })> subexpressions.
2cd61cdb 279
1e215989
FC
280=head2 '/flags' mode
281
282When C<use re '/flags'> is specified, the given flags are automatically
283added to every regular expression till the end of the lexical scope.
284
285C<no re '/flags'> will turn off the effect of C<use re '/flags'> for the
286given flags.
287
288For example, if you want all your regular expressions to have /msx on by
289default, simply put
290
291 use re '/msx';
292
293at the top of your code.
294
cfaf538b 295The character set /adul flags cancel each other out. So, in this example,
1e215989
FC
296
297 use re "/u";
298 "ss" =~ /\xdf/;
299 use re "/d";
300 "ss" =~ /\xdf/;
301
4d220a7d 302the second C<use re> does an implicit C<no re '/u'>.
1e215989 303
59640339 304Turning on one of the character set flags with C<use re> takes precedence over the
1e215989
FC
305C<locale> pragma and the 'unicode_strings' C<feature>, for regular
306expressions. Turning off one of these flags when it is active reverts to
307the behaviour specified by whatever other pragmata are in scope. For
308example:
309
310 use feature "unicode_strings";
311 no re "/u"; # does nothing
312 use re "/l";
313 no re "/l"; # reverts to unicode_strings behaviour
314
de8c5301
YO
315=head2 'debug' mode
316
ffbc6a93 317When C<use re 'debug'> is in effect, perl emits debugging messages when
2cd61cdb
IZ
318compiling and using regular expressions. The output is the same as that
319obtained by running a C<-DDEBUGGING>-enabled perl interpreter with the
320B<-Dr> switch. It may be quite voluminous depending on the complexity
02ea72ae
IZ
321of the match. Using C<debugcolor> instead of C<debug> enables a
322form of output that can be used to get a colorful display on terminals
323that understand termcap color sequences. Set C<$ENV{PERL_RE_TC}> to a
324comma-separated list of C<termcap> properties to use for highlighting
ffbc6a93 325strings on/off, pre-point part on/off.
57e8c15d 326See L<perldebug/"Debugging Regular Expressions"> for additional info.
2cd61cdb 327
de8c5301
YO
328As of 5.9.5 the directive C<use re 'debug'> and its equivalents are
329lexically scoped, as the other directives are. However they have both
330compile-time and run-time effects.
331
332See L<perlmodlib/Pragmatic Modules>.
333
334=head2 'Debug' mode
335
a3621e74
YO
336Similarly C<use re 'Debug'> produces debugging output, the difference
337being that it allows the fine tuning of what debugging output will be
be8e71aa
YO
338emitted. Options are divided into three groups, those related to
339compilation, those related to execution and those related to special
340purposes. The options are as follows:
341
342=over 4
343
344=item Compile related options
345
346=over 4
347
348=item COMPILE
349
350Turns on all compile related debug options.
351
352=item PARSE
353
354Turns on debug output related to the process of parsing the pattern.
355
356=item OPTIMISE
357
358Enables output related to the optimisation phase of compilation.
359
24b23f37 360=item TRIEC
be8e71aa
YO
361
362Detailed info about trie compilation.
363
364=item DUMP
365
366Dump the final program out after it is compiled and optimised.
367
be8e71aa
YO
368=back
369
370=item Execute related options
371
372=over 4
373
374=item EXECUTE
375
376Turns on all execute related debug options.
377
378=item MATCH
379
380Turns on debugging of the main matching loop.
381
24b23f37 382=item TRIEE
be8e71aa
YO
383
384Extra debugging of how tries execute.
385
386=item INTUIT
387
388Enable debugging of start point optimisations.
389
390=back
391
392=item Extra debugging options
393
394=over 4
395
396=item EXTRA
397
398Turns on all "extra" debugging options.
399
e7707071
YO
400=item BUFFERS
401
c27a5cfe 402Enable debugging the capture group storage during match. Warning,
e7707071
YO
403this can potentially produce extremely large output.
404
24b23f37
YO
405=item TRIEM
406
407Enable enhanced TRIE debugging. Enhances both TRIEE
408and TRIEC.
409
410=item STATE
411
4ee9a43f 412Enable debugging of states in the engine.
24b23f37
YO
413
414=item STACK
be8e71aa 415
24b23f37
YO
416Enable debugging of the recursion stack in the engine. Enabling
417or disabling this option automatically does the same for debugging
418states as well. This output from this can be quite large.
419
420=item OPTIMISEM
421
422Enable enhanced optimisation debugging and start point optimisations.
99cc5cc6 423Probably not useful except when debugging the regexp engine itself.
24b23f37
YO
424
425=item OFFSETS
426
427Dump offset information. This can be used to see how regops correlate
428to the pattern. Output format is
429
430 NODENUM:POSITION[LENGTH]
431
432Where 1 is the position of the first char in the string. Note that position
433can be 0, or larger than the actual length of the pattern, likewise length
434can be zero.
be8e71aa 435
24b23f37 436=item OFFSETSDBG
be8e71aa
YO
437
438Enable debugging of offsets information. This emits copious
fe759410 439amounts of trace information and doesn't mesh well with other
be8e71aa
YO
440debug options.
441
fe759410 442Almost definitely only useful to people hacking
be8e71aa
YO
443on the offsets part of the debug engine.
444
445=back
446
447=item Other useful flags
448
449These are useful shortcuts to save on the typing.
450
451=over 4
452
453=item ALL
454
e7707071 455Enable all options at once except OFFSETS, OFFSETSDBG and BUFFERS
be8e71aa
YO
456
457=item All
458
fe759410 459Enable DUMP and all execute options. Equivalent to:
be8e71aa
YO
460
461 use re 'debug';
462
463=item MORE
464
465=item More
466
24b23f37 467Enable TRIEM and all execute compile and execute options.
be8e71aa 468
dba3f186 469=back
be8e71aa 470
dba3f186 471=back
a3621e74 472
1e2e3d02 473As of 5.9.5 the directive C<use re 'debug'> and its equivalents are
4ee9a43f 474lexically scoped, as the other directives are. However they have both
1e2e3d02 475compile-time and run-time effects.
b3eb6a9b 476
de8c5301 477=head2 Exportable Functions
b3eb6a9b 478
de8c5301 479As of perl 5.9.5 're' debug contains a number of utility functions that
4ee9a43f 480may be optionally exported into the caller's namespace. They are listed
de8c5301 481below.
b3eb6a9b 482
de8c5301 483=over 4
b3eb6a9b 484
de8c5301 485=item is_regexp($ref)
02ea72ae 486
de8c5301 487Returns true if the argument is a compiled regular expression as returned
4ee9a43f 488by C<qr//>, false if it is not.
02ea72ae 489
4ee9a43f
RGS
490This function will not be confused by overloading or blessing. In
491internals terms, this extracts the regexp pointer out of the
3a5e0888 492PERL_MAGIC_qr structure so it cannot be fooled.
894be9b7 493
de8c5301 494=item regexp_pattern($ref)
02ea72ae 495
4ee9a43f
RGS
496If the argument is a compiled regular expression as returned by C<qr//>,
497then this function returns the pattern.
be8e71aa 498
4ee9a43f
RGS
499In list context it returns a two element list, the first element
500containing the pattern and the second containing the modifiers used when
501the pattern was compiled.
be8e71aa 502
4ee9a43f 503 my ($pat, $mods) = regexp_pattern($ref);
a3621e74 504
99cc5cc6 505In scalar context it returns the same as perl would when stringifying a raw
4ee9a43f
RGS
506C<qr//> with the same pattern inside. If the argument is not a compiled
507reference then this routine returns false but defined in scalar context,
508and the empty list in list context. Thus the following
f9f4320a 509
dff5e0c4 510 if (regexp_pattern($ref) eq '(?^i:foo)')
dba3f186 511
de8c5301 512will be warning free regardless of what $ref actually is.
380e0b81 513
4ee9a43f
RGS
514Like C<is_regexp> this function will not be confused by overloading
515or blessing of the object.
b3eb6a9b 516
256ddcd0
YO
517=item regmust($ref)
518
432acd5f 519If the argument is a compiled regular expression as returned by C<qr//>,
99cc5cc6 520then this function returns what the optimiser considers to be the longest
432acd5f
RGS
521anchored fixed string and longest floating fixed string in the pattern.
522
523A I<fixed string> is defined as being a substring that must appear for the
524pattern to match. An I<anchored fixed string> is a fixed string that must
525appear at a particular offset from the beginning of the match. A I<floating
526fixed string> is defined as a fixed string that can appear at any point in
527a range of positions relative to the start of the match. For example,
528
529 my $qr = qr/here .* there/x;
530 my ($anchored, $floating) = regmust($qr);
256ddcd0 531 print "anchored:'$anchored'\nfloating:'$floating'\n";
432acd5f 532
256ddcd0
YO
533results in
534
535 anchored:'here'
536 floating:'there'
537
432acd5f
RGS
538Because the C<here> is before the C<.*> in the pattern, its position
539can be determined exactly. That's not true, however, for the C<there>;
540it could appear at any point after where the anchored string appeared.
256ddcd0
YO
541Perl uses both for its optimisations, prefering the longer, or, if they are
542equal, the floating.
543
544B<NOTE:> This may not necessarily be the definitive longest anchored and
432acd5f 545floating string. This will be what the optimiser of the Perl that you
256ddcd0
YO
546are using thinks is the longest. If you believe that the result is wrong
547please report it via the L<perlbug> utility.
548
28d8d7f4 549=item regname($name,$all)
44a2ac75 550
28d8d7f4
YO
551Returns the contents of a named buffer of the last successful match. If
552$all is true, then returns an array ref containing one entry per buffer,
44a2ac75
YO
553otherwise returns the first defined buffer.
554
28d8d7f4 555=item regnames($all)
44a2ac75 556
28d8d7f4
YO
557Returns a list of all of the named buffers defined in the last successful
558match. If $all is true, then it returns all names defined, if not it returns
559only names which were involved in the match.
44a2ac75 560
28d8d7f4 561=item regnames_count()
44a2ac75 562
28d8d7f4
YO
563Returns the number of distinct names defined in the pattern used
564for the last successful match.
44a2ac75 565
28d8d7f4
YO
566B<Note:> this result is always the actual number of distinct
567named buffers defined, it may not actually match that which is
568returned by C<regnames()> and related routines when those routines
569have not been called with the $all parameter set.
44a2ac75 570
de8c5301 571=back
b3eb6a9b 572
de8c5301 573=head1 SEE ALSO
b3eb6a9b 574
de8c5301
YO
575L<perlmodlib/Pragmatic Modules>.
576
577=cut