This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
bump re.pm version number
[perl5.git] / ext / re / re.pm
CommitLineData
b3eb6a9b
GS
1package re;
2
99cc5cc6 3# pragma for controlling the regexp engine
de8c5301
YO
4use strict;
5use warnings;
6
7a529e48 7our $VERSION = "0.20";
de8c5301 8our @ISA = qw(Exporter);
ec781434 9our @EXPORT_OK = ('regmust',
192c1e27
JH
10 qw(is_regexp regexp_pattern
11 regname regnames regnames_count));
de8c5301
YO
12our %EXPORT_OK = map { $_ => 1 } @EXPORT_OK;
13
de8c5301
YO
14my %bitmask = (
15 taint => 0x00100000, # HINT_RE_TAINT
16 eval => 0x00200000, # HINT_RE_EVAL
17);
18
1e215989
FC
19my $flags_hint = 0x02000000; # HINT_RE_FLAGS
20my $PMMOD_SHIFT = 0;
21my %reflags = (
22 m => 1 << ($PMMOD_SHIFT + 0),
23 s => 1 << ($PMMOD_SHIFT + 1),
24 i => 1 << ($PMMOD_SHIFT + 2),
25 x => 1 << ($PMMOD_SHIFT + 3),
26 p => 1 << ($PMMOD_SHIFT + 4),
27# special cases:
1e215989 28 d => 0,
a62b1201
KW
29 l => 1,
30 u => 2,
cfaf538b 31 a => 3,
b4ab316d 32 aa => 4,
1e215989
FC
33);
34
de8c5301
YO
35sub setcolor {
36 eval { # Ignore errors
37 require Term::Cap;
38
39 my $terminal = Tgetent Term::Cap ({OSPEED => 9600}); # Avoid warning.
40 my $props = $ENV{PERL_RE_TC} || 'md,me,so,se,us,ue';
41 my @props = split /,/, $props;
42 my $colors = join "\t", map {$terminal->Tputs($_,1)} @props;
43
44 $colors =~ s/\0//g;
45 $ENV{PERL_RE_COLORS} = $colors;
46 };
47 if ($@) {
48 $ENV{PERL_RE_COLORS} ||= qq'\t\t> <\t> <\t\t';
49 }
50
51}
52
53my %flags = (
54 COMPILE => 0x0000FF,
55 PARSE => 0x000001,
56 OPTIMISE => 0x000002,
57 TRIEC => 0x000004,
58 DUMP => 0x000008,
f7819f85 59 FLAGS => 0x000010,
de8c5301
YO
60
61 EXECUTE => 0x00FF00,
62 INTUIT => 0x000100,
63 MATCH => 0x000200,
64 TRIEE => 0x000400,
65
66 EXTRA => 0xFF0000,
67 TRIEM => 0x010000,
68 OFFSETS => 0x020000,
69 OFFSETSDBG => 0x040000,
70 STATE => 0x080000,
71 OPTIMISEM => 0x100000,
72 STACK => 0x280000,
e7707071 73 BUFFERS => 0x400000,
2c296965 74 GPOS => 0x800000,
de8c5301 75);
e7707071 76$flags{ALL} = -1 & ~($flags{OFFSETS}|$flags{OFFSETSDBG}|$flags{BUFFERS});
de8c5301 77$flags{All} = $flags{all} = $flags{DUMP} | $flags{EXECUTE};
2c296965 78$flags{Extra} = $flags{EXECUTE} | $flags{COMPILE} | $flags{GPOS};
de8c5301
YO
79$flags{More} = $flags{MORE} = $flags{All} | $flags{TRIEC} | $flags{TRIEM} | $flags{STATE};
80$flags{State} = $flags{DUMP} | $flags{EXECUTE} | $flags{STATE};
81$flags{TRIE} = $flags{DUMP} | $flags{EXECUTE} | $flags{TRIEC};
82
ec781434
NC
83if (defined &DynaLoader::boot_DynaLoader) {
84 require XSLoader;
da4061d3 85 XSLoader::load();
de8c5301 86}
ec781434
NC
87# else we're miniperl
88# We need to work for miniperl, because the XS toolchain uses Text::Wrap, which
89# uses re 'taint'.
de8c5301
YO
90
91sub _load_unload {
92 my ($on)= @_;
93 if ($on) {
ec781434
NC
94 # We call install() every time, as if we didn't, we wouldn't
95 # "see" any changes to the color environment var since
96 # the last time it was called.
97
98 # install() returns an integer, which if casted properly
99cc5cc6 99 # in C resolves to a structure containing the regexp
ec781434
NC
100 # hooks. Setting it to a random integer will guarantee
101 # segfaults.
102 $^H{regcomp} = install();
de8c5301
YO
103 } else {
104 delete $^H{regcomp};
105 }
106}
107
108sub bits {
109 my $on = shift;
110 my $bits = 0;
1e215989 111 ARG:
de8c5301
YO
112 foreach my $idx (0..$#_){
113 my $s=$_[$idx];
114 if ($s eq 'Debug' or $s eq 'Debugcolor') {
115 setcolor() if $s =~/color/i;
116 ${^RE_DEBUG_FLAGS} = 0 unless defined ${^RE_DEBUG_FLAGS};
117 for my $idx ($idx+1..$#_) {
118 if ($flags{$_[$idx]}) {
119 if ($on) {
120 ${^RE_DEBUG_FLAGS} |= $flags{$_[$idx]};
121 } else {
122 ${^RE_DEBUG_FLAGS} &= ~ $flags{$_[$idx]};
123 }
124 } else {
125 require Carp;
126 Carp::carp("Unknown \"re\" Debug flag '$_[$idx]', possible flags: ",
127 join(", ",sort keys %flags ) );
128 }
129 }
130 _load_unload($on ? 1 : ${^RE_DEBUG_FLAGS});
131 last;
132 } elsif ($s eq 'debug' or $s eq 'debugcolor') {
133 setcolor() if $s =~/color/i;
134 _load_unload($on);
66e6b4c5 135 last;
de8c5301
YO
136 } elsif (exists $bitmask{$s}) {
137 $bits |= $bitmask{$s};
138 } elsif ($EXPORT_OK{$s}) {
de8c5301
YO
139 require Exporter;
140 re->export_to_level(2, 're', $s);
1e215989
FC
141 } elsif ($s =~ s/^\///) {
142 my $reflags = $^H{reflags} || 0;
6320bfaf 143 my $seen_charset;
342c8524 144 while ($s =~ m/( . )/gx) {
b4ab316d 145 $_ = $1;
cfaf538b 146 if (/[adul]/) {
342c8524
KW
147 # The 'a' may be repeated; hide this from the rest of the
148 # code by counting and getting rid of all of them, then
149 # changing to 'aa' if there is a repeat.
150 if ($_ eq 'a') {
151 my $sav_pos = pos $s;
152 my $a_count = $s =~ s/a//g;
153 pos $s = $sav_pos - 1; # -1 because got rid of the 'a'
154 if ($a_count > 2) {
155 require Carp;
156 Carp::carp(
157 qq 'The "a" flag may only appear a maximum of twice'
158 );
159 }
160 elsif ($a_count == 2) {
161 $_ = 'aa';
162 }
163 }
1e215989 164 if ($on) {
45a507fa 165 if ($seen_charset) {
96ef02be 166 require Carp;
45a507fa
KW
167 if ($seen_charset ne $_) {
168 Carp::carp(
169 qq 'The "$seen_charset" and "$_" flags '
170 .qq 'are exclusive'
171 );
172 }
173 else {
174 Carp::carp(
175 qq 'The "$seen_charset" flag may not appear '
176 .qq 'twice'
177 );
178 }
96ef02be 179 }
6320bfaf
KW
180 $^H{reflags_charset} = $reflags{$_};
181 $seen_charset = $_;
1e215989
FC
182 }
183 else {
6320bfaf
KW
184 delete $^H{reflags_charset}
185 if defined $^H{reflags_charset}
186 && $^H{reflags_charset} == $reflags{$_};
1e215989
FC
187 }
188 } elsif (exists $reflags{$_}) {
189 $on
190 ? $reflags |= $reflags{$_}
191 : ($reflags &= ~$reflags{$_});
192 } else {
193 require Carp;
194 Carp::carp(
195 qq'Unknown regular expression flag "$_"'
196 );
197 next ARG;
198 }
199 }
6320bfaf 200 ($^H{reflags} = $reflags or defined $^H{reflags_charset})
1e215989
FC
201 ? $^H |= $flags_hint
202 : ($^H &= ~$flags_hint);
de8c5301
YO
203 } else {
204 require Carp;
205 Carp::carp("Unknown \"re\" subpragma '$s' (known ones are: ",
206 join(', ', map {qq('$_')} 'debug', 'debugcolor', sort keys %bitmask),
207 ")");
208 }
209 }
210 $bits;
211}
212
213sub import {
214 shift;
215 $^H |= bits(1, @_);
216}
217
218sub unimport {
219 shift;
220 $^H &= ~ bits(0, @_);
221}
222
2231;
224
225__END__
56953603 226
b3eb6a9b
GS
227=head1 NAME
228
229re - Perl pragma to alter regular expression behaviour
230
231=head1 SYNOPSIS
232
e4d48cc9
GS
233 use re 'taint';
234 ($x) = ($^X =~ /^(.*)$/s); # $x is tainted here
b3eb6a9b 235
2cd61cdb 236 $pat = '(?{ $foo = 1 })';
e4d48cc9 237 use re 'eval';
2cd61cdb 238 /foo${pat}bar/; # won't fail (when not under -T switch)
e4d48cc9
GS
239
240 {
241 no re 'taint'; # the default
242 ($x) = ($^X =~ /^(.*)$/s); # $x is not tainted here
243
244 no re 'eval'; # the default
2cd61cdb 245 /foo${pat}bar/; # disallowed (with or without -T switch)
e4d48cc9 246 }
b3eb6a9b 247
1e215989
FC
248 use re '/ix';
249 "FOO" =~ / foo /; # /ix implied
250 no re '/x';
251 "FOO" =~ /foo/; # just /i implied
252
1e2e3d02
YO
253 use re 'debug'; # output debugging info during
254 /^(.*)$/s; # compile and run time
255
2cd61cdb 256
02ea72ae
IZ
257 use re 'debugcolor'; # same as 'debug', but with colored output
258 ...
259
a3621e74 260 use re qw(Debug All); # Finer tuned debugging options.
4ee9a43f 261 use re qw(Debug More);
fe759410 262 no re qw(Debug ALL); # Turn of all re debugging in this scope
4ee9a43f 263
de8c5301
YO
264 use re qw(is_regexp regexp_pattern); # import utility functions
265 my ($pat,$mods)=regexp_pattern(qr/foo/i);
266 if (is_regexp($obj)) {
267 print "Got regexp: ",
268 scalar regexp_pattern($obj); # just as perl would stringify it
269 } # but no hassle with blessed re's.
a3621e74 270
3ffabb8c
GS
271(We use $^X in these examples because it's tainted by default.)
272
b3eb6a9b
GS
273=head1 DESCRIPTION
274
de8c5301
YO
275=head2 'taint' mode
276
b3eb6a9b 277When C<use re 'taint'> is in effect, and a tainted string is the target
99cc5cc6
A
278of a regexp, the regexp memories (or values returned by the m// operator
279in list context) are tainted. This feature is useful when regexp operations
e4d48cc9
GS
280on tainted data aren't meant to extract safe substrings, but to perform
281other transformations.
b3eb6a9b 282
de8c5301
YO
283=head2 'eval' mode
284
99cc5cc6 285When C<use re 'eval'> is in effect, a regexp is allowed to contain
0b370c0a
A
286C<(?{ ... })> zero-width assertions and C<(??{ ... })> postponed
287subexpressions, even if the regular expression contains
ffbc6a93 288variable interpolation. That is normally disallowed, since it is a
2cd61cdb
IZ
289potential security risk. Note that this pragma is ignored when the regular
290expression is obtained from tainted data, i.e. evaluation is always
0b370c0a 291disallowed with tainted regular expressions. See L<perlre/(?{ code })>
bb1773de 292and L<perlre/(??{ code })>.
2cd61cdb 293
ffbc6a93 294For the purpose of this pragma, interpolation of precompiled regular
0a92e3a8
GS
295expressions (i.e., the result of C<qr//>) is I<not> considered variable
296interpolation. Thus:
2cd61cdb
IZ
297
298 /foo${pat}bar/
299
ffbc6a93 300I<is> allowed if $pat is a precompiled regular expression, even
0b370c0a 301if $pat contains C<(?{ ... })> assertions or C<(??{ ... })> subexpressions.
2cd61cdb 302
1e215989
FC
303=head2 '/flags' mode
304
305When C<use re '/flags'> is specified, the given flags are automatically
306added to every regular expression till the end of the lexical scope.
307
308C<no re '/flags'> will turn off the effect of C<use re '/flags'> for the
309given flags.
310
311For example, if you want all your regular expressions to have /msx on by
312default, simply put
313
314 use re '/msx';
315
316at the top of your code.
317
cfaf538b 318The character set /adul flags cancel each other out. So, in this example,
1e215989
FC
319
320 use re "/u";
321 "ss" =~ /\xdf/;
322 use re "/d";
323 "ss" =~ /\xdf/;
324
4d220a7d 325the second C<use re> does an implicit C<no re '/u'>.
1e215989 326
59640339 327Turning on one of the character set flags with C<use re> takes precedence over the
1e215989
FC
328C<locale> pragma and the 'unicode_strings' C<feature>, for regular
329expressions. Turning off one of these flags when it is active reverts to
330the behaviour specified by whatever other pragmata are in scope. For
331example:
332
333 use feature "unicode_strings";
334 no re "/u"; # does nothing
335 use re "/l";
336 no re "/l"; # reverts to unicode_strings behaviour
337
de8c5301
YO
338=head2 'debug' mode
339
ffbc6a93 340When C<use re 'debug'> is in effect, perl emits debugging messages when
2cd61cdb
IZ
341compiling and using regular expressions. The output is the same as that
342obtained by running a C<-DDEBUGGING>-enabled perl interpreter with the
343B<-Dr> switch. It may be quite voluminous depending on the complexity
02ea72ae
IZ
344of the match. Using C<debugcolor> instead of C<debug> enables a
345form of output that can be used to get a colorful display on terminals
346that understand termcap color sequences. Set C<$ENV{PERL_RE_TC}> to a
347comma-separated list of C<termcap> properties to use for highlighting
ffbc6a93 348strings on/off, pre-point part on/off.
57e8c15d 349See L<perldebug/"Debugging Regular Expressions"> for additional info.
2cd61cdb 350
de8c5301
YO
351As of 5.9.5 the directive C<use re 'debug'> and its equivalents are
352lexically scoped, as the other directives are. However they have both
353compile-time and run-time effects.
354
355See L<perlmodlib/Pragmatic Modules>.
356
357=head2 'Debug' mode
358
a3621e74
YO
359Similarly C<use re 'Debug'> produces debugging output, the difference
360being that it allows the fine tuning of what debugging output will be
be8e71aa
YO
361emitted. Options are divided into three groups, those related to
362compilation, those related to execution and those related to special
363purposes. The options are as follows:
364
365=over 4
366
367=item Compile related options
368
369=over 4
370
371=item COMPILE
372
373Turns on all compile related debug options.
374
375=item PARSE
376
377Turns on debug output related to the process of parsing the pattern.
378
379=item OPTIMISE
380
381Enables output related to the optimisation phase of compilation.
382
24b23f37 383=item TRIEC
be8e71aa
YO
384
385Detailed info about trie compilation.
386
387=item DUMP
388
389Dump the final program out after it is compiled and optimised.
390
be8e71aa
YO
391=back
392
393=item Execute related options
394
395=over 4
396
397=item EXECUTE
398
399Turns on all execute related debug options.
400
401=item MATCH
402
403Turns on debugging of the main matching loop.
404
24b23f37 405=item TRIEE
be8e71aa
YO
406
407Extra debugging of how tries execute.
408
409=item INTUIT
410
411Enable debugging of start point optimisations.
412
413=back
414
415=item Extra debugging options
416
417=over 4
418
419=item EXTRA
420
421Turns on all "extra" debugging options.
422
e7707071
YO
423=item BUFFERS
424
c27a5cfe 425Enable debugging the capture group storage during match. Warning,
e7707071
YO
426this can potentially produce extremely large output.
427
24b23f37
YO
428=item TRIEM
429
430Enable enhanced TRIE debugging. Enhances both TRIEE
431and TRIEC.
432
433=item STATE
434
4ee9a43f 435Enable debugging of states in the engine.
24b23f37
YO
436
437=item STACK
be8e71aa 438
24b23f37
YO
439Enable debugging of the recursion stack in the engine. Enabling
440or disabling this option automatically does the same for debugging
441states as well. This output from this can be quite large.
442
443=item OPTIMISEM
444
445Enable enhanced optimisation debugging and start point optimisations.
99cc5cc6 446Probably not useful except when debugging the regexp engine itself.
24b23f37
YO
447
448=item OFFSETS
449
450Dump offset information. This can be used to see how regops correlate
451to the pattern. Output format is
452
453 NODENUM:POSITION[LENGTH]
454
455Where 1 is the position of the first char in the string. Note that position
456can be 0, or larger than the actual length of the pattern, likewise length
457can be zero.
be8e71aa 458
24b23f37 459=item OFFSETSDBG
be8e71aa
YO
460
461Enable debugging of offsets information. This emits copious
fe759410 462amounts of trace information and doesn't mesh well with other
be8e71aa
YO
463debug options.
464
fe759410 465Almost definitely only useful to people hacking
be8e71aa
YO
466on the offsets part of the debug engine.
467
468=back
469
470=item Other useful flags
471
472These are useful shortcuts to save on the typing.
473
474=over 4
475
476=item ALL
477
e7707071 478Enable all options at once except OFFSETS, OFFSETSDBG and BUFFERS
be8e71aa
YO
479
480=item All
481
fe759410 482Enable DUMP and all execute options. Equivalent to:
be8e71aa
YO
483
484 use re 'debug';
485
486=item MORE
487
488=item More
489
24b23f37 490Enable TRIEM and all execute compile and execute options.
be8e71aa 491
dba3f186 492=back
be8e71aa 493
dba3f186 494=back
a3621e74 495
1e2e3d02 496As of 5.9.5 the directive C<use re 'debug'> and its equivalents are
4ee9a43f 497lexically scoped, as the other directives are. However they have both
1e2e3d02 498compile-time and run-time effects.
b3eb6a9b 499
de8c5301 500=head2 Exportable Functions
b3eb6a9b 501
de8c5301 502As of perl 5.9.5 're' debug contains a number of utility functions that
4ee9a43f 503may be optionally exported into the caller's namespace. They are listed
de8c5301 504below.
b3eb6a9b 505
de8c5301 506=over 4
b3eb6a9b 507
de8c5301 508=item is_regexp($ref)
02ea72ae 509
de8c5301 510Returns true if the argument is a compiled regular expression as returned
4ee9a43f 511by C<qr//>, false if it is not.
02ea72ae 512
4ee9a43f
RGS
513This function will not be confused by overloading or blessing. In
514internals terms, this extracts the regexp pointer out of the
3a5e0888 515PERL_MAGIC_qr structure so it cannot be fooled.
894be9b7 516
de8c5301 517=item regexp_pattern($ref)
02ea72ae 518
4ee9a43f
RGS
519If the argument is a compiled regular expression as returned by C<qr//>,
520then this function returns the pattern.
be8e71aa 521
4ee9a43f
RGS
522In list context it returns a two element list, the first element
523containing the pattern and the second containing the modifiers used when
524the pattern was compiled.
be8e71aa 525
4ee9a43f 526 my ($pat, $mods) = regexp_pattern($ref);
a3621e74 527
99cc5cc6 528In scalar context it returns the same as perl would when stringifying a raw
4ee9a43f
RGS
529C<qr//> with the same pattern inside. If the argument is not a compiled
530reference then this routine returns false but defined in scalar context,
531and the empty list in list context. Thus the following
f9f4320a 532
dff5e0c4 533 if (regexp_pattern($ref) eq '(?^i:foo)')
dba3f186 534
de8c5301 535will be warning free regardless of what $ref actually is.
380e0b81 536
4ee9a43f
RGS
537Like C<is_regexp> this function will not be confused by overloading
538or blessing of the object.
b3eb6a9b 539
256ddcd0
YO
540=item regmust($ref)
541
432acd5f 542If the argument is a compiled regular expression as returned by C<qr//>,
99cc5cc6 543then this function returns what the optimiser considers to be the longest
432acd5f
RGS
544anchored fixed string and longest floating fixed string in the pattern.
545
546A I<fixed string> is defined as being a substring that must appear for the
547pattern to match. An I<anchored fixed string> is a fixed string that must
548appear at a particular offset from the beginning of the match. A I<floating
549fixed string> is defined as a fixed string that can appear at any point in
550a range of positions relative to the start of the match. For example,
551
552 my $qr = qr/here .* there/x;
553 my ($anchored, $floating) = regmust($qr);
256ddcd0 554 print "anchored:'$anchored'\nfloating:'$floating'\n";
432acd5f 555
256ddcd0
YO
556results in
557
558 anchored:'here'
559 floating:'there'
560
432acd5f
RGS
561Because the C<here> is before the C<.*> in the pattern, its position
562can be determined exactly. That's not true, however, for the C<there>;
563it could appear at any point after where the anchored string appeared.
256ddcd0
YO
564Perl uses both for its optimisations, prefering the longer, or, if they are
565equal, the floating.
566
567B<NOTE:> This may not necessarily be the definitive longest anchored and
432acd5f 568floating string. This will be what the optimiser of the Perl that you
256ddcd0
YO
569are using thinks is the longest. If you believe that the result is wrong
570please report it via the L<perlbug> utility.
571
28d8d7f4 572=item regname($name,$all)
44a2ac75 573
28d8d7f4
YO
574Returns the contents of a named buffer of the last successful match. If
575$all is true, then returns an array ref containing one entry per buffer,
44a2ac75
YO
576otherwise returns the first defined buffer.
577
28d8d7f4 578=item regnames($all)
44a2ac75 579
28d8d7f4
YO
580Returns a list of all of the named buffers defined in the last successful
581match. If $all is true, then it returns all names defined, if not it returns
582only names which were involved in the match.
44a2ac75 583
28d8d7f4 584=item regnames_count()
44a2ac75 585
28d8d7f4
YO
586Returns the number of distinct names defined in the pattern used
587for the last successful match.
44a2ac75 588
28d8d7f4
YO
589B<Note:> this result is always the actual number of distinct
590named buffers defined, it may not actually match that which is
591returned by C<regnames()> and related routines when those routines
592have not been called with the $all parameter set.
44a2ac75 593
de8c5301 594=back
b3eb6a9b 595
de8c5301 596=head1 SEE ALSO
b3eb6a9b 597
de8c5301
YO
598L<perlmodlib/Pragmatic Modules>.
599
600=cut