This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
update docs for (?{}) jumbo fix
[perl5.git] / ext / re / re.pm
CommitLineData
b3eb6a9b
GS
1package re;
2
99cc5cc6 3# pragma for controlling the regexp engine
de8c5301
YO
4use strict;
5use warnings;
6
7a529e48 7our $VERSION = "0.20";
de8c5301 8our @ISA = qw(Exporter);
ec781434 9our @EXPORT_OK = ('regmust',
192c1e27
JH
10 qw(is_regexp regexp_pattern
11 regname regnames regnames_count));
de8c5301
YO
12our %EXPORT_OK = map { $_ => 1 } @EXPORT_OK;
13
de8c5301
YO
14my %bitmask = (
15 taint => 0x00100000, # HINT_RE_TAINT
16 eval => 0x00200000, # HINT_RE_EVAL
17);
18
1e215989
FC
19my $flags_hint = 0x02000000; # HINT_RE_FLAGS
20my $PMMOD_SHIFT = 0;
21my %reflags = (
22 m => 1 << ($PMMOD_SHIFT + 0),
23 s => 1 << ($PMMOD_SHIFT + 1),
24 i => 1 << ($PMMOD_SHIFT + 2),
25 x => 1 << ($PMMOD_SHIFT + 3),
26 p => 1 << ($PMMOD_SHIFT + 4),
27# special cases:
1e215989 28 d => 0,
a62b1201
KW
29 l => 1,
30 u => 2,
cfaf538b 31 a => 3,
b4ab316d 32 aa => 4,
1e215989
FC
33);
34
de8c5301
YO
35sub setcolor {
36 eval { # Ignore errors
37 require Term::Cap;
38
39 my $terminal = Tgetent Term::Cap ({OSPEED => 9600}); # Avoid warning.
40 my $props = $ENV{PERL_RE_TC} || 'md,me,so,se,us,ue';
41 my @props = split /,/, $props;
42 my $colors = join "\t", map {$terminal->Tputs($_,1)} @props;
43
44 $colors =~ s/\0//g;
45 $ENV{PERL_RE_COLORS} = $colors;
46 };
47 if ($@) {
48 $ENV{PERL_RE_COLORS} ||= qq'\t\t> <\t> <\t\t';
49 }
50
51}
52
53my %flags = (
54 COMPILE => 0x0000FF,
55 PARSE => 0x000001,
56 OPTIMISE => 0x000002,
57 TRIEC => 0x000004,
58 DUMP => 0x000008,
f7819f85 59 FLAGS => 0x000010,
de8c5301
YO
60
61 EXECUTE => 0x00FF00,
62 INTUIT => 0x000100,
63 MATCH => 0x000200,
64 TRIEE => 0x000400,
65
66 EXTRA => 0xFF0000,
67 TRIEM => 0x010000,
68 OFFSETS => 0x020000,
69 OFFSETSDBG => 0x040000,
70 STATE => 0x080000,
71 OPTIMISEM => 0x100000,
72 STACK => 0x280000,
e7707071 73 BUFFERS => 0x400000,
2c296965 74 GPOS => 0x800000,
de8c5301 75);
e7707071 76$flags{ALL} = -1 & ~($flags{OFFSETS}|$flags{OFFSETSDBG}|$flags{BUFFERS});
de8c5301 77$flags{All} = $flags{all} = $flags{DUMP} | $flags{EXECUTE};
2c296965 78$flags{Extra} = $flags{EXECUTE} | $flags{COMPILE} | $flags{GPOS};
de8c5301
YO
79$flags{More} = $flags{MORE} = $flags{All} | $flags{TRIEC} | $flags{TRIEM} | $flags{STATE};
80$flags{State} = $flags{DUMP} | $flags{EXECUTE} | $flags{STATE};
81$flags{TRIE} = $flags{DUMP} | $flags{EXECUTE} | $flags{TRIEC};
82
ec781434
NC
83if (defined &DynaLoader::boot_DynaLoader) {
84 require XSLoader;
da4061d3 85 XSLoader::load();
de8c5301 86}
ec781434
NC
87# else we're miniperl
88# We need to work for miniperl, because the XS toolchain uses Text::Wrap, which
89# uses re 'taint'.
de8c5301
YO
90
91sub _load_unload {
92 my ($on)= @_;
93 if ($on) {
ec781434
NC
94 # We call install() every time, as if we didn't, we wouldn't
95 # "see" any changes to the color environment var since
96 # the last time it was called.
97
98 # install() returns an integer, which if casted properly
99cc5cc6 99 # in C resolves to a structure containing the regexp
ec781434
NC
100 # hooks. Setting it to a random integer will guarantee
101 # segfaults.
102 $^H{regcomp} = install();
de8c5301
YO
103 } else {
104 delete $^H{regcomp};
105 }
106}
107
108sub bits {
109 my $on = shift;
110 my $bits = 0;
1e215989 111 ARG:
de8c5301
YO
112 foreach my $idx (0..$#_){
113 my $s=$_[$idx];
114 if ($s eq 'Debug' or $s eq 'Debugcolor') {
115 setcolor() if $s =~/color/i;
116 ${^RE_DEBUG_FLAGS} = 0 unless defined ${^RE_DEBUG_FLAGS};
117 for my $idx ($idx+1..$#_) {
118 if ($flags{$_[$idx]}) {
119 if ($on) {
120 ${^RE_DEBUG_FLAGS} |= $flags{$_[$idx]};
121 } else {
122 ${^RE_DEBUG_FLAGS} &= ~ $flags{$_[$idx]};
123 }
124 } else {
125 require Carp;
126 Carp::carp("Unknown \"re\" Debug flag '$_[$idx]', possible flags: ",
127 join(", ",sort keys %flags ) );
128 }
129 }
130 _load_unload($on ? 1 : ${^RE_DEBUG_FLAGS});
131 last;
132 } elsif ($s eq 'debug' or $s eq 'debugcolor') {
133 setcolor() if $s =~/color/i;
134 _load_unload($on);
66e6b4c5 135 last;
de8c5301
YO
136 } elsif (exists $bitmask{$s}) {
137 $bits |= $bitmask{$s};
138 } elsif ($EXPORT_OK{$s}) {
de8c5301
YO
139 require Exporter;
140 re->export_to_level(2, 're', $s);
1e215989
FC
141 } elsif ($s =~ s/^\///) {
142 my $reflags = $^H{reflags} || 0;
6320bfaf 143 my $seen_charset;
342c8524 144 while ($s =~ m/( . )/gx) {
b4ab316d 145 $_ = $1;
cfaf538b 146 if (/[adul]/) {
342c8524
KW
147 # The 'a' may be repeated; hide this from the rest of the
148 # code by counting and getting rid of all of them, then
149 # changing to 'aa' if there is a repeat.
150 if ($_ eq 'a') {
151 my $sav_pos = pos $s;
152 my $a_count = $s =~ s/a//g;
153 pos $s = $sav_pos - 1; # -1 because got rid of the 'a'
154 if ($a_count > 2) {
155 require Carp;
156 Carp::carp(
157 qq 'The "a" flag may only appear a maximum of twice'
158 );
159 }
160 elsif ($a_count == 2) {
161 $_ = 'aa';
162 }
163 }
1e215989 164 if ($on) {
45a507fa 165 if ($seen_charset) {
96ef02be 166 require Carp;
45a507fa
KW
167 if ($seen_charset ne $_) {
168 Carp::carp(
169 qq 'The "$seen_charset" and "$_" flags '
170 .qq 'are exclusive'
171 );
172 }
173 else {
174 Carp::carp(
175 qq 'The "$seen_charset" flag may not appear '
176 .qq 'twice'
177 );
178 }
96ef02be 179 }
6320bfaf
KW
180 $^H{reflags_charset} = $reflags{$_};
181 $seen_charset = $_;
1e215989
FC
182 }
183 else {
6320bfaf
KW
184 delete $^H{reflags_charset}
185 if defined $^H{reflags_charset}
186 && $^H{reflags_charset} == $reflags{$_};
1e215989
FC
187 }
188 } elsif (exists $reflags{$_}) {
189 $on
190 ? $reflags |= $reflags{$_}
191 : ($reflags &= ~$reflags{$_});
192 } else {
193 require Carp;
194 Carp::carp(
195 qq'Unknown regular expression flag "$_"'
196 );
197 next ARG;
198 }
199 }
6320bfaf 200 ($^H{reflags} = $reflags or defined $^H{reflags_charset})
1e215989
FC
201 ? $^H |= $flags_hint
202 : ($^H &= ~$flags_hint);
de8c5301
YO
203 } else {
204 require Carp;
205 Carp::carp("Unknown \"re\" subpragma '$s' (known ones are: ",
206 join(', ', map {qq('$_')} 'debug', 'debugcolor', sort keys %bitmask),
207 ")");
208 }
209 }
210 $bits;
211}
212
213sub import {
214 shift;
215 $^H |= bits(1, @_);
216}
217
218sub unimport {
219 shift;
220 $^H &= ~ bits(0, @_);
221}
222
2231;
224
225__END__
56953603 226
b3eb6a9b
GS
227=head1 NAME
228
229re - Perl pragma to alter regular expression behaviour
230
231=head1 SYNOPSIS
232
e4d48cc9
GS
233 use re 'taint';
234 ($x) = ($^X =~ /^(.*)$/s); # $x is tainted here
b3eb6a9b 235
2cd61cdb 236 $pat = '(?{ $foo = 1 })';
e4d48cc9 237 use re 'eval';
2cd61cdb 238 /foo${pat}bar/; # won't fail (when not under -T switch)
e4d48cc9
GS
239
240 {
241 no re 'taint'; # the default
242 ($x) = ($^X =~ /^(.*)$/s); # $x is not tainted here
243
244 no re 'eval'; # the default
2cd61cdb 245 /foo${pat}bar/; # disallowed (with or without -T switch)
e4d48cc9 246 }
b3eb6a9b 247
1e215989
FC
248 use re '/ix';
249 "FOO" =~ / foo /; # /ix implied
250 no re '/x';
251 "FOO" =~ /foo/; # just /i implied
252
1e2e3d02
YO
253 use re 'debug'; # output debugging info during
254 /^(.*)$/s; # compile and run time
255
2cd61cdb 256
02ea72ae
IZ
257 use re 'debugcolor'; # same as 'debug', but with colored output
258 ...
259
a3621e74 260 use re qw(Debug All); # Finer tuned debugging options.
4ee9a43f 261 use re qw(Debug More);
fe759410 262 no re qw(Debug ALL); # Turn of all re debugging in this scope
4ee9a43f 263
de8c5301
YO
264 use re qw(is_regexp regexp_pattern); # import utility functions
265 my ($pat,$mods)=regexp_pattern(qr/foo/i);
266 if (is_regexp($obj)) {
267 print "Got regexp: ",
268 scalar regexp_pattern($obj); # just as perl would stringify it
269 } # but no hassle with blessed re's.
a3621e74 270
3ffabb8c
GS
271(We use $^X in these examples because it's tainted by default.)
272
b3eb6a9b
GS
273=head1 DESCRIPTION
274
de8c5301
YO
275=head2 'taint' mode
276
b3eb6a9b 277When C<use re 'taint'> is in effect, and a tainted string is the target
99cc5cc6
A
278of a regexp, the regexp memories (or values returned by the m// operator
279in list context) are tainted. This feature is useful when regexp operations
e4d48cc9
GS
280on tainted data aren't meant to extract safe substrings, but to perform
281other transformations.
b3eb6a9b 282
de8c5301
YO
283=head2 'eval' mode
284
99cc5cc6 285When C<use re 'eval'> is in effect, a regexp is allowed to contain
0b370c0a 286C<(?{ ... })> zero-width assertions and C<(??{ ... })> postponed
e128ab2c
DM
287subexpressions that are derived from variable interpolation, rather than
288appearing literally within the regexp. That is normally disallowed, since
289it is a
2cd61cdb
IZ
290potential security risk. Note that this pragma is ignored when the regular
291expression is obtained from tainted data, i.e. evaluation is always
0b370c0a 292disallowed with tainted regular expressions. See L<perlre/(?{ code })>
bb1773de 293and L<perlre/(??{ code })>.
2cd61cdb 294
ffbc6a93 295For the purpose of this pragma, interpolation of precompiled regular
0a92e3a8
GS
296expressions (i.e., the result of C<qr//>) is I<not> considered variable
297interpolation. Thus:
2cd61cdb
IZ
298
299 /foo${pat}bar/
300
ffbc6a93 301I<is> allowed if $pat is a precompiled regular expression, even
0b370c0a 302if $pat contains C<(?{ ... })> assertions or C<(??{ ... })> subexpressions.
2cd61cdb 303
1e215989
FC
304=head2 '/flags' mode
305
306When C<use re '/flags'> is specified, the given flags are automatically
307added to every regular expression till the end of the lexical scope.
308
309C<no re '/flags'> will turn off the effect of C<use re '/flags'> for the
310given flags.
311
312For example, if you want all your regular expressions to have /msx on by
313default, simply put
314
315 use re '/msx';
316
317at the top of your code.
318
cfaf538b 319The character set /adul flags cancel each other out. So, in this example,
1e215989
FC
320
321 use re "/u";
322 "ss" =~ /\xdf/;
323 use re "/d";
324 "ss" =~ /\xdf/;
325
4d220a7d 326the second C<use re> does an implicit C<no re '/u'>.
1e215989 327
59640339 328Turning on one of the character set flags with C<use re> takes precedence over the
1e215989
FC
329C<locale> pragma and the 'unicode_strings' C<feature>, for regular
330expressions. Turning off one of these flags when it is active reverts to
331the behaviour specified by whatever other pragmata are in scope. For
332example:
333
334 use feature "unicode_strings";
335 no re "/u"; # does nothing
336 use re "/l";
337 no re "/l"; # reverts to unicode_strings behaviour
338
de8c5301
YO
339=head2 'debug' mode
340
ffbc6a93 341When C<use re 'debug'> is in effect, perl emits debugging messages when
2cd61cdb
IZ
342compiling and using regular expressions. The output is the same as that
343obtained by running a C<-DDEBUGGING>-enabled perl interpreter with the
344B<-Dr> switch. It may be quite voluminous depending on the complexity
02ea72ae
IZ
345of the match. Using C<debugcolor> instead of C<debug> enables a
346form of output that can be used to get a colorful display on terminals
347that understand termcap color sequences. Set C<$ENV{PERL_RE_TC}> to a
348comma-separated list of C<termcap> properties to use for highlighting
ffbc6a93 349strings on/off, pre-point part on/off.
57e8c15d 350See L<perldebug/"Debugging Regular Expressions"> for additional info.
2cd61cdb 351
de8c5301
YO
352As of 5.9.5 the directive C<use re 'debug'> and its equivalents are
353lexically scoped, as the other directives are. However they have both
354compile-time and run-time effects.
355
356See L<perlmodlib/Pragmatic Modules>.
357
358=head2 'Debug' mode
359
a3621e74
YO
360Similarly C<use re 'Debug'> produces debugging output, the difference
361being that it allows the fine tuning of what debugging output will be
be8e71aa
YO
362emitted. Options are divided into three groups, those related to
363compilation, those related to execution and those related to special
364purposes. The options are as follows:
365
366=over 4
367
368=item Compile related options
369
370=over 4
371
372=item COMPILE
373
374Turns on all compile related debug options.
375
376=item PARSE
377
378Turns on debug output related to the process of parsing the pattern.
379
380=item OPTIMISE
381
382Enables output related to the optimisation phase of compilation.
383
24b23f37 384=item TRIEC
be8e71aa
YO
385
386Detailed info about trie compilation.
387
388=item DUMP
389
390Dump the final program out after it is compiled and optimised.
391
be8e71aa
YO
392=back
393
394=item Execute related options
395
396=over 4
397
398=item EXECUTE
399
400Turns on all execute related debug options.
401
402=item MATCH
403
404Turns on debugging of the main matching loop.
405
24b23f37 406=item TRIEE
be8e71aa
YO
407
408Extra debugging of how tries execute.
409
410=item INTUIT
411
412Enable debugging of start point optimisations.
413
414=back
415
416=item Extra debugging options
417
418=over 4
419
420=item EXTRA
421
422Turns on all "extra" debugging options.
423
e7707071
YO
424=item BUFFERS
425
c27a5cfe 426Enable debugging the capture group storage during match. Warning,
e7707071
YO
427this can potentially produce extremely large output.
428
24b23f37
YO
429=item TRIEM
430
431Enable enhanced TRIE debugging. Enhances both TRIEE
432and TRIEC.
433
434=item STATE
435
4ee9a43f 436Enable debugging of states in the engine.
24b23f37
YO
437
438=item STACK
be8e71aa 439
24b23f37
YO
440Enable debugging of the recursion stack in the engine. Enabling
441or disabling this option automatically does the same for debugging
442states as well. This output from this can be quite large.
443
444=item OPTIMISEM
445
446Enable enhanced optimisation debugging and start point optimisations.
99cc5cc6 447Probably not useful except when debugging the regexp engine itself.
24b23f37
YO
448
449=item OFFSETS
450
451Dump offset information. This can be used to see how regops correlate
452to the pattern. Output format is
453
454 NODENUM:POSITION[LENGTH]
455
456Where 1 is the position of the first char in the string. Note that position
457can be 0, or larger than the actual length of the pattern, likewise length
458can be zero.
be8e71aa 459
24b23f37 460=item OFFSETSDBG
be8e71aa
YO
461
462Enable debugging of offsets information. This emits copious
fe759410 463amounts of trace information and doesn't mesh well with other
be8e71aa
YO
464debug options.
465
fe759410 466Almost definitely only useful to people hacking
be8e71aa
YO
467on the offsets part of the debug engine.
468
469=back
470
471=item Other useful flags
472
473These are useful shortcuts to save on the typing.
474
475=over 4
476
477=item ALL
478
e7707071 479Enable all options at once except OFFSETS, OFFSETSDBG and BUFFERS
be8e71aa
YO
480
481=item All
482
fe759410 483Enable DUMP and all execute options. Equivalent to:
be8e71aa
YO
484
485 use re 'debug';
486
487=item MORE
488
489=item More
490
24b23f37 491Enable TRIEM and all execute compile and execute options.
be8e71aa 492
dba3f186 493=back
be8e71aa 494
dba3f186 495=back
a3621e74 496
1e2e3d02 497As of 5.9.5 the directive C<use re 'debug'> and its equivalents are
4ee9a43f 498lexically scoped, as the other directives are. However they have both
1e2e3d02 499compile-time and run-time effects.
b3eb6a9b 500
de8c5301 501=head2 Exportable Functions
b3eb6a9b 502
de8c5301 503As of perl 5.9.5 're' debug contains a number of utility functions that
4ee9a43f 504may be optionally exported into the caller's namespace. They are listed
de8c5301 505below.
b3eb6a9b 506
de8c5301 507=over 4
b3eb6a9b 508
de8c5301 509=item is_regexp($ref)
02ea72ae 510
de8c5301 511Returns true if the argument is a compiled regular expression as returned
4ee9a43f 512by C<qr//>, false if it is not.
02ea72ae 513
4ee9a43f
RGS
514This function will not be confused by overloading or blessing. In
515internals terms, this extracts the regexp pointer out of the
3a5e0888 516PERL_MAGIC_qr structure so it cannot be fooled.
894be9b7 517
de8c5301 518=item regexp_pattern($ref)
02ea72ae 519
4ee9a43f
RGS
520If the argument is a compiled regular expression as returned by C<qr//>,
521then this function returns the pattern.
be8e71aa 522
4ee9a43f
RGS
523In list context it returns a two element list, the first element
524containing the pattern and the second containing the modifiers used when
525the pattern was compiled.
be8e71aa 526
4ee9a43f 527 my ($pat, $mods) = regexp_pattern($ref);
a3621e74 528
99cc5cc6 529In scalar context it returns the same as perl would when stringifying a raw
4ee9a43f
RGS
530C<qr//> with the same pattern inside. If the argument is not a compiled
531reference then this routine returns false but defined in scalar context,
532and the empty list in list context. Thus the following
f9f4320a 533
dff5e0c4 534 if (regexp_pattern($ref) eq '(?^i:foo)')
dba3f186 535
de8c5301 536will be warning free regardless of what $ref actually is.
380e0b81 537
4ee9a43f
RGS
538Like C<is_regexp> this function will not be confused by overloading
539or blessing of the object.
b3eb6a9b 540
256ddcd0
YO
541=item regmust($ref)
542
432acd5f 543If the argument is a compiled regular expression as returned by C<qr//>,
99cc5cc6 544then this function returns what the optimiser considers to be the longest
432acd5f
RGS
545anchored fixed string and longest floating fixed string in the pattern.
546
547A I<fixed string> is defined as being a substring that must appear for the
548pattern to match. An I<anchored fixed string> is a fixed string that must
549appear at a particular offset from the beginning of the match. A I<floating
550fixed string> is defined as a fixed string that can appear at any point in
551a range of positions relative to the start of the match. For example,
552
553 my $qr = qr/here .* there/x;
554 my ($anchored, $floating) = regmust($qr);
256ddcd0 555 print "anchored:'$anchored'\nfloating:'$floating'\n";
432acd5f 556
256ddcd0
YO
557results in
558
559 anchored:'here'
560 floating:'there'
561
432acd5f
RGS
562Because the C<here> is before the C<.*> in the pattern, its position
563can be determined exactly. That's not true, however, for the C<there>;
564it could appear at any point after where the anchored string appeared.
256ddcd0
YO
565Perl uses both for its optimisations, prefering the longer, or, if they are
566equal, the floating.
567
568B<NOTE:> This may not necessarily be the definitive longest anchored and
432acd5f 569floating string. This will be what the optimiser of the Perl that you
256ddcd0
YO
570are using thinks is the longest. If you believe that the result is wrong
571please report it via the L<perlbug> utility.
572
28d8d7f4 573=item regname($name,$all)
44a2ac75 574
28d8d7f4
YO
575Returns the contents of a named buffer of the last successful match. If
576$all is true, then returns an array ref containing one entry per buffer,
44a2ac75
YO
577otherwise returns the first defined buffer.
578
28d8d7f4 579=item regnames($all)
44a2ac75 580
28d8d7f4
YO
581Returns a list of all of the named buffers defined in the last successful
582match. If $all is true, then it returns all names defined, if not it returns
583only names which were involved in the match.
44a2ac75 584
28d8d7f4 585=item regnames_count()
44a2ac75 586
28d8d7f4
YO
587Returns the number of distinct names defined in the pattern used
588for the last successful match.
44a2ac75 589
28d8d7f4
YO
590B<Note:> this result is always the actual number of distinct
591named buffers defined, it may not actually match that which is
592returned by C<regnames()> and related routines when those routines
593have not been called with the $all parameter set.
44a2ac75 594
de8c5301 595=back
b3eb6a9b 596
de8c5301 597=head1 SEE ALSO
b3eb6a9b 598
de8c5301
YO
599L<perlmodlib/Pragmatic Modules>.
600
601=cut