This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add alphabetic synonyms for regex assertions
authorKarl Williamson <khw@cpan.org>
Mon, 19 Feb 2018 04:30:17 +0000 (21:30 -0700)
committerKarl Williamson <khw@cpan.org>
Mon, 19 Feb 2018 05:00:33 +0000 (22:00 -0700)
This commit came out of the committee formed with PCRE members as a
result of http://nntp.perl.org/group/perl.perl5.porters/246762

MANIFEST
pod/perldelta.pod
pod/perldiag.pod
pod/perlexperiment.pod
pod/perlre.pod
pod/perlreref.pod
pod/perlretut.pod
regcomp.c
t/re/alpha_assertions.t [new file with mode: 0644]
t/re/reg_mesg.t
t/re/regexp.t

index c4995e3..e31f26f 100644 (file)
--- a/MANIFEST
+++ b/MANIFEST
@@ -5882,6 +5882,7 @@ t/porting/regen.t         Check that regen.pl doesn't need running
 t/porting/ss_dup.t             Check that sv.c:ss_dup handles everything
 t/porting/test_bootstrap.t     Test that the instructions for test bootstrapping aren't accidentally overlooked.
 t/porting/utils.t              Check that utility scripts still compile
+t/re/alpha_assertions.t                See if things like '(*postive_lookahed:...) work properly
 t/re/anyof.t                   See if bracketed char classes [...] compile properly
 t/re/charset.t                 See if regex modifiers like /d, /u work properly
 t/re/fold_grind.t              See if case folding works properly
index 6304fa8..691135e 100644 (file)
@@ -55,6 +55,24 @@ abbreviated form for it.  The syntax is now either of:
 
 Previously a C<"+"> was used instead of the C<"*">.
 
+=head2 Experimentally, there are now alphabetic synonyms for some
+regular expression assertions
+
+If you find it difficult to remember how to write certain of the pattern
+assertions, there are now alphabetic synonyms.
+
+ CURRENT                NEW SYNONYMS
+ ------                 ------------
+ (?=...)        (*pla:...) or (*positive_lookahead:...)
+ (?!...)        (*nla:...) or (*negative_lookahead:...)
+ (?<=...)       (*plb:...) or (*positive_lookbehind:...)
+ (?<!...)       (*nlb:...) or (*negative_lookbehind:...)
+ (?>...)        (*atomic:...)
+
+These are considered experimental, so using any of these will raise
+(unless turned off) a warning in the C<experimental::alpha_assertions>
+category.
+
 =head1 Security
 
 XXX Any security-related notices go here.  In particular, any security
index d070ba3..3e69aae 100644 (file)
@@ -6100,6 +6100,12 @@ as a compiler directive.  You may say only one of
 This is to prevent the problem of one module changing the array base out
 from under another module inadvertently.  See L<perlvar/$[> and L<arybase>.
 
+=item The alpha_assertions feature is experimental
+
+(S experimental::alpha_assertions) This feature is experimental
+and its behavior may in any future release of perl.  See
+L<perlre/Extended Patterns>.
+
 =item The crypt() function is unimplemented due to excessive paranoia.
 
 (F) Configure couldn't find the crypt() function on your machine,
@@ -6696,15 +6702,21 @@ m/%s/
 (F) The condition part of a (?(condition)if-clause|else-clause) construct
 is not known.  The condition must be one of the following:
 
- (1) (2) ...        true if 1st, 2nd, etc., capture matched
- (<NAME>) ('NAME')  true if named capture matched
- (?=...) (?<=...)   true if subpattern matches
- (?!...) (?<!...)   true if subpattern fails to match
- (?{ CODE })        true if code returns a true value
- (R)                true if evaluating inside recursion
- (R1) (R2) ...      true if directly inside capture group 1, 2, etc.
- (R&NAME)           true if directly inside named capture
- (DEFINE)           always false; for defining named subpatterns
+ (1) (2) ...            true if 1st, 2nd, etc., capture matched
+ (<NAME>) ('NAME')      true if named capture matched
+ (?=...) (?<=...)       true if subpattern matches
+ (*pla:...) (*plb:...)  true if subpattern matches; also
+                             (*positive_lookahead:...)
+                             (*positive_lookbehind:...)
+ (*nla:...) (*nlb:...)  true if subpattern fails to match; also
+                             (*negative_lookahead:...)
+                             (*negative_lookbehind:...)
+ (?{ CODE })            true if code returns a true value
+ (R)                    true if evaluating inside recursion
+ (R1) (R2) ...          true if directly inside capture group 1, 2,
+                             etc.
+ (R&NAME)               true if directly inside named capture
+ (DEFINE)               always false; for defining named subpatterns
 
 The S<<-- HERE> shows whereabouts in the regular expression the problem was
 discovered.  See L<perlre>.
index 8c2c8f0..7963c05 100644 (file)
@@ -127,6 +127,14 @@ C<experimental::script_run>.
 
 See also: L<perlre/Script Runs>
 
+=item Alpabetic assertions
+
+Introduced in Perl 5.28.0
+
+Using this feature triggers warnings in the category
+C<experimental::alpha_assertions>.
+
+See also: L<perlre/Extended Patterns>.
 
 =back
 
index e9a5e5f..b5d5517 100644 (file)
@@ -1570,13 +1570,30 @@ lookahead matches text following the current match position.
 =over 4
 
 =item C<(?=pattern)>
-X<(?=)> X<look-ahead, positive> X<lookahead, positive>
+
+=item C<(*pla:pattern)>
+
+=item C<(*positive_lookahead:pattern)>
+X<(?=)>
+X<(*pla>
+X<(*positive_lookahead>
+X<look-ahead, positive> X<lookahead, positive>
 
 A zero-width positive lookahead assertion.  For example, C</\w+(?=\t)/>
 matches a word followed by a tab, without including the tab in C<$&>.
 
+The alphabetic forms are experimental; using them yields a warning in the
+C<experimental::alpha_assertions> category.
+
 =item C<(?!pattern)>
-X<(?!)> X<look-ahead, negative> X<lookahead, negative>
+
+=item C<(*nla:pattern)>
+
+=item C<(*negative_lookahead:pattern)>
+X<(?!)>
+X<(*nla>
+X<(*negative_lookahead>
+X<look-ahead, negative> X<lookahead, negative>
 
 A zero-width negative lookahead assertion.  For example C</foo(?!bar)/>
 matches any occurrence of "foo" that isn't followed by "bar".  Note
@@ -1588,10 +1605,20 @@ will not do what you want.  That's because the C<(?!foo)> is just saying that
 the next thing cannot be "foo"--and it's not, it's a "bar", so "foobar" will
 match.  Use lookbehind instead (see below).
 
+The alphabetic forms are experimental; using them yields a warning in the
+C<experimental::alpha_assertions> category.
+
 =item C<(?<=pattern)>
 
 =item C<\K>
-X<(?<=)> X<look-behind, positive> X<lookbehind, positive> X<\K>
+
+=item C<(*plb:pattern)>
+
+=item C<(*positive_lookbehind:pattern)>
+X<(?<=)>
+X<(*plb>
+X<(*positive_lookbehind>
+X<look-behind, positive> X<lookbehind, positive> X<\K>
 
 A zero-width positive lookbehind assertion.  For example, C</(?<=\t)\w+/>
 matches a word that follows a tab, without including the tab in C<$&>.
@@ -1615,13 +1642,26 @@ can be rewritten as the much more efficient
 
   s/foo\Kbar//g;
 
+The alphabetic forms (not including C<\K> are experimental; using them
+yields a warning in the C<experimental::alpha_assertions> category.
+
 =item C<(?<!pattern)>
-X<(?<!)> X<look-behind, negative> X<lookbehind, negative>
+
+=item C<(*nlb:pattern)>
+
+=item C<(*negative_lookbehind:pattern)>
+X<(?<!)>
+X<(*nlb>
+X<(*negative_lookbehind>
+X<look-behind, negative> X<lookbehind, negative>
 
 A zero-width negative lookbehind assertion.  For example C</(?<!bar)foo/>
 matches any occurrence of "foo" that does not follow "bar".  Works
 only for fixed-width lookbehind.
 
+The alphabetic forms are experimental; using them yields a warning in the
+C<experimental::alpha_assertions> category.
+
 =back
 
 =item C<< (?<NAME>pattern) >>
@@ -2096,6 +2136,10 @@ compile the definitions with the C<qr//> operator, and later
 interpolate them in another pattern.
 
 =item C<< (?>pattern) >>
+
+=item C<< (*atomic:pattern) >>
+X<(?E<gt>pattern)>
+X<(*atomic>
 X<backtrack> X<backtracking> X<atomic> X<possessive>
 
 An "independent" subexpression, one which matches the substring
@@ -2204,6 +2248,9 @@ to inside of one of these constructs. The following equivalences apply:
     PAT?+               (?>PAT?)
     PAT{min,max}+       (?>PAT{min,max})
 
+The alphabetic form (C<(*atomic:...)>) is experimental; using it
+yields a warning in the C<experimental::alpha_assertions> category.
+
 =item C<(?[ ])>
 
 See L<perlrecharclass/Extended Bracketed Character Classes>.
index c9deafa..aaac153 100644 (file)
@@ -234,10 +234,15 @@ There is no quantifier C<{,n}>. That's interpreted as a literal string.
    (?:...)           Groups subexpressions without capturing (cluster)
    (?pimsx-imsx:...) Enable/disable option (as per m// modifiers)
    (?=...)           Zero-width positive lookahead assertion
+   (?*pla:...)       Same; avail experimentally starting in 5.28
    (?!...)           Zero-width negative lookahead assertion
+   (?*nla:...)       Same; avail experimentally starting in 5.28
    (?<=...)          Zero-width positive lookbehind assertion
+   (?*plb:...)       Same; avail experimentally starting in 5.28
    (?<!...)          Zero-width negative lookbehind assertion
+   (?*nlb:...)       Same; avail experimentally starting in 5.28
    (?>...)           Grab what we can, prohibit backtracking
+   (?*atomic:...)    Same; avail experimentally starting in 5.28
    (?|...)           Branch reset
    (?<name>...)      Named capture
    (?'name'...)      Named capture
index 9c1671e..2f7670e 100644 (file)
@@ -2355,6 +2355,18 @@ by looking ahead and behind:
                   | (?<=-)  (?=\S)   # a '-' followed by any non-space
                   /x, $str;          # @toks = qw(one two - - - 6 - 8)
 
+Starting in Perl 5.28, experimentally, alphabetic equivalents to these
+assertions are added, so you can use whichever is most memorable for
+your tastes.
+
+ (?=...)        (*pla:...) or (*positive_lookahead:...)
+ (?!...)        (*nla:...) or (*negative_lookahead:...)
+ (?<=...)       (*plb:...) or (*positive_lookbehind:...)
+ (?<!...)       (*nlb:...) or (*negative_lookbehind:...)
+ (?>...)        (*atomic:...)
+
+Using any of these will raise (unless turned off) a warning in the
+C<experimental::alpha_assertions> category.
 
 =head2 Using independent subexpressions to prevent backtracking
 
index 25f772e..ef15407 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -10812,6 +10812,40 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                     RExC_seen |= REG_CUTGROUP_SEEN;
                 }
                 break;
+            case 'a':
+                if (memEQs(start_verb, verb_len, "atomic")) {
+                    paren = 't';    /* AtOMIC */
+                    goto alpha_assertions;
+                }
+                break;
+            case 'p':
+                if (   memEQs(start_verb, verb_len, "plb")
+                    || memEQs(start_verb, verb_len, "positive_lookbehind"))
+                {
+                    paren = 'b';
+                    goto lookbehind_alpha_assertions;
+                }
+                else if (   memEQs(start_verb, verb_len, "pla")
+                         || memEQs(start_verb, verb_len, "positive_lookahead"))
+                {
+                    paren = 'a';
+                    goto alpha_assertions;
+                }
+                break;
+            case 'n':
+                if (   memEQs(start_verb, verb_len, "nlb")
+                    || memEQs(start_verb, verb_len, "negative_lookbehind"))
+                {
+                    paren = 'B';
+                    goto lookbehind_alpha_assertions;
+                }
+                else if (   memEQs(start_verb, verb_len, "nla")
+                         || memEQs(start_verb, verb_len, "negative_lookahead"))
+                {
+                    paren = 'A';
+                    goto alpha_assertions;
+                }
+                break;
             case 's':
                 if (   memEQs(start_verb, verb_len, "sr")
                     || memEQs(start_verb, verb_len, "script_run"))
@@ -10851,6 +10885,36 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
 
                 break;
 
+            lookbehind_alpha_assertions:
+                RExC_seen |= REG_LOOKBEHIND_SEEN;
+                RExC_in_lookbehind++;
+                /*FALLTHROUGH*/
+
+            alpha_assertions:
+
+                if (PASS2) {
+                    Perl_ck_warner_d(aTHX_
+                        packWARN(WARN_EXPERIMENTAL__ALPHA_ASSERTIONS),
+                        "The alpha_assertions feature is experimental"
+                        REPORT_LOCATION, REPORT_LOCATION_ARGS(RExC_parse));
+                }
+
+                RExC_seen_zerolen++;
+
+                if (! start_arg) {
+                    goto no_colon;
+                }
+
+                /* An empty negative lookahead assertion simply is failure */
+                if (paren == 'A' && RExC_parse == start_arg) {
+                    ret=reganode(pRExC_state, OPFAIL, 0);
+                    nextchar(pRExC_state);
+                    return ret;
+               }
+
+                RExC_parse = start_arg;
+                goto parse_rest;
+
               no_colon:
                 vFAIL2utf8f(
                 "'(*%" UTF8f "' requires a terminating ':'",
@@ -11033,6 +11097,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                    paren = 1;
                    goto capturing_parens;
                }
+
                 RExC_seen |= REG_LOOKBEHIND_SEEN;
                RExC_in_lookbehind++;
                RExC_parse++;
@@ -11263,12 +11328,37 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
            {
                int is_define= 0;
                 const int DEFINE_len = sizeof("DEFINE") - 1;
-               if (RExC_parse[0] == '?') {        /* (?(?...)) */
-                    if (   RExC_parse < RExC_end - 1
-                        && (   RExC_parse[1] == '='
-                            || RExC_parse[1] == '!'
-                            || RExC_parse[1] == '<'
-                            || RExC_parse[1] == '{')
+               if (    RExC_parse < RExC_end - 1
+                    && (   (       RExC_parse[0] == '?'        /* (?(?...)) */
+                            && (   RExC_parse[1] == '='
+                                || RExC_parse[1] == '!'
+                                || RExC_parse[1] == '<'
+                                || RExC_parse[1] == '{'))
+                       || (       RExC_parse[0] == '*'        /* (?(*...)) */
+                            && (   memBEGINs(RExC_parse +1,
+                                             (Size_t) (RExC_end - (RExC_parse + 1)),
+                                             "pla:")
+                                || memBEGINs(RExC_parse +1,
+                                             (Size_t) (RExC_end - (RExC_parse + 1)),
+                                             "plb")
+                                || memBEGINs(RExC_parse +1,
+                                             (Size_t) (RExC_end - (RExC_parse + 1)),
+                                             "nla")
+                                || memBEGINs(RExC_parse +1,
+                                             (Size_t) (RExC_end - (RExC_parse + 1)),
+                                             "nlb")
+                                || memBEGINs(RExC_parse +1,
+                                             (Size_t) (RExC_end - (RExC_parse + 1)),
+                                             "positive_lookahead")
+                                || memBEGINs(RExC_parse +1,
+                                             (Size_t) (RExC_end - (RExC_parse + 1)),
+                                             "positive_lookbehind")
+                                || memBEGINs(RExC_parse +1,
+                                             (Size_t) (RExC_end - (RExC_parse + 1)),
+                                             "negative_lookahead")
+                                || memBEGINs(RExC_parse +1,
+                                             (Size_t) (RExC_end - (RExC_parse + 1)),
+                                             "negative_lookbehind"))))
                     ) { /* Lookahead or eval. */
                        I32 flag;
                         regnode *tail;
@@ -11285,10 +11375,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                         REGTAIL(pRExC_state, ret, tail);
                        goto insert_if;
                    }
-                   /* Fall through to ‘Unknown switch condition’ at the
-                      end of the if/else chain. */
-               }
-               else if ( RExC_parse[0] == '<'     /* (?(<NAME>)...) */
+               else if (   RExC_parse[0] == '<'     /* (?(<NAME>)...) */
                         || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
                {
                    char ch = RExC_parse[0] == '<' ? '>' : '\'';
@@ -11601,11 +11688,16 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
             RExC_in_script_run = 0;
            break;
        case '<':
+        case 'a':
+        case 'A':
+        case 'b':
+        case 'B':
        case ',':
        case '=':
        case '!':
            *flagp &= ~HASWIDTH;
            /* FALLTHROUGH */
+        case 't':   /* aTomic */
        case '>':
            ender = reg_node(pRExC_state, SUCCEED);
            break;
@@ -11691,14 +11783,17 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
 
     {
         const char *p;
-        static const char parens[] = "=!<,>";
+         /* Even/odd or x=don't care: 010101x10x */
+        static const char parens[] = "=!aA<,>Bbt";
+         /* flag below is set to 0 up through 'A'; 1 for larger */
 
        if (paren && (p = strchr(parens, paren))) {
            U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
-           int flag = (p - parens) > 1;
+           int flag = (p - parens) > 3;
 
-           if (paren == '>')
+           if (paren == '>' || paren == 't') {
                node = SUSPEND, flag = 0;
+            }
            reginsert(pRExC_state, node,ret, depth+1);
             Set_Node_Cur_Length(ret, parse_start);
            Set_Node_Offset(ret, parse_start + 1);
diff --git a/t/re/alpha_assertions.t b/t/re/alpha_assertions.t
new file mode 100644 (file)
index 0000000..3d28bbc
--- /dev/null
@@ -0,0 +1,18 @@
+#!./perl
+
+use strict;
+use warnings;
+no warnings 'once';
+
+# This tests that the alphabetic assertions, like '(*atomic:...) work
+# It just sets a flag and calls regexp.t which will run through its test
+# suite, modifiying the tests to use the alphabetic synonyms.
+
+BEGIN { $::alpha_assertions = 1; }
+for my $file ('./re/regexp.t', './t/re/regexp.t', ':re:regexp.t') {
+    if (-r $file) {
+       do $file or die $@;
+       exit;
+    }
+}
+die "Cannot find ./re/regexp.t or ./t/re/regexp.t\n";
index aff5535..1bab9df 100644 (file)
@@ -290,6 +290,7 @@ my @death =
  '/(*srfoo)/' => 'Unknown \'(*...)\' construct \'srfoo\' {#} m/(*srfoo){#}/',
  '/(*script_run)/' => '\'(*script_run\' requires a terminating \':\' {#} m/(*script_run{#})/',
  '/(*sr)/' => '\'(*sr\' requires a terminating \':\' {#} m/(*sr{#})/',
+ '/(*pla)/' => '\'(*pla\' requires a terminating \':\' {#} m/(*pla{#})/',
  '/(*script_run/' => 'Unterminated \'(*...\' construct {#} m/(*script_run{#}/',
  '/(*sr/' => 'Unterminated \'(*...\' construct {#} m/(*sr{#}/',
  '/(*script_run:foo/' => 'Unterminated \'(*...\' argument {#} m/(*script_run:foo{#}/',
index cced1e0..835cbdc 100644 (file)
@@ -99,7 +99,10 @@ sub convert_from_ascii {
 use strict;
 use warnings FATAL=>"all";
 our ($bang, $ffff, $nulnul); # used by the tests
-our ($qr, $skip_amp, $qr_embed, $qr_embed_thr, $regex_sets, $no_null); # set by our callers
+our ($qr, $skip_amp, $qr_embed, $qr_embed_thr, $regex_sets, $alpha_assertions, $no_null); # set by our callers
+
+my $expanded_text = "expanded name from original test number";
+my $expanded_text_re = qr/$expanded_text/;
 
 if (!defined $file) {
     open TESTS, 're/re_tests' or die "Can't open re/re_tests: $!";
@@ -115,7 +118,6 @@ $nulnul = "\0" x 2;
 my $OP = $qr ? 'qr' : 'm';
 
 $| = 1;
-printf "1..%d\n# $iters iterations\n", scalar @tests;
 
 my $test;
 TEST:
@@ -131,6 +133,7 @@ foreach (@tests) {
     chomp;
     s/\\n/\n/g unless $regex_sets;
     my ($pat, $subject, $result, $repl, $expect, $reason, $comment) = split(/\t/,$_,7);
+    $comment = "" unless defined $comment;
     if (!defined $subject) {
         die "Bad test definition on line $test: $_\n";
     }
@@ -180,7 +183,42 @@ foreach (@tests) {
         $comment=~s/^\s*(?:#\s*)?//;
         $testname .= " - $comment" if $comment;
     }
-    if (! $skip && $regex_sets) {
+    if (! $skip && $alpha_assertions) {
+        my $assertions_re = qr/ (?: \Q(?\E (?: > | <? [=>] ) ) /x;
+        if ($pat !~ $assertions_re && $comment !~ $expanded_text_re) {
+            $skip++;
+            $reason = "Pattern doesn't contain assertions";
+        }
+        elsif ($comment !~ $expanded_text_re) {
+            my $expanded_pat = $pat;
+
+            $pat =~ s/\( \? > /(*atomic:/xg;
+
+            if ($pat =~ s/\( \? = /(*pla:/xg) {
+                $expanded_pat =~ s//(*positive_lookahead:/g;
+            }
+            if ($pat =~ s/\( \? ! /(*nla:/xg) {
+                $expanded_pat =~ s//(*negative_lookahead:/g;
+            }
+            if ($pat =~ s/\( \? <= /(*plb:/xg) {
+                $expanded_pat =~ s//(*positive_lookbehind:/g;
+            }
+            if ($pat =~ s/\( \? <! /(*nlb:/xg) {
+                $expanded_pat =~ s//(*negative_lookbehind:/g;
+            }
+            if ($expanded_pat ne $pat) {
+                $comment .= " $expanded_text $test";
+                push @tests, join "\t", $expanded_pat,
+                                        $subject // "",
+                                        $result // "",
+                                        $repl // "",
+                                        $expect // "",
+                                        $reason // "",
+                                        $comment;
+            }
+        }
+    }
+    elsif (! $skip && $regex_sets) {
 
         # If testing regex sets, change the [bracketed] classes into
         # (?[bracketed]).  But note that '\[' and '\c[' don't introduce such a
@@ -412,6 +450,7 @@ EOFCODE
 EOFCODE
         }
         $code = "no warnings 'experimental::regex_sets';$code" if $regex_sets;
+        $code = "no warnings 'experimental::alpha_assertions';$code" if $alpha_assertions;
         #$code.=qq[\n\$expect="$expect";\n];
         #use Devel::Peek;
         #die Dump($code) if $pat=~/\\h/ and $subject=~/\x{A0}/;
@@ -465,4 +504,6 @@ EOFCODE
     print "ok $testname$todo\n";
 }
 
+printf "1..%d\n# $iters iterations\n", scalar @tests;
+
 1;