Change syntax of script runs

author Karl Williamson <khw@cpan.org>

Mon, 19 Feb 2018 03:28:34 +0000 (20:28 -0700)

committer Karl Williamson <khw@cpan.org>

Mon, 19 Feb 2018 05:00:33 +0000 (22:00 -0700)
author Karl Williamson <khw@cpan.org>
Mon, 19 Feb 2018 03:28:34 +0000 (20:28 -0700)
committer Karl Williamson <khw@cpan.org>
Mon, 19 Feb 2018 05:00:33 +0000 (22:00 -0700)
diff --git a/pod/perldelta.pod b/pod/perldelta.pod

index e640f76..6304fa8 100644 (file)
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -44,6 +44,17 @@ The implication is that you are now free to use locales and changes them
  in a threaded environment.  Your changes affect only your thread.
  See L<perllocale/Multi-threaded operation>
  
+=head2 Script runs now are specified with a different syntax
+
+This isn't really an enhancement, but is being put in this category
+because it changes an enhancement from 5.27.8, and there is a new
+abbreviated form for it.  The syntax is now either of:
+
+ (*script_run:...)
+ (*sr:...)
+
+Previously a C<"+"> was used instead of the C<"*">.
+
  =head1 Security
  
  XXX Any security-related notices go here.  In particular, any security
diff --git a/pod/perldiag.pod b/pod/perldiag.pod

index b4f88fa..d070ba3 100644 (file)
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -3009,12 +3009,13 @@ expression pattern should be an indivisible token, with nothing
  intervening between the C<"("> and the C<"?">, but you separated them
  with whitespace.
  
-=item In '(+...)', the '(' and '+' must be adjacent in regex;
+=item In '(*...)', the '(' and '*' must be adjacent in regex;
  marked by S<<-- HERE> in m/%s/
  
-(F) The two-character sequence C<"(+"> in this context in a regular
+(F) The two-character sequence C<"(*"> in this context in a regular
  expression pattern should be an indivisible token, with nothing
-intervening between the C<"("> and the C<"+">, but you separated them.
+intervening between the C<"("> and the C<"*">, but you separated them.
+Fix the pattern and retry.
  
  =item Invalid %s attribute: %s
  
@@ -5423,6 +5424,11 @@ terminates.  You might use ^# instead.  See L<perlform>.
  search list.  So the additional elements in the replacement list
  are meaningless.
  
+=item '(*%s' requires a terminating ':' in regex; marked by <-- HERE in m/%s/
+
+(F) You used a construct that needs a colon and pattern argument.
+Supply these or check that you are using the right construct.
+
  =item '%s' resolved to '\o{%s}%d'
  
  (W misc, regexp)  You wrote something like C<\08>, or C<\179> in a
@@ -6625,6 +6631,11 @@ exactly, regardless of whether C<:loose> is used or not.)  This error may
  also happen if the C<\N{}> is not in the scope of the corresponding
  C<S<use charnames>>.
  
+=item Unknown '(*...)' construct '%s' in regex; marked by <-- HERE in m/%s/
+
+(F) The C<(*> was followed by something that the regular expression
+compiler does not recognize.  Check your spelling.
+
  =item Unknown error
  
  (P) Perl was about to print an error message in C<$@>, but the C<$@> variable
@@ -6644,11 +6655,6 @@ your needs.
  of valid modes: C<< < >>, C<< > >>, C<<< >> >>>, C<< +< >>,
  C<< +> >>, C<<< +>> >>>, C<-|>, C<|->, C<< <& >>, C<< >& >>.
  
-=item Unknown (+ pattern in regex; marked by S<<-- HERE> in m/%s/
-
-(F) The C<(+> was followed by something that the regular expression
-compiler does not recognize.  Check your spelling.
-
  =item Unknown PerlIO layer "%s"
  
  (W layer) An attempt was made to push an unknown layer onto the Perl I/O
@@ -6841,6 +6847,11 @@ declares it to be in a Unicode encoding that Perl cannot read.
  (F) Your machine doesn't support the Berkeley socket mechanism, or at
  least that's what Configure thought.
  
+=item Unterminated '(*...' argument in regex; marked by <-- HERE in m/%s/
+
+(F) You used a pattern of the form C<(*...:...)> but did not terminate
+the pattern with a C<)>.  Fix the pattern and retry.
+
  =item Unterminated attribute list
  
  (F) The lexer found something other than a simple identifier at the
@@ -6861,6 +6872,11 @@ character to get your parentheses to balance.  See L<attributes>.
  compressed integer format and could not be converted to an integer.
  See L<perlfunc/pack>.
  
+=item Unterminated '(*...' construct in regex; marked by <-- HERE in m/%s/
+
+(F) You used a pattern of the form C<(*...)> but did not terminate
+the pattern with a C<)>.  Fix the pattern and retry.
+
  =item Unterminated delimiter for here document
  
  (F) This message occurs when a here document label has an initial
diff --git a/pod/perlre.pod b/pod/perlre.pod

index 74f44fe..e9a5e5f 100644 (file)
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -708,7 +708,7 @@ the pattern uses L</C<(?[ ])>>
  
  =item 8
  
-the pattern uses L<C<(+script_run: ...)>|/Script Runs>
+the pattern uses L<C<(*script_run: ...)>|/Script Runs>
  
  =back
  
@@ -2421,6 +2421,7 @@ where side-effects of lookahead I<might> have influenced the
  following match, see L</C<< (?>pattern) >>>.
  
  =head2 Script Runs
+X<(*script_run:...)> X<(sr:...)>
  
  A script run is basically a sequence of characters, all from the same
  Unicode script (see L<perlunicode/Scripts>), such as Latin or Greek.  In
@@ -2438,9 +2439,11 @@ the real Paypal website, but an attacker would craft a look-alike one to
  attempt to gather sensitive information from the person.
  
  Starting in Perl 5.28, it is now easy to detect strings that aren't
-script runs.  Simply enclose just about any pattern like this:
+script runs.  Simply enclose just about any pattern like either of
+these:
  
- (+script_run:pattern)
+ (*script_run:pattern)
+ (*sr:pattern)
  
  What happens is that after I<pattern> succeeds in matching, it is
  subjected to the additional criterion that every character in it must be
@@ -2451,7 +2454,7 @@ backtracking, but generally, only malicious input will result in this,
  though the slow down could cause a denial of service attack.  If your
  needs permit, it is best to make the pattern atomic.
  
- (+script_run:(?>pattern))
+ (*script_run:(?>pattern))
  
  (See L</C<(?E<gt>pattern)>>.)
  
@@ -2470,7 +2473,7 @@ own set.  This is because these are often used in commerce even in such
  scripts.  But any mixing of the ASCII and other digits will cause the
  sequence to not be a script run, failing the match.  As an example,
  
- qr/(+script_run: \d+ \b )/x
+ qr/(*script_run: \d+ \b )/x
  
  guarantees that the digits matched will all be from the same set of 10.
  You won't get a look-alike digit from a different script that has a
diff --git a/regcomp.c b/regcomp.c

index 50b7427..25f772e 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -10699,45 +10699,48 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
           * here (if paren ==2).  The forms '(*VERB' and '(?...' disallow such
           * intervening space, as the sequence is a token, and a token should be
           * indivisible */
-        bool has_intervening_patws = (paren == 2 || paren == 's')
+        bool has_intervening_patws = (paren == 2)
                                    && *(RExC_parse - 1) != '(';
  
          if (RExC_parse >= RExC_end) {
             vFAIL("Unmatched (");
          }
  
-        if (paren == 's') {
-
-            /* A nested script run  is a no-op besides clustering */
-            if (RExC_in_script_run) {
-                paren = ':';
-                nextchar(pRExC_state);
-                ret = NULL;
-                goto parse_rest;
-            }
-            RExC_in_script_run = 1;
-
-           ret = reg_node(pRExC_state, SROPEN);
-            is_open = 1;
-        }
-        else if ( *RExC_parse == '*') { /* (*VERB:ARG) */
+        if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
             char *start_verb = RExC_parse + 1;
             STRLEN verb_len;
             char *start_arg = NULL;
             unsigned char op = 0;
              int arg_required = 0;
              int internal_argval = -1; /* if >-1 we are not allowed an argument*/
+            bool has_upper = FALSE;
  
              if (has_intervening_patws) {
                  RExC_parse++;   /* past the '*' */
-                vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
+
+                /* For strict backwards compatibility, don't change the message
+                 * now that we also have lowercase operands */
+                if (isUPPER(*RExC_parse)) {
+                    vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
+                }
+                else {
+                    vFAIL("In '(*...)', the '(' and '*' must be adjacent");
+                }
              }
             while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
                 if ( *RExC_parse == ':' ) {
                     start_arg = RExC_parse + 1;
                     break;
                 }
-               RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+                else if (! UTF) {
+                    if (isUPPER(*RExC_parse)) {
+                        has_upper = TRUE;
+                    }
+                    RExC_parse++;
+                }
+                else {
+                    RExC_parse += UTF8SKIP(RExC_parse);
+                }
             }
             verb_len = RExC_parse - start_verb;
             if ( start_arg ) {
@@ -10746,16 +10749,27 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                  }
  
                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
-               while ( RExC_parse < RExC_end && *RExC_parse != ')' )
+               while ( RExC_parse < RExC_end && *RExC_parse != ')' ) {
                      RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
-               if ( RExC_parse >= RExC_end || *RExC_parse != ')' )
+                }
+               if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
                    unterminated_verb_pattern:
-                   vFAIL("Unterminated verb pattern argument");
-               if ( RExC_parse == start_arg )
-                   start_arg = NULL;
+                    if (has_upper) {
+                        vFAIL("Unterminated verb pattern argument");
+                    }
+                    else {
+                        vFAIL("Unterminated '(*...' argument");
+                    }
+                }
             } else {
-               if ( RExC_parse >= RExC_end || *RExC_parse != ')' )
-                   vFAIL("Unterminated verb pattern");
+               if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
+                    if (has_upper) {
+                        vFAIL("Unterminated verb pattern");
+                    }
+                    else {
+                        vFAIL("Unterminated '(*...' construct");
+                    }
+                }
             }
  
              /* Here, we know that RExC_parse < RExC_end */
@@ -10798,13 +10812,68 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                      RExC_seen |= REG_CUTGROUP_SEEN;
                  }
                  break;
-           }
+            case 's':
+                if (   memEQs(start_verb, verb_len, "sr")
+                    || memEQs(start_verb, verb_len, "script_run"))
+                {
+                    paren = 's';
+
+                    /* This indicates Unicode rules. */
+                    REQUIRE_UNI_RULES(flagp, NULL);
+
+                    if (! start_arg) {
+                        goto no_colon;
+                    }
+
+                    RExC_parse = start_arg;
+
+                    if (PASS2) {
+                        Perl_ck_warner_d(aTHX_
+                            packWARN(WARN_EXPERIMENTAL__SCRIPT_RUN),
+                            "The script_run feature is experimental"
+                            REPORT_LOCATION, REPORT_LOCATION_ARGS(RExC_parse));
+
+                    }
+
+                    if (RExC_in_script_run) {
+                        paren = ':';
+                        nextchar(pRExC_state);
+                        ret = NULL;
+                        goto parse_rest;
+                    }
+                    RExC_in_script_run = 1;
+
+                    ret = reg_node(pRExC_state, SROPEN);
+
+                    is_open = 1;
+                    goto parse_rest;
+                }
+
+                break;
+
+              no_colon:
+                vFAIL2utf8f(
+                "'(*%" UTF8f "' requires a terminating ':'",
+                UTF8fARG(UTF, verb_len, start_verb));
+               NOT_REACHED; /*NOTREACHED*/
+
+           } /* End of switch */
             if ( ! op ) {
                 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
-                vFAIL2utf8f(
+                if (has_upper || verb_len == 0) {
+                    vFAIL2utf8f(
                      "Unknown verb pattern '%" UTF8f "'",
                      UTF8fARG(UTF, verb_len, start_verb));
+                }
+                else {
+                    vFAIL2utf8f(
+                    "Unknown '(*...)' construct '%" UTF8f "'",
+                    UTF8fARG(UTF, verb_len, start_verb));
+                }
             }
+            if ( RExC_parse == start_arg ) {
+                start_arg = NULL;
+            }
              if ( arg_required && !start_arg ) {
                  vFAIL3("Verb pattern '%.*s' has a mandatory argument",
                      verb_len, start_verb);
@@ -10832,45 +10901,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
             nextchar(pRExC_state);
             return ret;
          }
-        else if (*RExC_parse == '+') { /* (+...) */
-            RExC_parse++;
-
-            if (has_intervening_patws) {
-                /* XXX Note that a potential gotcha is that outside of /x '( +
-                 * ...)' means to match a space at least once ...   This is a
-                 * problem elsewhere too */
-                vFAIL("In '(+...)', the '(' and '+' must be adjacent");
-            }
-
-            if (! memBEGINPs(RExC_parse, (STRLEN) (RExC_end - RExC_parse),
-                             "script_run:"))
-            {
-                RExC_parse += strcspn(RExC_parse, ":)");
-                vFAIL("Unknown (+ pattern");
-            }
-            else {
-
-                /* This indicates Unicode rules. */
-                REQUIRE_UNI_RULES(flagp, NULL);
-
-                RExC_parse += sizeof("script_run:") - 1;
-
-                if (PASS2) {
-                    Perl_ck_warner_d(aTHX_
-                        packWARN(WARN_EXPERIMENTAL__SCRIPT_RUN),
-                        "The script_run feature is experimental"
-                        REPORT_LOCATION, REPORT_LOCATION_ARGS(RExC_parse));
-                }
-
-                ret = reg(pRExC_state, 's', &flags, depth+1);
-                if (flags & (RESTART_PASS1|NEED_UTF8)) {
-                    *flagp = flags & (RESTART_PASS1|NEED_UTF8);
-                    return NULL;
-                }
-
-                return ret;
-            }
-        }
          else if (*RExC_parse == '?') { /* (?...) */
             bool is_logical = 0;
             const char * const seqstart = RExC_parse;
@@ -11476,7 +11506,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
              paren = ':';
             ret = NULL;
         }
-       }
+        }
      }
      else                        /* ! paren */
         ret = NULL;
diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t

index ad18de0..aff5535 100644 (file)
--- a/t/re/reg_mesg.t
+++ b/t/re/reg_mesg.t
@@ -284,9 +284,16 @@ my @death =
   'm/\cß/' => "Character following \"\\c\" must be printable ASCII",
   '/((?# This is a comment in the middle of a token)?:foo)/' => 'In \'(?...)\', the \'(\' and \'?\' must be adjacent {#} m/((?# This is a comment in the middle of a token)?{#}:foo)/',
   '/((?# This is a comment in the middle of a token)*FAIL)/' => 'In \'(*VERB...)\', the \'(\' and \'*\' must be adjacent {#} m/((?# This is a comment in the middle of a token)*{#}FAIL)/',
- '/((?# This is a comment in the middle of a token)+script_run:foo)/' => 'In \'(+...)\', the \'(\' and \'+\' must be adjacent {#} m/((?# This is a comment in the middle of a token)+{#}script_run:foo)/',
-
- '/(+script_runfoo)/' => 'Unknown (+ pattern {#} m/(+script_runfoo{#})/',
+ '/((?# This is a comment in the middle of a token)*script_run:foo)/' => 'In \'(*...)\', the \'(\' and \'*\' must be adjacent {#} m/((?# This is a comment in the middle of a token)*{#}script_run:foo)/',
+
+ '/(*script_runfoo)/' => 'Unknown \'(*...)\' construct \'script_runfoo\' {#} m/(*script_runfoo){#}/',
+ '/(*srfoo)/' => 'Unknown \'(*...)\' construct \'srfoo\' {#} m/(*srfoo){#}/',
+ '/(*script_run)/' => '\'(*script_run\' requires a terminating \':\' {#} m/(*script_run{#})/',
+ '/(*sr)/' => '\'(*sr\' requires a terminating \':\' {#} m/(*sr{#})/',
+ '/(*script_run/' => 'Unterminated \'(*...\' construct {#} m/(*script_run{#}/',
+ '/(*sr/' => 'Unterminated \'(*...\' construct {#} m/(*sr{#}/',
+ '/(*script_run:foo/' => 'Unterminated \'(*...\' argument {#} m/(*script_run:foo{#}/',
+ '/(*sr:foo/' => 'Unterminated \'(*...\' argument {#} m/(*sr:foo{#}/',
   '/(?[\ &!])/' => 'Incomplete expression within \'(?[ ])\' {#} m/(?[\ &!{#}])/',    # [perl #126180]
   '/(?[\ +!])/' => 'Incomplete expression within \'(?[ ])\' {#} m/(?[\ +!{#}])/',    # [perl #126180]
   '/(?[\ -!])/' => 'Incomplete expression within \'(?[ ])\' {#} m/(?[\ -!{#}])/',    # [perl #126180]
@@ -461,7 +468,7 @@ my @death_utf8 = mark_as_utf8(
   '/ネ(?[ \t ]/' => "Unexpected ']' with no following ')' in (?[... {#} m/ネ(?[ \\t ]{#}/",
   '/(?[ \t + \e # ネ This was supposed to be a comment ])/' =>
      "Syntax error in (?[...]) {#} m/(?[ \\t + \\e # ネ This was supposed to be a comment ]){#}/",
- 'm/(*ネ)ネ/' => q<Unknown verb pattern 'ネ' {#} m/(*ネ){#}ネ/>,
+ 'm/(*ネ)ネ/' => q<Unknown '(*...)' construct 'ネ' {#} m/(*ネ){#}ネ/>,
   '/\cネ/' => "Character following \"\\c\" must be printable ASCII",
   '/\b{ネ}/' => "'ネ' is an unknown bound type {#} m/\\b{ネ{#}}/",
   '/\B{ネ}/' => "'ネ' is an unknown bound type {#} m/\\B{ネ{#}}/",
@@ -668,9 +675,9 @@ my @experimental_regex_sets = (
  );
  
  my @experimental_script_run = (
-    '/(+script_run:paypal.com)/' => 'The script_run feature is experimental {#} m/(+script_run:{#}paypal.com)/',
-    'use utf8; /utf8 ネ (+script_run:ネ)/' => do { use utf8; 'The script_run feature is experimental {#} m/utf8 ネ (+script_run:{#}ネ)/' },
-    '/noutf8 ネ (+script_run:ネ)/' => 'The script_run feature is experimental {#} m/noutf8 ネ (+script_run:{#}ネ)/',
+    '/(*script_run:paypal.com)/' => 'The script_run feature is experimental {#} m/(*script_run:{#}paypal.com)/',
+    'use utf8; /utf8 ネ (*script_run:ネ)/' => do { use utf8; 'The script_run feature is experimental {#} m/utf8 ネ (*script_run:{#}ネ)/' },
+    '/noutf8 ネ (*script_run:ネ)/' => 'The script_run feature is experimental {#} m/noutf8 ネ (*script_run:{#}ネ)/',
  );
  
  my @deprecated = (
diff --git a/t/re/script_run.t b/t/re/script_run.t

index 4878f39..8c91602 100644 (file)
--- a/t/re/script_run.t
+++ b/t/re/script_run.t
@@ -17,7 +17,7 @@ $|=1;
  
  no warnings "experimental::script_run";
  
-my $script_run = qr/ ^ (+script_run: .* ) $ /x;
+my $script_run = qr/ ^ (*script_run: .* ) $ /x;
  
  unlike("\N{CYRILLIC SMALL LETTER ER}\N{CYRILLIC SMALL LETTER A}\N{CYRILLIC SMALL LETTER U}}\N{CYRILLIC SMALL LETTER ER}\N{CYRILLIC SMALL LETTER A}l", $script_run, "Cyrillic 'paypal' with a Latin 'l' is not a script run");
  unlike("A\N{GREEK CAPITAL LETTER GAMMA}", $script_run, "Latin followed by Greek isn't a script run");
author	Karl Williamson <khw@cpan.org>
	Mon, 19 Feb 2018 03:28:34 +0000 (20:28 -0700)
committer	Karl Williamson <khw@cpan.org>
	Mon, 19 Feb 2018 05:00:33 +0000 (22:00 -0700)
pod/perldelta.pod		patch \| blob \| blame \| history
pod/perldiag.pod		patch \| blob \| blame \| history
pod/perlre.pod		patch \| blob \| blame \| history
regcomp.c		patch \| blob \| blame \| history
t/re/reg_mesg.t		patch \| blob \| blame \| history
t/re/script_run.t		patch \| blob \| blame \| history