Allow suffix form for /a /d /l /u

author Karl Williamson <public@khwilliamson.com>

Sat, 19 Feb 2011 22:56:29 +0000 (15:56 -0700)

committer Karl Williamson <public@khwilliamson.com>

Sun, 20 Feb 2011 05:18:56 +0000 (22:18 -0700)
author Karl Williamson <public@khwilliamson.com>
Sat, 19 Feb 2011 22:56:29 +0000 (15:56 -0700)
committer Karl Williamson <public@khwilliamson.com>
Sun, 20 Feb 2011 05:18:56 +0000 (22:18 -0700)
diff --git a/pod/perldelta.pod b/pod/perldelta.pod

index f0677c7..fdc6df1 100644 (file)
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -28,6 +28,24 @@ here, but most should go in the L</Performance Enhancements> section.
  
  [ List each enhancement as a =head2 entry ]
  
+=head2 The new regular expression modifiers available in suffix form
+
+Various releases of the 5.13.x series have added new regular expression
+modifiers, C</a>, C</d>, C</l>, and C</u>.  They were only available in
+infix form (e.g., C<(?a:...)>) until this release; now they are usable
+in suffix form.  This change was made too late to change all the
+affected documentation, so there are a number of places that erroneously
+say these must be used in infix form.
+
+However, there is an ambiguity with the construct, C<s/foo/bar/le...>.  Due
+to backward compatibility constraints, in Perl 5.14 only, it will be
+resolved as C<s/foo/bar/ le...>, that is, as meaning to take the result
+of the substitution, and see if it is stringwise less-than-or-equal-to
+what follows. In Perl 5.16 and later, it will instead be resolved as
+meaning to do the pattern match using the rules of the current locale,
+and evaluate the rhs as an expression when doing the substitution.  In
+5.14, if you want the latter interpretation, you can write "el" instead.
+
  =head2 Add C<\p{Titlecase}> as a synonym for C<\p{Title}>
  
  This synonym is added for symmetry with the Unicode property names
@@ -41,9 +59,7 @@ non-ASCII character.  For example, normally,
  
      'k' =~ /\N{KELVIN SIGN}/
  
-will match; it won't under C</aa>.  Note that like C</a>, C</aa>
-in 5.14 will not actually be able to be used as a suffix at the end of a
-regular expression.
+will match; it won't under C</aa>.
  
  =head2 New warnings categories for problematic (non-)Unicode code points.
  
diff --git a/pod/perldiag.pod b/pod/perldiag.pod

index 614276a..2d9a9ac 100644 (file)
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -146,6 +146,18 @@ string C<"-foo">, or a call to the function C<foo>, negated.  If you meant
  the string, just write C<"-foo">.  If you meant the function call,
  write C<-foo()>.
  
+=item Ambiguous use of 's//le...' resolved as 's// le...'; Rewrite as 's//el' if you meant 'use locale rules and evaluate rhs as an expression'.  In Perl 5.16, it will be resolved the other way
+
+(W deprecated, ambiguous)  You wrote a pattern match with substitution
+immediately followed by "le".  In Perl 5.14 and earlier, this is
+resolved as meaning to take the result of the substitution, and see if
+it is stringwise less-than-or-equal-to what follows in the expression.
+Having the "le" immediately following a pattern is deprecated behavior,
+so in Perl 5.16, this expression will be resolved as meaning to do the
+pattern match using the rules of the current locale, and evaluate the
+rhs as an expression when doing the substitution.  In 5.14, if you want
+the latter interpretation, you can simply write "el" instead.
+
  =item '|' and '<' may not both be specified on command line
  
  (F) An error peculiar to VMS.  Perl does its own command line
diff --git a/regexp.h b/regexp.h

index 92c9cce..c4fa609 100644 (file)
--- a/regexp.h
+++ b/regexp.h
@@ -233,7 +233,7 @@ and check for NULL.
      case SINGLE_PAT_MOD:    *(pmfl) |= RXf_PMf_SINGLELINE; break;   \
      case XTENDED_PAT_MOD:   *(pmfl) |= RXf_PMf_EXTENDED;   break
  
-/* Note, includes locale, unicode */
+/* Note, includes charset ones, assumes 0 is the default for them */
  #define STD_PMMOD_FLAGS_CLEAR(pmfl)                        \
      *(pmfl) &= ~(RXf_PMf_FOLD|RXf_PMf_MULTILINE|RXf_PMf_SINGLELINE|RXf_PMf_EXTENDED|RXf_PMf_CHARSET)
  
@@ -276,13 +276,15 @@ and check for NULL.
   * character is bit +1, etc. */
  #define STD_PAT_MODS        "msix"
  
+#define CHARSET_PAT_MODS    ASCII_RESTRICT_PAT_MODS DEPENDS_PAT_MODS LOCALE_PAT_MODS UNICODE_PAT_MODS
+
  /* This string is expected by XS_re_regexp_pattern() in universal.c to be ordered
   * so that the first character is the flag in bit RXf_PMf_STD_PMMOD_SHIFT of
   * extflags; the next character is in bit +1, etc. */
  #define INT_PAT_MODS    STD_PAT_MODS    KEEPCOPY_PAT_MODS
  
  #define EXT_PAT_MODS    ONCE_PAT_MODS   KEEPCOPY_PAT_MODS
-#define QR_PAT_MODS     STD_PAT_MODS    EXT_PAT_MODS
+#define QR_PAT_MODS     STD_PAT_MODS    EXT_PAT_MODS      CHARSET_PAT_MODS
  #define M_PAT_MODS      QR_PAT_MODS     LOOP_PAT_MODS
  #define S_PAT_MODS      M_PAT_MODS      EXEC_PAT_MODS      NONDESTRUCT_PAT_MODS
  
diff --git a/t/op/eval.t b/t/op/eval.t

index 5ef3009..f0fa0f2 100644 (file)
--- a/t/op/eval.t
+++ b/t/op/eval.t
@@ -462,7 +462,7 @@ print "ok $test - eval and last\n"; $test++;
  
  {
      no warnings;
-    eval "/ /a;";
+    eval "/ /b;";
      print "not " unless $@ =~ /^syntax error/;
      print "ok $test # eval syntax error, no warnings \n"; $test++;
  }
diff --git a/t/re/re.t b/t/re/re.t

index 67c2181..cf6cdff 100644 (file)
--- a/t/re/re.t
+++ b/t/re/re.t
@@ -55,6 +55,30 @@ if ('1234'=~/(?:(?<A>\d)|(?<C>!))(?<B>\d)(?<A>\d)(?<B>\d)/){
  }
  
  {
+    my ($pat, $mods);
+    $|=1;
+
+    my $re = qr/a/d;
+    ($pat, $mods) = regexp_pattern($re);
+    is($mods, "", "Verify /d results in default mod");
+    $re = qr/a/u;
+    ($pat, $mods) = regexp_pattern($re);
+    is($mods, "u", "Verify /u is understood");
+    $re = qr/a/l;
+    ($pat, $mods) = regexp_pattern($re);
+    is($mods, "l", "Verify /l is understood");
+    $re = qr/a/a;
+    ($pat, $mods) = regexp_pattern($re);
+    is($mods, "a", "Verify /a is understood");
+    $re = qr/a/aa;
+    ($pat, $mods) = regexp_pattern($re);
+    is($mods, "aa", "Verify /aa is understood");
+    diag($mods);
+    $pat = regexp_pattern($re);
+    diag($pat);
+}
+
+{
      # tests for new regexp flags
      my $text = "\xE4";
      my $check;
@@ -110,5 +134,5 @@ if ('1234'=~/(?:(?<A>\d)|(?<C>!))(?<B>\d)(?<A>\d)(?<B>\d)/){
      }
  
  # New tests above this line, don't forget to update the test count below!
-BEGIN { plan tests => 28 }
+BEGIN { plan tests => 33 }
  # No tests here!
diff --git a/toke.c b/toke.c

index a4a279f..ddd50cf 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -8765,24 +8765,80 @@ S_pmflag(pTHX_ const char* const valid_flags, U32 * pmfl, char** s) {
       * otherwise FALSE */
  
      const char c = **s;
+
      if (! strchr(valid_flags, c)) {
          if (isALNUM(c)) {
-            Perl_ck_warner_d(aTHX_ packWARN(WARN_SYNTAX),
-           "Having no space between pattern and following word is deprecated");
+           goto deprecate;
          }
          return FALSE;
      }
  
      switch (c) {
+
          CASE_STD_PMMOD_FLAGS_PARSE_SET(pmfl);
          case GLOBAL_PAT_MOD:      *pmfl |= PMf_GLOBAL; break;
          case CONTINUE_PAT_MOD:    *pmfl |= PMf_CONTINUE; break;
          case ONCE_PAT_MOD:        *pmfl |= PMf_KEEP; break;
          case KEEPCOPY_PAT_MOD:    *pmfl |= RXf_PMf_KEEPCOPY; break;
          case NONDESTRUCT_PAT_MOD: *pmfl |= PMf_NONDESTRUCT; break;
+       case LOCALE_PAT_MOD:
+
+           /* In 5.14, qr//lt is legal but deprecated; the 't' means they
+            * can't be regex modifiers.
+            * In 5.14, s///le is legal and ambiguous.  Try to disambiguate as
+            * much as easily done.  s///lei, for example, has to mean regex
+            * modifiers if it's not an error (as does any word character
+            * following the 'e').  Otherwise, we resolve to the backwards-
+            * compatible, but less likely 's/// le ...', i.e. as meaning
+            * less-than-or-equal.  The reason it's not likely is that s//
+            * returns a number, and so '<=' should be used for comparing, not
+            * 'le'. */
+           if (*((*s) + 1) == 't') {
+               goto deprecate;
+           }
+           else if (*((*s) + 1) == 'e' && ! isALNUM(*((*s) + 2)))
+           {
+               Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
+                   "Ambiguous use of 's//le...' resolved as 's// le...'; Rewrite as 's//el' if you meant 'use locale rules and evaluate rhs as an expression'.  In Perl 5.16, it will be resolved the other way");
+               return FALSE;
+           }
+           set_regex_charset(pmfl, REGEX_LOCALE_CHARSET);
+           break;
+       case UNICODE_PAT_MOD:
+           /* In 5.14, qr//unless and qr//until are legal but deprecated; the
+            * 'n' means they can't be regex modifiers */
+           if (*((*s) + 1) == 'n') {
+               goto deprecate;
+           }
+           set_regex_charset(pmfl, REGEX_UNICODE_CHARSET);
+           break;
+       case ASCII_RESTRICT_PAT_MOD:
+           /* In 5.14, qr//and is legal but deprecated; the 'n' means they
+            * can't be regex modifiers */
+           if (*((*s) + 1) == 'n') {
+               goto deprecate;
+           }
+           if (*((*s) + 1) == ASCII_RESTRICT_PAT_MOD) {
+               /* Doubled modifier implies more restricted */
+               set_regex_charset(pmfl, REGEX_ASCII_MORE_RESTRICTED_CHARSET);
+               (*s)++;
+           }
+           else {
+               set_regex_charset(pmfl, REGEX_ASCII_RESTRICTED_CHARSET);
+           }
+           break;
+       case DEPENDS_PAT_MOD:
+           set_regex_charset(pmfl, REGEX_DEPENDS_CHARSET);
+           break;
      }
+
      (*s)++;
      return TRUE;
+
+    deprecate:
+       Perl_ck_warner_d(aTHX_ packWARN(WARN_SYNTAX),
+           "Having no space between pattern and following word is deprecated");
+        return FALSE;
  }
  
  STATIC char *
author	Karl Williamson <public@khwilliamson.com>
	Sat, 19 Feb 2011 22:56:29 +0000 (15:56 -0700)
committer	Karl Williamson <public@khwilliamson.com>
	Sun, 20 Feb 2011 05:18:56 +0000 (22:18 -0700)
pod/perldelta.pod		patch \| blob \| blame \| history
pod/perldiag.pod		patch \| blob \| blame \| history
regexp.h		patch \| blob \| blame \| history
t/op/eval.t		patch \| blob \| blame \| history
t/re/re.t		patch \| blob \| blame \| history
toke.c		patch \| blob \| blame \| history