perl #77654: quotemeta quotes non-ASCII consistently

author Karl Williamson <public@khwilliamson.com>

Wed, 15 Feb 2012 18:31:27 +0000 (11:31 -0700)

committer Karl Williamson <public@khwilliamson.com>

Thu, 16 Feb 2012 01:02:35 +0000 (18:02 -0700)
author Karl Williamson <public@khwilliamson.com>
Wed, 15 Feb 2012 18:31:27 +0000 (11:31 -0700)
committer Karl Williamson <public@khwilliamson.com>
Thu, 16 Feb 2012 01:02:35 +0000 (18:02 -0700)
diff --git a/embed.fnc b/embed.fnc

index 34aa251..a7e004f 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -608,6 +608,7 @@ p   |UV     |_to_fold_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const bool flags
  #endif
  #if defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C)
  p      |UV     |_to_upper_title_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const char S_or_s
+ApR    |bool   |_is_utf8_quotemeta|NN const U8 *p
  #endif
  Ap     |UV     |to_uni_lower   |UV c|NN U8 *p|NN STRLEN *lenp
  Amp    |UV     |to_uni_fold    |UV c|NN U8 *p|NN STRLEN *lenp
diff --git a/embed.h b/embed.h

index d429c8d..541309e 100644 (file)
--- a/embed.h
+++ b/embed.h
@@ -781,6 +781,9 @@
  #define warn_nocontext         Perl_warn_nocontext
  #define warner_nocontext       Perl_warner_nocontext
  #endif
+#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C)
+#define _is_utf8_quotemeta(a)  Perl__is_utf8_quotemeta(aTHX_ a)
+#endif
  #if defined(PERL_MAD)
  #define newFORM(a,b,c)         Perl_newFORM(aTHX_ a,b,c)
  #define newMYSUB(a,b,c,d,e)    Perl_newMYSUB(aTHX_ a,b,c,d,e)
diff --git a/embedvar.h b/embedvar.h

index bfb88e0..81d28e7 100644 (file)
--- a/embedvar.h
+++ b/embedvar.h
@@ -378,6 +378,7 @@
  #define PL_utf8_perl_idstart   (vTHX->Iutf8_perl_idstart)
  #define PL_utf8_print          (vTHX->Iutf8_print)
  #define PL_utf8_punct          (vTHX->Iutf8_punct)
+#define PL_utf8_quotemeta      (vTHX->Iutf8_quotemeta)
  #define PL_utf8_space          (vTHX->Iutf8_space)
  #define PL_utf8_tofold         (vTHX->Iutf8_tofold)
  #define PL_utf8_tolower                (vTHX->Iutf8_tolower)
diff --git a/intrpvar.h b/intrpvar.h

index 473c952..273da46 100644 (file)
--- a/intrpvar.h
+++ b/intrpvar.h
@@ -641,6 +641,7 @@ PERLVAR(I, utf8_toupper, SV *)
  PERLVAR(I, utf8_totitle, SV *)
  PERLVAR(I, utf8_tolower, SV *)
  PERLVAR(I, utf8_tofold,        SV *)
+PERLVAR(I, utf8_quotemeta, SV *)
  PERLVAR(I, last_swash_hv, HV *)
  PERLVAR(I, last_swash_tmps, U8 *)
  PERLVAR(I, last_swash_slen, STRLEN)
diff --git a/lib/feature.pm b/lib/feature.pm

index 399303a..fe88c8c 100644 (file)
--- a/lib/feature.pm
+++ b/lib/feature.pm
@@ -145,8 +145,8 @@ L<perlunicode/The "Unicode Bug"> for details.)  For this reason, if you are
  potentially using Unicode in your program, the
  C<use feature 'unicode_strings'> subpragma is B<strongly> recommended.
  
-This feature is available starting with Perl 5.12, but was not fully
-implemented until Perl 5.14.
+This feature is available starting with Perl 5.12; was almost fully
+implemented in Perl 5.14; and extended in Perl 5.16 to cover C<quotemeta>.
  
  =head2 The 'unicode_eval' and 'evalbytes' features
  
diff --git a/pod/perldelta.pod b/pod/perldelta.pod

index bb40dc7..b316025 100644 (file)
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -226,6 +226,14 @@ cached version of it.
  
  See the documentation for L<$$|perlvar/$$> for details.
  
+=head2 Which Non-ASCII characters get quoted by C<quotemeta> and C<\Q> has changed
+
+This is unlikely to result in a real problem, as Perl does not attach
+special meaning to any non-ASCII character, so it is currently
+irrelevant which are quoted or not.  This change fixes bug [perl #77654] and
+brings Perl's behavior more into line with Unicode's recommendations.
+See L<perlfunc/quotemeta>.
+
  =head1 Deprecations
  
  XXX Any deprecated features, syntax, modules etc. should be listed here.
@@ -730,6 +738,16 @@ bracketed character class in a regular expression that consisted solely
  of a Unicode property, that property wasn't getting inverted outside the
  Latin1 range.
  
+=item *
+
+C<quotemeta> now quotes consistently the same non-ASCII characters under
+C<use feature 'unicode_strings'>, regardless of whether the string is
+encoded in UTF-8 or not, hence fixing the last vestiges (we hope) of the
+infamous L<perlunicode/The "Unicode Bug">.  [perl #77654].
+
+Which of these code points is quoted has changed, based on Unicode's
+recommendations.  See L<perlfunc/quotemeta> for details.
+
  =back
  
  =head1 Known Problems
diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod

index 2c8a6bf..7cec3bb 100644 (file)
--- a/pod/perlfunc.pod
+++ b/pod/perlfunc.pod
@@ -4964,8 +4964,52 @@ input from the user, quotemeta() or C<\Q> must be used.
  
  In Perl v5.14, all non-ASCII characters are quoted in non-UTF-8-encoded
  strings, but not quoted in UTF-8 strings.
-It is planned to change this behavior in v5.16, but the exact rules
-haven't been determined yet.
+
+Starting in Perl v5.16, Perl adopted a Unicode-defined strategy for
+quoting non-ASCII characters; the quoting of ASCII characters is
+unchanged.
+
+Also unchanged is the quoting of non-UTF-8 strings when outside the
+scope of a C<use feature 'unicode_strings'>, which is to quote all
+characters in the upper Latin1 range.  This provides complete backwards
+compatibility for old programs which do not use Unicode.  (Note that
+C<unicode_strings> is automatically enabled within the scope of a
+S<C<use v5.12>> or greater.)
+
+Otherwise, Perl quotes non-ASCII characters using an adaptation from
+Unicode (see L<http://www.unicode.org/reports/tr31/>).
+The only code points that are quoted are those that have any of the
+Unicode properties:  Pattern_Syntax, Pattern_White_Space, White_Space,
+Default_Ignorable_Code_Point, or General_Category=Control.
+
+Of these properties, the two important ones are Pattern_Syntax and
+Pattern_White_Space.  They have been set up by Unicode for exactly this
+purpose of deciding which characters in a regular expression pattern
+should be quoted.  No character that can be in an identifier has these
+properties.
+
+Perl promises that if we ever add regular expression pattern
+metacharacters to the dozen already defined
+(C<\ E<verbar> ( ) [ { ^ $ * + ? .>), we will only use ones that have the
+Pattern_Syntax property.  Perl also promises that if we ever add
+characters that are considered to be white space in regular expressions
+(currently mostly affected by C</x>), they will all have the
+Pattern_White_Space property.
+
+Unicode promises that the set of code points that have these two
+properties will never change, so something that is not quoted in v5.16
+will never need to be quoted in any future Perl release.  (Not all the
+code points that match Pattern_Syntax have actually had characters
+assigned to them; so there is room to grow, but they are quoted
+whether assigned or not.  Perl, of course, would never use an
+unassigned code point as an actual metacharacter.)
+
+Quoting characters that have the other 3 properties is done to enhance
+the readability of the regular expression and not because they actually
+need to be quoted for regular expression purposes (characters with the
+White_Space property are likely to be indistinguishable on the page or
+screen from those with the Pattern_White_Space property; and the other
+two properties contain non-printing characters).
  
  =item rand EXPR
  X<rand> X<random>
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod

index 4142343..b96efbf 100644 (file)
--- a/pod/perlunicode.pod
+++ b/pod/perlunicode.pod
@@ -1371,49 +1371,69 @@ readdir, readlink
  
  =head2 The "Unicode Bug"
  
-The term, the "Unicode bug" has been applied to an inconsistency
+The term "Unicode bug" has been applied to an inconsistency
  on ASCII platforms with the
  Unicode code points in the Latin-1 Supplement block, that
  is, between 128 and 255.  Without a locale specified, unlike all other
  characters or code points, these characters have very different semantics in
  byte semantics versus character semantics, unless
-C<use feature 'unicode_strings'> is specified.
-(The lesson here is to specify C<unicode_strings> to avoid the
-headaches.)
+C<use feature 'unicode_strings'> is specified, directly or indirectly.
+(It is indirectly specified by a C<use v5.12> or higher.)
  
-In character semantics they are interpreted as Unicode code points, which means
+In character semantics these upper-Latin1 characters are interpreted as
+Unicode code points, which means
  they have the same semantics as Latin-1 (ISO-8859-1).
  
-In byte semantics, they are considered to be unassigned characters, meaning
-that the only semantics they have is their ordinal numbers, and that they are
+In byte semantics (without C<unicode_strings>), they are considered to
+be unassigned characters, meaning that the only semantics they have is
+their ordinal numbers, and that they are
  not members of various character classes.  None are considered to match C<\w>
  for example, but all match C<\W>.
  
-The behavior is known to have effects on these areas:
+Perl 5.12.0 added C<unicode_strings> to force character semantics on
+these code points in some circumstances, which fixed portions of the
+bug; Perl 5.14.0 fixed almost all of it; and Perl 5.16.0 fixed the
+remainder (so far as we know, anyway).  The lesson here is to enable
+C<unicode_strings> to avoid the headaches described below.
+
+The old, problematic behavior affects these areas:
  
  =over 4
  
  =item *
  
  Changing the case of a scalar, that is, using C<uc()>, C<ucfirst()>, C<lc()>,
-and C<lcfirst()>, or C<\L>, C<\U>, C<\u> and C<\l> in regular expression
-substitutions.
+and C<lcfirst()>, or C<\L>, C<\U>, C<\u> and C<\l> in double-quotish
+contexts, such as regular expression substitutions.
+Under C<unicode_strings> starting in Perl 5.12.0, character semantics are
+generally used.  See L<perlfunc/lc> for details on how this works
+in combination with various other pragmas.
  
  =item *
  
-Using caseless (C</i>) regular expression matching
+Using caseless (C</i>) regular expression matching.
+Starting in Perl 5.14.0, regular expressions compiled within
+the scope of C<unicode_strings> use character semantics
+even when executed or compiled into larger
+regular expressions outside the scope.
  
  =item *
  
  Matching any of several properties in regular expressions, namely C<\b>,
  C<\B>, C<\s>, C<\S>, C<\w>, C<\W>, and all the Posix character classes
  I<except> C<[[:ascii:]]>.
+Starting in Perl 5.14.0, regular expressions compiled within
+the scope of C<unicode_strings> use character semantics
+even when executed or compiled into larger
+regular expressions outside the scope.
  
  =item *
  
  In C<quotemeta> or its inline equivalent C<\Q>, no code points above 127
  are quoted in UTF-8 encoded strings, but in byte encoded strings, code
  points between 128-255 are always quoted.
+Starting in Perl 5.16.0, consistent quoting rules are used within the
+scope of C<unicode_strings>, as described in L<perlfunc/quotemeta>.
  
  =back
  
@@ -1442,21 +1462,9 @@ ASCII range (except in a locale), along with Perl's desire to add Unicode
  support seamlessly.  The result wasn't seamless: these characters were
  orphaned.
  
-Starting in Perl 5.14, C<use feature 'unicode_strings'> can be used to
-cause Perl to use Unicode semantics on all string operations within the
-scope of the feature subpragma.  Regular expressions compiled in its
-scope retain that behavior even when executed or compiled into larger
-regular expressions outside the scope.  (The pragma does not, however,
-affect the C<quotemeta> behavior.  Nor does it affect the deprecated
-user-defined case changing operations--these still require a UTF-8
-encoded string to operate.)
-
-In Perl 5.12, the subpragma affected casing changes, but not regular
-expressions.  See L<perlfunc/lc> for details on how this pragma works in
-combination with various others for casing.
-
-For earlier Perls, or when a string is passed to a function outside the
-subpragma's scope, a workaround is to always call C<utf8::upgrade($string)>,
+For Perls earlier than those described above, or when a string is passed
+to a function outside the subpragma's scope, a workaround is to always
+call C<utf8::upgrade($string)>,
  or to use the standard module L<Encode>.   Also, a scalar that has any characters
  whose ordinal is above 0x100, or which were specified using either of the
  C<\N{...}> notations, will automatically have character semantics.
diff --git a/pod/perluniintro.pod b/pod/perluniintro.pod

index 63e2119..8ce4b7b 100644 (file)
--- a/pod/perluniintro.pod
+++ b/pod/perluniintro.pod
@@ -152,7 +152,8 @@ problems of the initial Unicode implementation, but for example
  regular expressions still do not work with Unicode in 5.6.1.
  Perl 5.14.0 is the first release where Unicode support is
  (almost) seamlessly integrable without some gotchas (the exception being
-some differences in L<quotemeta|perlfunc/quotemeta>).   To enable this
+some differences in L<quotemeta|perlfunc/quotemeta>, which are fixed
+starting in Perl 5.16.0).   To enable this
  seamless support, you should C<use feature 'unicode_strings'> (which is
  automatically selected if you C<use 5.012> or higher).  See L<feature>.
  (5.14 also fixes a number of bugs and departures from the Unicode
diff --git a/pp.c b/pp.c

index b12772c..93e59fa 100644 (file)
--- a/pp.c
+++ b/pp.c
@@ -4088,24 +4088,45 @@ PP(pp_quotemeta)
         d = SvPVX(TARG);
         if (DO_UTF8(sv)) {
             while (len) {
-               if (UTF8_IS_CONTINUED(*s)) {
                     STRLEN ulen = UTF8SKIP(s);
+               bool to_quote = FALSE;
+
+               if (UTF8_IS_INVARIANT(*s)) {
+                   if (_isQUOTEMETA(*s)) {
+                       to_quote = TRUE;
+                   }
+               }
+               else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
+                   if (_isQUOTEMETA(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1))))
+                   {
+                       to_quote = TRUE;
+                   }
+               }
+               else if (_is_utf8_quotemeta(s)) {
+                   to_quote = TRUE;
+               }
+
+               if (to_quote) {
+                   *d++ = '\\';
+               }
                     if (ulen > len)
                         ulen = len;
                     len -= ulen;
                     while (ulen--)
                         *d++ = *s++;
-               }
-               else {
-                   if (!isALNUM(*s))
-                       *d++ = '\\';
-                   *d++ = *s++;
-                   len--;
-               }
             }
             SvUTF8_on(TARG);
         }
+       else if (IN_UNI_8_BIT) {
+           while (len--) {
+               if (_isQUOTEMETA(*s))
+                   *d++ = '\\';
+               *d++ = *s++;
+           }
+       }
         else {
+           /* For non UNI_8_BIT (and hence in locale) just quote all \W
+            * including everything above ASCII */
             while (len--) {
                 if (!isWORDCHAR_A(*s))
                     *d++ = '\\';
diff --git a/proto.h b/proto.h

index 84bfbf4..f01e7c3 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -7154,6 +7154,12 @@ STATIC U8        S_to_lower_latin1(pTHX_ const U8 c, U8 *p, STRLEN *lenp)
  
  #endif
  #if defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C)
+PERL_CALLCONV bool     Perl__is_utf8_quotemeta(pTHX_ const U8 *p)
+                       __attribute__warn_unused_result__
+                       __attribute__nonnull__(pTHX_1);
+#define PERL_ARGS_ASSERT__IS_UTF8_QUOTEMETA    \
+       assert(p)
+
  PERL_CALLCONV UV       Perl__to_upper_title_latin1(pTHX_ const U8 c, U8 *p, STRLEN *lenp, const char S_or_s)
                         __attribute__nonnull__(pTHX_2)
                         __attribute__nonnull__(pTHX_3);
diff --git a/regen/feature.pl b/regen/feature.pl

index 0a23271..445c8b3 100755 (executable)
--- a/regen/feature.pl
+++ b/regen/feature.pl
@@ -439,8 +439,8 @@ L<perlunicode/The "Unicode Bug"> for details.)  For this reason, if you are
  potentially using Unicode in your program, the
  C<use feature 'unicode_strings'> subpragma is B<strongly> recommended.
  
-This feature is available starting with Perl 5.12, but was not fully
-implemented until Perl 5.14.
+This feature is available starting with Perl 5.12; was almost fully
+implemented in Perl 5.14; and extended in Perl 5.16 to cover C<quotemeta>.
  
  =head2 The 'unicode_eval' and 'evalbytes' features
  
diff --git a/sv.c b/sv.c

index 0a2d58c..aebfe48 100644 (file)
--- a/sv.c
+++ b/sv.c
@@ -13461,6 +13461,7 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags,
      PL_utf8_idcont     = sv_dup_inc(proto_perl->Iutf8_idcont, param);
      PL_utf8_xidcont    = sv_dup_inc(proto_perl->Iutf8_xidcont, param);
      PL_utf8_foldable   = sv_dup_inc(proto_perl->Iutf8_foldable, param);
+    PL_utf8_quotemeta  = sv_dup_inc(proto_perl->Iutf8_quotemeta, param);
      PL_ASCII           = sv_dup_inc(proto_perl->IASCII, param);
      PL_AboveLatin1     = sv_dup_inc(proto_perl->IAboveLatin1, param);
      PL_Latin1          = sv_dup_inc(proto_perl->ILatin1, param);
diff --git a/t/op/quotemeta.t b/t/op/quotemeta.t

index 1415aff..9cec0bd 100644 (file)
--- a/t/op/quotemeta.t
+++ b/t/op/quotemeta.t
@@ -7,7 +7,7 @@ BEGIN {
      require "test.pl";
  }
  
-plan tests => 22;
+plan tests => 40;
  
  if ($Config{ebcdic} eq 'define') {
      $_ = join "", map chr($_), 129..233;
@@ -44,8 +44,45 @@ is("\Q\l\UPe*x*r\El\E*", "pE\\*X\\*Rl*", '\Q\l\UPe*x*r\El\E*');
  is("\U\lPerl\E\E\E\E", "pERL", '\U\lPerl\E\E\E\E');
  is("\l\UPerl\E\E\E\E", "pERL", '\l\UPerl\E\E\E\E');
  
-is(quotemeta("\x{263a}"), "\x{263a}", "quotemeta Unicode");
-is(length(quotemeta("\x{263a}")), 1, "quotemeta Unicode length");
+is(quotemeta("\x{263a}"), "\\\x{263a}", "quotemeta Unicode quoted");
+is(length(quotemeta("\x{263a}")), 2, "quotemeta Unicode quoted length");
+is(quotemeta("\x{100}"), "\x{100}", "quotemeta Unicode nonquoted");
+is(length(quotemeta("\x{100}")), 1, "quotemeta Unicode nonquoted length");
+
+my $char = ":";        # ASCII punctuation: expected quoted once UTF-8 encoded
+utf8::upgrade($char);
+is(quotemeta($char), "\\$char", "quotemeta '$char' in UTF-8");
+is(length(quotemeta($char)), 2, "quotemeta '$char'  in UTF-8 length");
+
+$char = "M";           # ASCII letter: expected to stay unquoted
+utf8::upgrade($char);
+is(quotemeta($char), "$char", "quotemeta '$char' in UTF-8");
+is(length(quotemeta($char)), 1, "quotemeta '$char'  in UTF-8 length");
+
+$char = "\N{U+D7}";    # no second "my": it would mask $char above; expected quoted
+utf8::upgrade($char);
+is(quotemeta($char), "\\$char", "quotemeta '\\N{U+D7}' in UTF-8");
+is(length(quotemeta($char)), 2, "quotemeta '\\N{U+D7}'  in UTF-8 length");
+
+$char = "\N{U+D8}";    # upper-Latin1 code point expected to stay unquoted
+utf8::upgrade($char);
+is(quotemeta($char), "$char", "quotemeta '\\N{U+D8}' in UTF-8");
+is(length(quotemeta($char)), 1, "quotemeta '\\N{U+D8}'  in UTF-8 length");
+
+{
+    no feature 'unicode_strings';
+    is(quotemeta("\x{d7}"), "\\\x{d7}", "quotemeta Latin1 no unicode_strings quoted");
+    is(length(quotemeta("\x{d7}")), 2, "quotemeta Latin1 no unicode_strings quoted length");
+    is(quotemeta("\x{d8}"), "\\\x{d8}", "quotemeta Latin1 no unicode_strings quoted");
+    is(length(quotemeta("\x{d8}")), 2, "quotemeta Latin1 no unicode_strings quoted length");
+}
+{
+    use feature 'unicode_strings';
+    is(quotemeta("\x{d7}"), "\\\x{d7}", "quotemeta Latin1 unicode_strings quoted");
+    is(length(quotemeta("\x{d7}")), 2, "quotemeta Latin1 unicode_strings quoted length");
+    is(quotemeta("\x{d8}"), "\x{d8}", "quotemeta Latin1 unicode_strings nonquoted");
+    is(length(quotemeta("\x{d8}")), 1, "quotemeta Latin1 unicode_strings nonquoted length");
+}
  
  $a = "foo|bar";
  is("a\Q\Ec$a", "acfoo|bar", '\Q\E');
diff --git a/utf8.c b/utf8.c

index 2a5d93e..2b1e99b 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -2029,6 +2029,18 @@ Perl_is_utf8_X_LV_LVT_V(pTHX_ const U8 *p)
      return is_utf8_common(p, &PL_utf8_X_LV_LVT_V, "_X_LV_LVT_V");
  }
  
+bool
+Perl__is_utf8_quotemeta(pTHX_ const U8 *p)
+{
+    /* For exclusive use of pp_quotemeta(): returns is_utf8_common()'s verdict for p on "_Perl_Quotemeta" */
+
+    dVAR;
+
+    PERL_ARGS_ASSERT__IS_UTF8_QUOTEMETA;    /* p must be non-NULL */
+
+    return is_utf8_common(p, &PL_utf8_quotemeta, "_Perl_Quotemeta");    /* NOTE(review): PL_utf8_quotemeta presumably caches the loaded property -- confirm */
+}
+
  /*
  =for apidoc to_utf8_case
author	Karl Williamson <public@khwilliamson.com>
	Wed, 15 Feb 2012 18:31:27 +0000 (11:31 -0700)
committer	Karl Williamson <public@khwilliamson.com>
	Thu, 16 Feb 2012 01:02:35 +0000 (18:02 -0700)
embed.fnc		patch \| blob \| blame \| history
embed.h		patch \| blob \| blame \| history
embedvar.h		patch \| blob \| blame \| history
intrpvar.h		patch \| blob \| blame \| history
lib/feature.pm		patch \| blob \| blame \| history
pod/perldelta.pod		patch \| blob \| blame \| history
pod/perlfunc.pod		patch \| blob \| blame \| history
pod/perlunicode.pod		patch \| blob \| blame \| history
pod/perluniintro.pod		patch \| blob \| blame \| history
pp.c		patch \| blob \| blame \| history
proto.h		patch \| blob \| blame \| history
regen/feature.pl		patch \| blob \| blame \| history
sv.c		patch \| blob \| blame \| history
t/op/quotemeta.t		patch \| blob \| blame \| history
utf8.c		patch \| blob \| blame \| history