Deprecate Unicode code points above IV_MAX

author Karl Williamson <khw@cpan.org>

Sat, 28 Nov 2015 18:49:43 +0000 (11:49 -0700)

committer Karl Williamson <khw@cpan.org>

Sun, 29 Nov 2015 00:19:27 +0000 (17:19 -0700)
author Karl Williamson <khw@cpan.org>
Sat, 28 Nov 2015 18:49:43 +0000 (11:49 -0700)
committer Karl Williamson <khw@cpan.org>
Sun, 29 Nov 2015 00:19:27 +0000 (17:19 -0700)
diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t

index 61a3ff8..1e1c984 100644 (file)
--- a/ext/XS-APItest/t/utf8.t
+++ b/ext/XS-APItest/t/utf8.t
@@ -4,6 +4,9 @@ use strict;
  use Test::More;
  $|=1;
  
+no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit
+                          # machines, and that is tested elsewhere
+
  use XS::APItest;
  
  my $pound_sign = chr utf8::unicode_to_native(163);
diff --git a/pod/perldiag.pod b/pod/perldiag.pod

index 38f1350..5174373 100644 (file)
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -3013,6 +3013,20 @@ an anonymous subroutine, or a reference to a subroutine.
  (W overload) You tried to overload a constant type the overload package is
  unaware of.
  
+=item It is deprecated to use code point 0x%s; the permissible max is 0x%s
+
+(D deprecated) You used a code point that will not be allowed in a
+future perl version, because it is too large.  Unicode only allows code
+points up to 0x10FFFF, but Perl allows much larger ones.  However, the
+largest possible ones break the perl interpreter in some constructs,
+including causing it to hang in a few cases.  The known problem areas
+are in C<tr///>, regular expression pattern matching using quantifiers,
+and as the upper limits in loops.
+
+If your code is to run on various platforms, keep in mind that the upper
+limit depends on the platform.  It is much larger on 64-bit word sizes
+than on 32-bit ones.
+
  =item -i used with no filenames on the command line, reading from STDIN
  
  (S inplace) The C<-i> option was passed on the command line, indicating
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod

index aa0fdca..a652d8d 100644 (file)
--- a/pod/perlunicode.pod
+++ b/pod/perlunicode.pod
@@ -1289,7 +1289,10 @@ encoding of numbers up to C<0x7FFF_FFFF>.  Perl continues to allow those,
  and has extended that up to 13 bytes to encode code points up to what
  can fit in a 64-bit word.  However, Perl will warn if you output any of
  these as being non-portable; and under strict UTF-8 input protocols,
-they are forbidden.
+they are forbidden.  In addition, it is deprecated to use a code point
+larger than what a signed integer variable on your system can hold.  On
+32-bit ASCII systems, this means C<0x7FFF_FFFF> is the legal maximum
+going forward (much higher on 64-bit systems).
  
  =item *
  
diff --git a/t/lib/warnings/utf8 b/t/lib/warnings/utf8

index df1ccd6..05e7300 100644 (file)
--- a/t/lib/warnings/utf8
+++ b/t/lib/warnings/utf8
@@ -89,11 +89,12 @@ Operation "uc" returns its argument for non-Unicode code point 0x110000 at - lin
  Operation "uc" returns its argument for UTF-16 surrogate U+D800 at - line 5.
  ########
  use warnings 'utf8';
+no warnings 'deprecated'; # This is above IV_MAX on 32 bit machines
  my $big_nonUnicode = uc(chr(0x8000_0000));
  no warnings 'non_unicode';
  my $big_nonUnicode = uc(chr(0x8000_0000));
  EXPECT
-Operation "uc" returns its argument for non-Unicode code point 0x80000000 at - line 2.
+Operation "uc" returns its argument for non-Unicode code point 0x80000000 at - line 3.
  ########
  use warnings 'utf8';
  my $d7ff  = lc pack("U", 0xD7FF);
@@ -667,3 +668,35 @@ $a = fc("\x{102}");
  $a = uc("\x{103}");
  $a = ucfirst("\x{104}");
  EXPECT
+########
+# NAME Deprecation of too-large code points
+require "../test.pl";
+use warnings 'non_unicode';
+my $max_cp = ~0 >> 1;
+my $max_char = chr $max_cp;
+my $to_warn_cp = $max_cp + 1;
+my $to_warn_char = chr $to_warn_cp;
+$max_char =~ /[\x{110000}\P{Unassigned}]/;
+$to_warn_char =~ /[\x{110000}\P{Unassigned}]/;
+my $temp = qr/$max_char/;
+$temp = qr/$to_warn_char/;
+$temp = uc($max_char);
+$temp = uc($to_warn_char);
+my $file = tempfile();
+open(my $fh, "+>:utf8", $file);
+print $fh $max_char, "\n";
+print $fh $to_warn_char, "\n";
+close $fh;
+EXPECT
+OPTION regex
+It is deprecated to use code point 0x80+; the permissible max is 0x7F+ at - line \d+.
+It is deprecated to use code point 0x80+; the permissible max is 0x7F+ in pattern match \(m//\) at - line \d+.
+It is deprecated to use code point 0x80+; the permissible max is 0x7F+ in regexp compilation at - line \d+.
+It is deprecated to use code point 0x80+; the permissible max is 0x7F+ in regexp compilation at - line \d+.
+It is deprecated to use code point 0x80+; the permissible max is 0x7F+ at - line \d+.
+It is deprecated to use code point 0x80+; the permissible max is 0x7F+ in regexp compilation at - line \d+.
+Operation "uc" returns its argument for non-Unicode code point 0x7F+ at - line \d+.
+It is deprecated to use code point 0x80+; the permissible max is 0x7F+ at - line \d+.
+Operation "uc" returns its argument for non-Unicode code point 0x80+ at - line \d+.
+Code point 0x7F+ is not Unicode, may not be portable in print at - line \d+.
+It is deprecated to use code point 0x80+; the permissible max is 0x7F+ in print at - line \d+.
diff --git a/t/op/bop.t b/t/op/bop.t

index a037b06..c9bf195 100644 (file)
--- a/t/op/bop.t
+++ b/t/op/bop.t
@@ -4,6 +4,9 @@
  # test the bit operators '&', '|', '^', '~', '<<', and '>>'
  #
  
+use warnings;
+no warnings 'deprecated';
+
  BEGIN {
      chdir 't' if -d 't';
      @INC = '../lib';
diff --git a/t/op/chop.t b/t/op/chop.t

index a1126dc..d24b9e0 100644 (file)
--- a/t/op/chop.t
+++ b/t/op/chop.t
@@ -252,6 +252,7 @@ foreach my $start (@chars) {
      # [perl #73246] chop doesn't support utf8
      # the problem was UTF8_IS_START() didn't handle perl's extended UTF8
  
+    no warnings 'deprecated'; # This is above IV_MAX on 32 bit machines
      my $utf = "\x{80000001}\x{80000000}";
      my $result = chop($utf);
      is($utf, "\x{80000001}", "chopping high 'unicode'- remnant");
diff --git a/t/op/index.t b/t/op/index.t

index 8d21de7..d1e46dc 100644 (file)
--- a/t/op/index.t
+++ b/t/op/index.t
@@ -130,6 +130,7 @@ is(rindex($a, "foo",    ), 0);
  }
  
  {
+    no warnings 'deprecated'; # These are above IV_MAX on 32 bit machines
      my $a = eval q{"\x{80000000}"};
      my $s = $a.'defxyz';
      is(index($s, 'def'), 1, "0x80000000 is a single character");
diff --git a/t/op/ver.t b/t/op/ver.t

index cbbebba..503efd7 100644 (file)
--- a/t/op/ver.t
+++ b/t/op/ver.t
@@ -225,6 +225,7 @@ ok( abs($v - $]) < 10**-8 , "\$^V == \$] (numeric)" );
  
  {
  
+  no warnings 'deprecated'; # These are above IV_MAX on 32 bit machines
    # [ID 20010902.001] check if v-strings handle full UV range or not
    if ( $Config{'uvsize'} >= 4 ) {
      is(  sprintf("%vd", eval 'v2147483647.2147483648'),   '2147483647.2147483648', 'v-string > IV_MAX[32-bit]' );
diff --git a/t/opbasic/qq.t b/t/opbasic/qq.t

index 190ddb7..5d6908c 100644 (file)
--- a/t/opbasic/qq.t
+++ b/t/opbasic/qq.t
@@ -73,6 +73,7 @@ is ("a\o{1000}b", "a" . chr(0x200) . "b");
  
  # This caused a memory fault
  no warnings "utf8";
+no warnings 'deprecated'; # This is above IV_MAX on 32 bit machines
  is ("abc", eval qq[qq\x{8000_0000}abc\x{8000_0000}]);
  
  # Maybe \x{} should be an error, but if not it should certainly mean \x{0}
diff --git a/t/re/pat_advanced.t b/t/re/pat_advanced.t

index 8627753..1488a88 100644 (file)
--- a/t/re/pat_advanced.t
+++ b/t/re/pat_advanced.t
@@ -2374,6 +2374,7 @@ EOF
  
      {   # Assertion was failing on on 64-bit platforms; just didn't work on 32.
          no warnings qw(non_unicode portable);
+        no warnings 'deprecated'; # These are above IV_MAX
          use Config;
  
          # We use 'ok' instead of 'like' because the warnings are lexically
@@ -2450,7 +2451,9 @@ EOF
          # (during compilation, so use a fresh perl)
          $Config{uvsize} == 8
           or skip("need large code-points for this test", 1);
-       fresh_perl_is('/\x{E000000000}|/ and print qq(ok\n)', "ok\n", {},
+
+        # This is above IV_MAX on 32 bit machines, so turn off those warnings
+       fresh_perl_is('no warnings "deprecated"; /\x{E000000000}|/ and print qq(ok\n)', "ok\n", {},
                       "buffer overflow in TRIE_STORE_REVCHAR");
      }
  
diff --git a/utf8.c b/utf8.c

index 9367dbf..e062333 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -35,6 +35,10 @@
  
  static const char unees[] =
      "Malformed UTF-8 character (unexpected end of string)";
+static const char cp_above_legal_max[] =
+    "It is deprecated to use code point 0x%"UVXf"; the permissible max is 0x%"UVXf"";
+
+#define MAX_NON_DEPRECATED_CP (IV_MAX)
  
  /*
  =head1 Unicode Support
@@ -110,9 +114,7 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
      }
  
      /* The first problematic code point is the first surrogate */
-    if (   flags    /* It's common to turn off all these */
-        && uv >= UNICODE_SURROGATE_FIRST)
-    {
+    if (uv >= UNICODE_SURROGATE_FIRST) {
         if (UNICODE_IS_SURROGATE(uv)) {
             if (flags & UNICODE_WARN_SURROGATE) {
                 Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE),
@@ -123,6 +125,12 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
             }
         }
         else if (UNICODE_IS_SUPER(uv)) {
+            if (   UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
+                && ckWARN_d(WARN_DEPRECATED))
+            {
+               Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
+                            cp_above_legal_max, uv, MAX_NON_DEPRECATED_CP);
+            }
             if (   (flags & UNICODE_WARN_SUPER)
                 || (UNICODE_IS_ABOVE_31_BIT(uv) && (flags & UNICODE_WARN_ABOVE_31_BIT)))
              {
@@ -246,8 +254,12 @@ is the recommended wide native character-aware way of saying
  
      *(d++) = uv;
  
-This function accepts any UV as input.  To forbid or warn on non-Unicode code
-points, or those that may be problematic, see L</uvchr_to_utf8_flags>.
+This function accepts any UV as input, but very high code points (above
+C<IV_MAX> on the platform) will raise a deprecation warning.  C<IV_MAX> is
+typically 0x7FFF_FFFF in a 32-bit word.
+
+It is possible to forbid or warn on non-Unicode code points, or those that may
+be problematic by using L</uvchr_to_utf8_flags>.
  
  =cut
  */
@@ -279,21 +291,25 @@ This is the Unicode-aware way of saying
  
      *(d++) = uv;
  
-This function will convert to UTF-8 (and not warn) even code points that aren't
-legal Unicode or are problematic, unless C<flags> contains one or more of the
-following flags:
+If C<flags> is 0, this function accepts any UV as input, but very high code
+points (above C<IV_MAX> for the platform) will raise a deprecation warning.
+C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
+
+Specifying C<flags> can further restrict what is allowed and not warned on, as
+follows:
  
  If C<uv> is a Unicode surrogate code point and C<UNICODE_WARN_SURROGATE> is set,
  the function will raise a warning, provided UTF8 warnings are enabled.  If instead
  C<UNICODE_DISALLOW_SURROGATE> is set, the function will fail and return NULL.
  If both flags are set, the function will both warn and return NULL.
  
-The C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags
-affect how the function handles a Unicode non-character.  And likewise, the
-C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags affect the handling of
-code points that are
-above the Unicode maximum of 0x10FFFF.
+Similarly, the C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags
+affect how the function handles a Unicode non-character.
  
+And likewise, the C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags
+affect the handling of code points that are above the Unicode maximum of
+0x10FFFF.  Languages other than Perl may not be able to accept files that
+contain these.
  
  The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of
  the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all
@@ -307,8 +323,12 @@ these that written by the perl interpreter; nor would Perl understand files
  written by something that uses a different extension.  For these reasons, there
  is a separate set of flags that can warn and/or disallow these extremely high
  code points, even if other above-Unicode ones are accepted.  These are the
-C<UNICODE_WARN_ABOVE_31_BIT> and C<UNICODE_DISALLOW_ABOVE_31_BIT> flags.
-(Of course C<UNICODE_DISALLOW_SUPER> will treat all
+C<UNICODE_WARN_ABOVE_31_BIT> and C<UNICODE_DISALLOW_ABOVE_31_BIT> flags.  These
+are entirely independent from the deprecation warning for code points above
+C<IV_MAX>.  On 32-bit machines, it will eventually be forbidden to have any
+code point that needs more than 31 bits to represent.  When that happens,
+effectively the C<UNICODE_DISALLOW_ABOVE_31_BIT> flag will always be set on
+32-bit machines.  (Of course C<UNICODE_DISALLOW_SUPER> will treat all
  above-Unicode code points, including these, as malformations; and
  C<UNICODE_WARN_SUPER> warns on these.)
  
@@ -473,6 +493,10 @@ a malformation and raise a warning, specify both the WARN and DISALLOW flags.
  (But note that warnings are not raised if lexically disabled nor if
  C<UTF8_CHECK_ONLY> is also specified.)
  
+It is now deprecated to have very high code points (above C<IV_MAX> on the
+platform), and this function will raise a deprecation warning for these (unless
+such warnings are turned off).  This value is typically 0x7FFF_FFFF (2**31 - 1)
+in a 32-bit word.
  
  Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
  so using them is more problematic than other above-Unicode code points.  Perl
@@ -482,8 +506,12 @@ these that written by the perl interpreter; nor would Perl understand files
  written by something that uses a different extension.  For these reasons, there
  is a separate set of flags that can warn and/or disallow these extremely high
  code points, even if other above-Unicode ones are accepted.  These are the
-C<UTF8_WARN_ABOVE_31_BIT> and C<UTF8_DISALLOW_ABOVE_31_BIT> flags.
-(Of course C<UTF8_DISALLOW_SUPER> will treat all
+C<UTF8_WARN_ABOVE_31_BIT> and C<UTF8_DISALLOW_ABOVE_31_BIT> flags.  These
+are entirely independent from the deprecation warning for code points above
+C<IV_MAX>.  On 32-bit machines, it will eventually be forbidden to have any
+code point that needs more than 31 bits to represent.  When that happens,
+effectively the C<UTF8_DISALLOW_ABOVE_31_BIT> flag will always be set on
+32-bit machines.  (Of course C<UTF8_DISALLOW_SUPER> will treat all
  above-Unicode code points, including these, as malformations; and
  C<UTF8_WARN_SUPER> warns on these.)
  
@@ -706,14 +734,16 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
      /* Here, the input is considered to be well-formed, but it still could be a
       * problematic code point that is not allowed by the input parameters. */
      if (uv >= UNICODE_SURROGATE_FIRST /* isn't problematic if < this */
-       && (flags & ( UTF8_DISALLOW_NONCHAR
-                     |UTF8_DISALLOW_SURROGATE
-                     |UTF8_DISALLOW_SUPER
-                     |UTF8_DISALLOW_ABOVE_31_BIT
-                    |UTF8_WARN_NONCHAR
-                     |UTF8_WARN_SURROGATE
-                     |UTF8_WARN_SUPER
-                     |UTF8_WARN_ABOVE_31_BIT)))
+       && ((flags & ( UTF8_DISALLOW_NONCHAR
+                      |UTF8_DISALLOW_SURROGATE
+                      |UTF8_DISALLOW_SUPER
+                      |UTF8_DISALLOW_ABOVE_31_BIT
+                     |UTF8_WARN_NONCHAR
+                      |UTF8_WARN_SURROGATE
+                      |UTF8_WARN_SUPER
+                      |UTF8_WARN_ABOVE_31_BIT))
+            || (   UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
+                && ckWARN_d(WARN_DEPRECATED))))
      {
         if (UNICODE_IS_SURROGATE(uv)) {
  
@@ -789,6 +819,14 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
             if (flags & UTF8_DISALLOW_SUPER) {
                 goto disallowed;
             }
+
+            /* The deprecated warning overrides any non-deprecated one */
+            if (UNLIKELY(uv > MAX_NON_DEPRECATED_CP) && ckWARN_d(WARN_DEPRECATED))
+            {
+                sv = sv_2mortal(Perl_newSVpvf(aTHX_ cp_above_legal_max,
+                                              uv, MAX_NON_DEPRECATED_CP));
+                pack_warn = packWARN(WARN_DEPRECATED);
+            }
         }
         else if (UNICODE_IS_NONCHAR(uv)) {
             if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR
@@ -890,6 +928,9 @@ the next possible position in C<s> that could begin a non-malformed character.
  See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
  returned.
  
+Code points above the platform's C<IV_MAX> will raise a deprecation warning,
+unless such warnings are turned off.
+
  =cut
  */
  
@@ -965,6 +1006,9 @@ is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
  next possible position in C<s> that could begin a non-malformed character.
  See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
  
+Code points above the platform's C<IV_MAX> will raise a deprecation warning,
+unless such warnings are turned off.
+
  =cut
  */
  
@@ -1828,6 +1872,9 @@ mappings, like C<"utf8::ToSpecLower">.
  C<normal> is a string like C<"ToLower"> which means the swash
  C<%utf8::ToLower>.
  
+Code points above the platform's C<IV_MAX> will raise a deprecation warning,
+unless such warnings are turned off.
+
  =cut */
  
  UV
@@ -1850,6 +1897,12 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
             }
         }
         else if (UNICODE_IS_SUPER(uv1)) {
+            if (   UNLIKELY(uv1 > MAX_NON_DEPRECATED_CP)
+                && ckWARN_d(WARN_DEPRECATED))
+            {
+               Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
+                            cp_above_legal_max, uv1, MAX_NON_DEPRECATED_CP);
+            }
             if (ckWARN_d(WARN_NON_UNICODE)) {
                 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
                 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
@@ -3879,7 +3932,10 @@ Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
      /* May change: warns if surrogates, non-character code points, or
       * non-Unicode code points are in s which has length len bytes.  Returns
       * TRUE if none found; FALSE otherwise.  The only other validity check is
-     * to make sure that this won't exceed the string's length */
+     * to make sure that this won't exceed the string's length.
+     *
+     * Code points above the platform's C<IV_MAX> will raise a deprecation
+     * warning, unless those are turned off.  */
  
      const U8* const e = s + len;
      bool ok = TRUE;
@@ -3895,7 +3951,29 @@ Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
         if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) {
             STRLEN char_len;
             if (UTF8_IS_SUPER(s, e)) {
-               if (ckWARN_d(WARN_NON_UNICODE)) {
+                if (   ckWARN_d(WARN_NON_UNICODE)
+                    || (   ckWARN_d(WARN_DEPRECATED)
+#if defined(UV_IS_QUAD)
+                        /* 2**63 and up meet these conditions provided we have
+                         * a 64-bit word. */
+#   ifdef EBCDIC
+                        && *s == 0xFE && e - s >= UTF8_MAXBYTES
+                        && s[1] >= 0x49
+#   else
+                        && *s == 0xFF && e -s >= UTF8_MAXBYTES
+                        && s[2] >= 0x88
+#   endif
+#else   /* Below is 32-bit words */
+                        /* 2**31 and above meet these conditions on all EBCDIC
+                         * pages recognized for 32-bit platforms */
+#   ifdef EBCDIC
+                        && *s == 0xFE && e - s >= UTF8_MAXBYTES
+                        && s[6] >= 0x43
+#   else
+                        && *s >= 0xFE
+#   endif
+#endif
+                )) {
                      /* A side effect of this function will be to warn */
                      (void) utf8n_to_uvchr(s, e - s, &char_len, UTF8_WARN_SUPER);
                     ok = FALSE;
author	Karl Williamson <khw@cpan.org>
	Sat, 28 Nov 2015 18:49:43 +0000 (11:49 -0700)
committer	Karl Williamson <khw@cpan.org>
	Sun, 29 Nov 2015 00:19:27 +0000 (17:19 -0700)
ext/XS-APItest/t/utf8.t		patch \| blob \| blame \| history
pod/perldiag.pod		patch \| blob \| blame \| history
pod/perlunicode.pod		patch \| blob \| blame \| history
t/lib/warnings/utf8		patch \| blob \| blame \| history
t/op/bop.t		patch \| blob \| blame \| history
t/op/chop.t		patch \| blob \| blame \| history
t/op/index.t		patch \| blob \| blame \| history
t/op/ver.t		patch \| blob \| blame \| history
t/opbasic/qq.t		patch \| blob \| blame \| history
t/re/pat_advanced.t		patch \| blob \| blame \| history
utf8.c		patch \| blob \| blame \| history