Forbid above IV_MAX code points

author Karl Williamson <khw@cpan.org>

Sat, 1 Jul 2017 17:58:00 +0000 (11:58 -0600)

committer Karl Williamson <khw@cpan.org>

Thu, 13 Jul 2017 03:14:26 +0000 (21:14 -0600)
author Karl Williamson <khw@cpan.org>
Sat, 1 Jul 2017 17:58:00 +0000 (11:58 -0600)
committer Karl Williamson <khw@cpan.org>
Thu, 13 Jul 2017 03:14:26 +0000 (21:14 -0600)
diff --git a/ext/XS-APItest/t/utf8_warn_base.pl b/ext/XS-APItest/t/utf8_warn_base.pl

index 6f40c17..3eddeaa 100644 (file)
--- a/ext/XS-APItest/t/utf8_warn_base.pl
+++ b/ext/XS-APItest/t/utf8_warn_base.pl
@@ -18,8 +18,6 @@ BEGIN {
  
  $|=1;
  
-no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit
-                          # machines, and that is tested elsewhere
  use XS::APItest;
  
  my @warnings_gotten;
@@ -62,9 +60,12 @@ sub overflow_discern_len($) {
      # needed.
  
      if (isASCII) {
-        return ($::is64bit) ? 3 : ((shift == $::max_bytes)
-                                   ? 1
-                                   : 2);
+        return ($::is64bit) ? 3 : 1;
+
+        # Below is needed for code points above IV_MAX
+        #return ($::is64bit) ? 3 : ((shift == $::max_bytes)
+        #                           ? 1
+        #                           : 2);
      }
  
      return ($::is64bit) ? 2 : 8;
@@ -79,11 +80,17 @@ sub overlong_discern_len($) {
      my $length = length $string;
      my $byte = ord native_to_I8(substr($string, 0, 1));
      if (isASCII) {
-        return ($length == $::max_bytes)
-                  # This is constrained to 1 on 32-bit machines, as it
-                  # overflows there
-                ? (($::is64bit) ? 7 : 1)
+        return ($byte >= 0xFE)
+                ? ((! $::is64bit)
+                    ? 1
+                    : ($byte == 0xFF) ? 7 : 2)
                  : (($length == 2) ? 1 : 2);
+        # Below is needed for code points above IV_MAX
+        #return ($length == $::max_bytes)
+        #          # This is constrained to 1 on 32-bit machines, as it
+        #          # overflows there
+        #        ? (($::is64bit) ? 7 : 1)
+        #        : (($length == 2) ? 1 : 2);
      }
  
      return ($length == $::max_bytes) ? 8 : (($length <= 3) ? 1 : 2);
@@ -394,36 +401,51 @@ my @tests;
              : I8_to_native("\xfe\xa1\xbf\xbf\xbf\xbf\xbf"),
              (isASCII) ? 0x7FFFFFFF : 0x3FFFFFF,
          ],
-        [ "lowest 32 bit code point",
-            (isASCII)
-            ?  "\xfe\x82\x80\x80\x80\x80\x80"
-            : I8_to_native(
-                "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
-            0x80000000,
-        ],
-        [ "highest 32 bit code point",
+        [ "highest 31 bit code point",
              (isASCII)
-            ?  "\xfe\x83\xbf\xbf\xbf\xbf\xbf"
+            ?  "\xfd\xbf\xbf\xbf\xbf\xbf"
              : I8_to_native(
-               "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
-            0xFFFFFFFF,
+               "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xbf\xbf\xbf\xbf\xbf\xbf"),
+            0x7FFFFFFF,
+            1,
          ],
-        [ "Lowest 33 bit code point",
+        [ "lowest 32 bit code point",
              (isASCII)
-            ?  "\xfe\x84\x80\x80\x80\x80\x80"
+            ?  "\xfe\x82\x80\x80\x80\x80\x80"
              : I8_to_native(
-                "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"),
-            ($::is64bit) ? 0x100000000 : -1,   # Overflows on 32-bit systems
+                "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
+            ($::is64bit) ? 0x80000000 : -1,   # Overflows on 32-bit systems
+            1,
          ],
+        # Used when UV_MAX is allowed as a code point
+        #[ "highest 32 bit code point",
+        #    (isASCII)
+        #    ?  "\xfe\x83\xbf\xbf\xbf\xbf\xbf"
+        #    : I8_to_native(
+        #       "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
+        #    0xFFFFFFFF,
+        #],
+        #[ "Lowest 33 bit code point",
+        #    (isASCII)
+        #    ?  "\xfe\x84\x80\x80\x80\x80\x80"
+        #    : I8_to_native(
+        #        "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"),
+        #    ($::is64bit) ? 0x100000000 : 0x0,   # Overflows on 32-bit systems
+        #],
      );
  
      if (! $::is64bit) {
          if (isASCII) {
              push @tests,
                  [ "overlong malformation, but naively looks like overflow",
-                    "\xff\x80\x80\x80\x80\x80\x80\x83\xbf\xbf\xbf\xbf\xbf",
-                    0xFFFFFFFF,
+                    "\xff\x80\x80\x80\x80\x80\x80\x81\xbf\xbf\xbf\xbf\xbf",
+                    0x7FFFFFFF,
                  ],
+                # Used when above IV_MAX are allowed.
+                #[ "overlong malformation, but naively looks like overflow",
+                #    "\xff\x80\x80\x80\x80\x80\x80\x83\xbf\xbf\xbf\xbf\xbf",
+                #    0xFFFFFFFF,
+                #],
                  [ "overflow that old algorithm failed to detect",
                      "\xfe\x86\x80\x80\x80\x80\x80",
                      -1,
@@ -457,21 +479,37 @@ my @tests;
  
      if ($::is64bit) {
          push @tests,
-            [ "highest 64 bit code point",
+            [ "highest 63 bit code point",
                (isASCII)
-              ? "\xff\x80\x8f\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
+              ? "\xff\x80\x87\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
                : I8_to_native(
-                "\xff\xaf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"),
-              0xFFFFFFFFFFFFFFFF,
+                "\xff\xa7\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"),
+              0x7FFFFFFFFFFFFFFF,
                (isASCII) ? 1 : 2,
              ],
-            [ "first 65 bit code point",
+            [ "first 64 bit code point",
                (isASCII)
-              ? "\xff\x80\x9f\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"
+              ? "\xff\x80\x88\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"
                : I8_to_native(
-                "\xff\xb0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
+                "\xff\xa8\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                -1,
              ];
+            # Used when UV_MAX is allowed as a code point
+            #[ "highest 64 bit code point",
+            #  (isASCII)
+            #  ? "\xff\x80\x8f\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
+            #  : I8_to_native(
+            #    "\xff\xaf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"),
+            #  0xFFFFFFFFFFFFFFFF,
+            #  (isASCII) ? 1 : 2,
+            #],
+            #[ "first 65 bit code point",
+            #  (isASCII)
+            #  ? "\xff\x80\x9f\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"
+            #  : I8_to_native(
+            #    "\xff\xb0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
+            #  0,
+            #];
          if (isASCII) {
              push @tests,
                  [ "overflow that old algorithm failed to detect",
@@ -1332,7 +1370,7 @@ foreach my $test (@tests) {
                      my $expect_warnings_for_overflow;
  
                      if ($warning_type == 0) {
-                        $eval_warn = "use warnings; no warnings 'deprecated'";
+                        $eval_warn = "use warnings";
                          $expect_regular_warnings = $use_warn_flag;
  
                          # We ordinarily expect overflow warnings here.  But it
@@ -1621,7 +1659,7 @@ foreach my $test (@tests) {
                      # not just when the $this_disallow_flags is set
                      if ($disallowed) {
                          my $this_flags = $this_disallow_flags|$::UTF8_CHECK_ONLY;
-                        my $eval_text = "use warnings; no warnings 'deprecated'; \$ret_ref ="
+                        my $eval_text = "use warnings; \$ret_ref ="
                                        . " test_utf8n_to_uvchr_error('"
                                        . "$this_bytes', $this_length,"
                                        . " $this_flags)";
diff --git a/t/lib/warnings/utf8 b/t/lib/warnings/utf8

index b322ae0..9066308 100644 (file)
--- a/t/lib/warnings/utf8
+++ b/t/lib/warnings/utf8
@@ -735,38 +735,25 @@ $a = uc("\x{103}");
  $a = ucfirst("\x{104}");
  EXPECT
  ########
-# NAME Deprecation of too-large code points
+# NAME Fatality of too-large code points, but IV_MAX works, warns
  require "../test.pl";
  use warnings 'non_unicode';
  my $max_cp = ~0 >> 1;
  my $max_char = chr $max_cp;
-my $to_warn_cp = $max_cp + 1;
-my $to_warn_char = chr $to_warn_cp;
-$max_char =~ /[\x{110000}\P{Unassigned}]/;
-$to_warn_char =~ /[\x{110000}\P{Unassigned}]/;
  my $temp = qr/$max_char/;
-$temp = qr/$to_warn_char/;
  $temp = uc($max_char);
-$temp = uc($to_warn_char);
+$max_char =~ /[\x{110000}\P{Unassigned}]/;
  my $file = tempfile();
  open(my $fh, "+>:utf8", $file);
  print $fh $max_char, "\n";
-print $fh $to_warn_char, "\n";
  close $fh;
+my $error_cp = $max_cp + 1;
+my $error_char = chr $error_cp;
  EXPECT
-OPTION regex
-Use of code point 0x80+ is deprecated; the permissible max is 0x7F+\. This will be fatal in Perl 5\.28 at - line \d+.
-Use of code point 0x80+ is deprecated; the permissible max is 0x7F+\. This will be fatal in Perl 5\.28 in pattern match \(m//\) at - line \d+.
-Use of code point 0x80+ is deprecated; the permissible max is 0x7F+\. This will be fatal in Perl 5\.28 in regexp compilation at - line \d+.
-Use of code point 0x80+ is deprecated; the permissible max is 0x7F+\. This will be fatal in Perl 5\.28 in regexp compilation at - line \d+.
-Use of code point 0x80+ is deprecated; the permissible max is 0x7F+\. This will be fatal in Perl 5\.28 at - line \d+.
-Use of code point 0x80+ is deprecated; the permissible max is 0x7F+\. This will be fatal in Perl 5\.28 in regexp compilation at - line \d+.
+OPTIONS fatal regex
  Operation "uc" returns its argument for non-Unicode code point 0x7F+ at - line \d+.
-Use of code point 0x80+ is deprecated; the permissible max is 0x7F+\. This will be fatal in Perl 5\.28 in uc at - line \d+.
-Use of code point 0x80+ is deprecated; the permissible max is 0x7F+\. This will be fatal in Perl 5\.28 at - line \d+.
-Operation "uc" returns its argument for non-Unicode code point 0x80+ at - line \d+.
  Code point 0x7F+ is not Unicode, (may not be|requires a Perl extension, and so is not) portable in print at - line \d+.
-Use of code point 0x80+ is deprecated; the permissible max is 0x7F+\. This will be fatal in Perl 5\.28 in print at - line \d+.
+Use of code point 0x80+ is not allowed; the permissible max is 0x7F+\ at - line \d+.
  ########
  # NAME  [perl #127262]
  BEGIN{
diff --git a/t/op/index.t b/t/op/index.t

index 996c5e4..5e95191 100644 (file)
--- a/t/op/index.t
+++ b/t/op/index.t
@@ -131,7 +131,6 @@ is(rindex($a, "foo",    ), 0);
  
  SKIP: {
      skip("Not a 64-bit machine", 3) if length sprintf("%x", ~0) <= 8;
-    no warnings 'deprecated'; # These are above IV_MAX on 32 bit machines
      my $a = eval q{"\x{80000000}"};
      my $s = $a.'defxyz';
      is(index($s, 'def'), 1, "0x80000000 is a single character");
diff --git a/t/op/utf8decode.t b/t/op/utf8decode.t

index 1e9ea88..18ab588 100644 (file)
--- a/t/op/utf8decode.t
+++ b/t/op/utf8decode.t
@@ -85,16 +85,17 @@ foreach (<DATA>) {
              my $message;
              my $after = "";
              if ($expect64 < $expect32 && ! $is64bit) {
-                if (       substr($octets, 0, 1) gt "\xfe"
-                    || (   substr($octets, 0, 1) eq "\xfe"
-                        && length $octets > 1
-                        && substr($octets, 1, 1) le "\xbf"
-                        && substr($octets, 1, 1) ge "\x80"))
-                {
+                # This is needed for code points above IV_MAX
+                #if (       substr($octets, 0, 1) gt "\xfe"
+                #    || (   substr($octets, 0, 1) eq "\xfe"
+                #        && length $octets > 1
+                #        && substr($octets, 1, 1) le "\xbf"
+                #        && substr($octets, 1, 1) ge "\x80"))
+                #{
                      like($warnings[0], qr/overflow/, "overflow warning for $id seen");
                      shift @warnings;
                      $after .= "overflow";
-                }
+                #}
              }
  
              # The data below assumes that if there is both a 'short' and
@@ -189,8 +190,8 @@ __DATA__
  3.4.1 N15 -    30      c0:e0:80:f0:80:80:f8:80:80:80:fc:80:80:80:80:df:ef:bf:f7:bf:bf:fb:bf:bf:bf:fd:bf:bf:bf:bf       -       unexpected non-continuation byte 0xe0, immediately after start byte 0xc0
  3.5    Impossible bytes (but not with Perl's extended UTF-8)
  3.5.1 n -      1       fe      -       1 byte available, need 7
-3.5.2 N2,1 -   1       ff      -       1 byte available, need 13
-3.5.3 N9,7 -   4       fe:fe:ff:ff     -       byte 0xfe
+3.5.2 n -      1       ff      -       1 byte available, need 13
+3.5.3 N7 -     4       fe:fe:ff:ff     -       byte 0xfe
  4      Overlong sequences
  4.1    Examples of an overlong ASCII character
  4.1.1 n -      2       c0:af   -       overlong
diff --git a/t/re/pat_advanced.t b/t/re/pat_advanced.t

index 3f2d6c3..d90ceeb 100644 (file)
--- a/t/re/pat_advanced.t
+++ b/t/re/pat_advanced.t
@@ -2421,8 +2421,7 @@ EOF
          $Config{uvsize} == 8
           or skip("need large code-points for this test", 1);
  
-        # This is above IV_MAX on 32 bit machines, so turn off those warnings
-       fresh_perl_is('no warnings "deprecated"; /\x{E000000000}|/ and print qq(ok\n)', "ok\n", {},
+       fresh_perl_is('/\x{E000000000}|/ and print qq(ok\n)', "ok\n", {},
                       "buffer overflow in TRIE_STORE_REVCHAR");
      }
  
diff --git a/utf8.c b/utf8.c

index d64ce90..93cdd66 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -37,9 +37,10 @@ static const char malformed_text[] = "Malformed UTF-8 character";
  static const char unees[] =
                          "Malformed UTF-8 character (unexpected end of string)";
  static const char cp_above_legal_max[] =
- "Use of code point 0x%" UVXf " is deprecated; the permissible max is 0x%" UVXf ". This will be fatal in Perl 5.28";
+                        "Use of code point 0x%" UVXf " is not allowed; the"
+                        " permissible max is 0x%" UVXf;
  
-#define MAX_NON_DEPRECATED_CP ((UV) (IV_MAX))
+#define MAX_EXTERNALLY_LEGAL_CP ((UV) (IV_MAX))
  
  /*
  =head1 Unicode Support
@@ -207,11 +208,8 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
       * performance hit on these high EBCDIC code points. */
  
      if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
-        if (   UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
-            && ckWARN_d(WARN_DEPRECATED))
-        {
-            Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
-                        cp_above_legal_max, uv, MAX_NON_DEPRECATED_CP);
+        if (UNLIKELY(uv > MAX_EXTERNALLY_LEGAL_CP)) {
+            Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_EXTERNALLY_LEGAL_CP);
          }
          if (      (flags & UNICODE_WARN_SUPER)
              || (  (flags & UNICODE_WARN_PERL_EXTENDED)
@@ -295,9 +293,8 @@ is the recommended wide native character-aware way of saying
  
      *(d++) = uv;
  
-This function accepts any UV as input, but very high code points (above
-C<IV_MAX> on the platform)  will raise a deprecation warning.  This is
-typically 0x7FFF_FFFF in a 32-bit word.
+This function accepts any code point from 0..C<IV_MAX> as input.
+C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
  
  It is possible to forbid or warn on non-Unicode code points, or those that may
  be problematic by using L</uvchr_to_utf8_flags>.
@@ -332,9 +329,8 @@ This is the Unicode-aware way of saying
  
      *(d++) = uv;
  
-If C<flags> is 0, this function accepts any UV as input, but very high code
-points (above C<IV_MAX> for the platform)  will raise a deprecation warning.
-This is typically 0x7FFF_FFFF in a 32-bit word.
+If C<flags> is 0, this function accepts any code point from 0..C<IV_MAX> as
+input.  C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
  
  Specifying C<flags> can further restrict what is allowed and not warned on, as
  follows:
@@ -656,21 +652,13 @@ S_isFF_OVERLONG(const U8 * const s, const STRLEN len)
      return -1;
  }
  
-/* Anything larger than this will overflow the word if it were converted into a UV */
-#if defined(UV_IS_QUAD)
+#if defined(UV_IS_QUAD) /* These assume IV_MAX is 2**63-1 */
  #  ifdef EBCDIC     /* Actually is I8 */
  #   define HIGHEST_REPRESENTABLE_UTF8                                       \
-                "\xFF\xAF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
+                "\xFF\xA7\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
  #  else
  #   define HIGHEST_REPRESENTABLE_UTF8                                       \
-                "\xFF\x80\x8F\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
-#  endif
-#else   /* 32-bit */
-#  ifdef EBCDIC
-#   define HIGHEST_REPRESENTABLE_UTF8                                       \
-                "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3\xBF\xBF\xBF\xBF\xBF\xBF"
-#  else
-#   define HIGHEST_REPRESENTABLE_UTF8  "\xFE\x83\xBF\xBF\xBF\xBF\xBF"
+                "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
  #  endif
  #endif
  
@@ -680,13 +668,13 @@ S_does_utf8_overflow(const U8 * const s,
                       const bool consider_overlongs)
  {
      /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
-     * 'e' - 1 would overflow a UV on this platform; that is if it represents a
-     * code point larger than the highest representable code point.  It returns
-     * 1 if it does overflow; 0 if it doesn't, and -1 if there isn't enough
-     * information to tell.  This last return value can happen if the sequence
-     * is incomplete, missing some trailing bytes that would form a complete
-     * character.  If there are enough bytes to make a definitive decision,
-     * this function does so.
+     * 'e' - 1 would overflow an IV on this platform; that is if it represents
+     * a code point larger than the highest representable code point.  It
+     * returns 1 if it does overflow; 0 if it doesn't, and -1 if there isn't
+     * enough information to tell.  This last return value can happen if the
+     * sequence is incomplete, missing some trailing bytes that would form a
+     * complete character.  If there are enough bytes to make a definitive
+     * decision, this function does so.
       *
       * If 'consider_overlongs' is TRUE, the function checks for the possibility
       * that the sequence is an overlong that doesn't overflow.  Otherwise, it
@@ -699,13 +687,77 @@ S_does_utf8_overflow(const U8 * const s,
       *
       * 'e' - 1 must not be beyond a full character. */
  
-    const STRLEN len = e - s;
-    const U8 *x;
-    const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
  
      PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
      assert(s <= e && s + UTF8SKIP(s) >= e);
  
+#if ! defined(UV_IS_QUAD)
+
+    return is_utf8_cp_above_31_bits(s, e, consider_overlongs);
+
+#else
+
+    PERL_UNUSED_ARG(consider_overlongs);
+
+    {
+        const STRLEN len = e - s;
+        const U8 *x;
+        const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
+
+        for (x = s; x < e; x++, y++) {
+
+            if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) == *y)) {
+                continue;
+            }
+
+            /* If this byte is larger than the corresponding highest UTF-8
+             * byte, the sequence overflows; otherwise the byte is less than,
+             * and so the sequence doesn't overflow */
+            return NATIVE_UTF8_TO_I8(*x) > *y;
+
+        }
+
+        /* Got to the end and all bytes are the same.  If the input is a whole
+         * character, it doesn't overflow.  And if it is a partial character,
+         * there's not enough information to tell */
+        if (len < sizeof(HIGHEST_REPRESENTABLE_UTF8) - 1) {
+            return -1;
+        }
+
+        return 0;
+    }
+
+#endif
+
+}
+
+#if 0
+
+/* These are the portions of the above function that deal with UV_MAX instead
+ * of IV_MAX.  They are left here in case we want to combine them so that
+ * internal uses can have larger code points.  The only logic difference is
+ * that the 32-bit EBCDIC platform is treated like the 64-bit, and the 32-bit
+ * ASCII has different logic.
+ */
+
+/* Anything larger than this will overflow the word if it were converted into a UV */
+#if defined(UV_IS_QUAD)
+#  ifdef EBCDIC     /* Actually is I8 */
+#   define HIGHEST_REPRESENTABLE_UTF8                                       \
+                "\xFF\xAF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
+#  else
+#   define HIGHEST_REPRESENTABLE_UTF8                                       \
+                "\xFF\x80\x8F\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
+#  endif
+#else   /* 32-bit */
+#  ifdef EBCDIC
+#   define HIGHEST_REPRESENTABLE_UTF8                                       \
+                "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3\xBF\xBF\xBF\xBF\xBF\xBF"
+#  else
+#   define HIGHEST_REPRESENTABLE_UTF8  "\xFE\x83\xBF\xBF\xBF\xBF\xBF"
+#  endif
+#endif
+
  #if ! defined(UV_IS_QUAD) && ! defined(EBCDIC)
  
      /* On 32 bit ASCII machines, many overlongs that start with FF don't
@@ -725,34 +777,14 @@ S_does_utf8_overflow(const U8 * const s,
          return s[sizeof(FF_OVERLONG_PREFIX) - 1] > 0x83;
      }
  
-#else
-
-    PERL_UNUSED_ARG(consider_overlongs);
-
+/* Starting with the #else, the rest of the function is identical except
+ *      1.  we need to move the 'len' declaration to be global to the function
+ *      2.  the endif move to just after the UNUSED_ARG.
+ * An empty endif is given just below to satisfy the preprocessor
+ */
  #endif
  
-    for (x = s; x < e; x++, y++) {
-
-        if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) == *y)) {
-            continue;
-        }
-
-        /* If this byte is larger than the corresponding highest UTF-8 byte,
-         * the sequence overflow; otherwise the byte is less than, and so the
-         * sequence doesn't overflow */
-        return NATIVE_UTF8_TO_I8(*x) > *y;
-
-    }
-
-    /* Got to the end and all bytes are the same.  If the input is a whole
-     * character, it doesn't overflow.  And if it is a partial character,
-     * there's not enough information to tell */
-    if (len < sizeof(HIGHEST_REPRESENTABLE_UTF8) - 1) {
-        return -1;
-    }
-
-    return 0;
-}
+#endif
  
  #undef F0_ABOVE_OVERLONG
  #undef F8_ABOVE_OVERLONG
@@ -1103,10 +1135,6 @@ EBCDIC platforms, and sometimes when the L<overlong
  malformation|/C<UTF8_GOT_LONG>> is also present.  The new names accurately
  describe the situation in all cases.
  
-It is now deprecated to have very high code points (above C<IV_MAX> on the
-platforms) and this function will raise a deprecation warning for these (unless
-such warnings are turned off).  This value is typically 0x7FFF_FFFF (2**31 -1)
-in a 32-bit word.
  
  All other code points corresponding to Unicode characters, including private
  use and those yet to be assigned, are never considered malformed and never
@@ -1213,7 +1241,7 @@ in a position where only a continuation type one should be.
  =item C<UTF8_GOT_OVERFLOW>
  
  The input sequence was malformed in that it is for a code point that is not
-representable in the number of bits available in a UV on the current platform.
+representable in the number of bits available in an IV on the current platform.
  
  =item C<UTF8_GOT_SHORT>
  
@@ -1474,14 +1502,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                       |UTF8_WARN_NONCHAR
                        |UTF8_WARN_SURROGATE
                        |UTF8_WARN_SUPER
-                      |UTF8_WARN_PERL_EXTENDED))
-                   /* In case of a malformation, 'uv' is not valid, and has
-                    * been changed to something in the Unicode range.
-                    * Currently we don't output a deprecation message if there
-                    * is already a malformation, so we don't have to special
-                    * case the test immediately below */
-            || (   UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
-                && ckWARN_d(WARN_DEPRECATED))))
+                      |UTF8_WARN_PERL_EXTENDED))))
      {
          /* If there were no malformations, or the only malformation is an
           * overlong, 'uv' is valid */
@@ -1587,11 +1608,9 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                      disallowed = TRUE;
                  }
  
-                /* Likewise, warn if any say to, plus if deprecation warnings
-                 * are on, because this code point is above IV_MAX */
-                if (      ckWARN_d(WARN_DEPRECATED)
-                    || ! (flags & UTF8_ALLOW_OVERFLOW)
-                    ||   (flags & (UTF8_WARN_SUPER|UTF8_WARN_PERL_EXTENDED)))
+                /* Likewise, warn if any say to */
+                if (  ! (flags & UTF8_ALLOW_OVERFLOW)
+                    ||  (flags & (UTF8_WARN_SUPER|UTF8_WARN_PERL_EXTENDED)))
                  {
  
                      /* The warnings code explicitly says it doesn't handle the
@@ -1793,21 +1812,6 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                      *errors |= UTF8_GOT_SUPER;
                      disallowed = TRUE;
                  }
-
-                /* The deprecated warning overrides any non-deprecated one.  If
-                 * there are other problems, a deprecation message is not
-                 * really helpful, so don't bother to raise it in that case.
-                 * This also keeps the code from having to handle the case
-                 * where 'uv' is not valid. */
-                if (   ! (orig_problems
-                                    & (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
-                    && UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
-                    && ckWARN_d(WARN_DEPRECATED))
-                {
-                    message = Perl_form(aTHX_ cp_above_legal_max,
-                                              uv, MAX_NON_DEPRECATED_CP);
-                    pack_warn = packWARN(WARN_DEPRECATED);
-                }
              }
              else if (possible_problems & UTF8_GOT_NONCHAR) {
                  possible_problems &= ~UTF8_GOT_NONCHAR;
@@ -1936,9 +1940,6 @@ the next possible position in C<s> that could begin a non-malformed character.
  See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
  returned.
  
-Code points above the platform's C<IV_MAX> will raise a deprecation warning,
-unless those are turned off.
-
  =cut
  
  Also implemented as a macro in utf8.h
@@ -1978,9 +1979,6 @@ is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
  next possible position in C<s> that could begin a non-malformed character.
  See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
  
-Code points above the platform's C<IV_MAX> will raise a deprecation warning,
-unless those are turned off.
-
  =cut
  */
  
@@ -3118,11 +3116,9 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp,
                  }
  
                  if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
-                    if (   UNLIKELY(uv1 > MAX_NON_DEPRECATED_CP)
-                        && ckWARN_d(WARN_DEPRECATED))
-                    {
-                        Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
-                                cp_above_legal_max, uv1, MAX_NON_DEPRECATED_CP);
+                    if (UNLIKELY(uv1 > MAX_EXTERNALLY_LEGAL_CP)) {
+                        Perl_croak(aTHX_ cp_above_legal_max, uv1,
+                                         MAX_EXTERNALLY_LEGAL_CP);
                      }
                      if (ckWARN_d(WARN_NON_UNICODE)) {
                          const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
@@ -5220,10 +5216,8 @@ Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
      /* May change: warns if surrogates, non-character code points, or
       * non-Unicode code points are in 's' which has length 'len' bytes.
       * Returns TRUE if none found; FALSE otherwise.  The only other validity
-     * check is to make sure that this won't exceed the string's length.
-     *
-     * Code points above the platform's C<IV_MAX> will raise a deprecation
-     * warning, unless those are turned off.  */
+     * check is to make sure that this won't exceed the string's length nor
+     * overflow */
  
      const U8* const e = s + len;
      bool ok = TRUE;
author	Karl Williamson <khw@cpan.org>
	Sat, 1 Jul 2017 17:58:00 +0000 (11:58 -0600)
committer	Karl Williamson <khw@cpan.org>
	Thu, 13 Jul 2017 03:14:26 +0000 (21:14 -0600)
ext/XS-APItest/t/utf8_warn_base.pl		patch \| blob \| blame \| history
t/lib/warnings/utf8		patch \| blob \| blame \| history
t/op/index.t		patch \| blob \| blame \| history
t/op/utf8decode.t		patch \| blob \| blame \| history
t/re/pat_advanced.t		patch \| blob \| blame \| history
utf8.c		patch \| blob \| blame \| history