Spelling correction for consistency with pod/perldebguts.pod.

[perl5.git] / utf8.c
diff --git a/utf8.c b/utf8.c

index 9619da7..94e3267 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -36,10 +36,13 @@
  static const char malformed_text[] = "Malformed UTF-8 character";
  static const char unees[] =
                          "Malformed UTF-8 character (unexpected end of string)";
  static const char malformed_text[] = "Malformed UTF-8 character";
  static const char unees[] =
                          "Malformed UTF-8 character (unexpected end of string)";
+
+/* Be sure to synchronize this message with the similar one in regcomp.c */
  static const char cp_above_legal_max[] =
  static const char cp_above_legal_max[] =
- "Use of code point 0x%" UVXf " is deprecated; the permissible max is 0x%" UVXf ". This will be fatal in Perl 5.28";
+                        "Use of code point 0x%" UVXf " is not allowed; the"
+                        " permissible max is 0x%" UVXf;
  
  
-#define MAX_NON_DEPRECATED_CP ((UV) (IV_MAX))
+#define MAX_EXTERNALLY_LEGAL_CP ((UV) (IV_MAX))
  
  /*
  =head1 Unicode Support
  
  /*
  =head1 Unicode Support
@@ -100,6 +103,29 @@ Perl__force_out_malformed_utf8_message(pTHX_
      }
  }
  
      }
  }
  
+STATIC HV *
+S_new_msg_hv(pTHX_ const char * const message, /* The message text */
+                   U32 categories,  /* Packed warning categories */
+                   U32 flag)        /* Flag associated with this message */
+{
+    /* Creates, populates, and returns an HV* that describes an error message
+     * for the translators between UTF8 and code point */
+
+    SV* msg_sv = newSVpv(message, 0);
+    SV* category_sv = newSVuv(categories);
+    SV* flag_bit_sv = newSVuv(flag);
+
+    HV* msg_hv = newHV();
+
+    PERL_ARGS_ASSERT_NEW_MSG_HV;
+
+    (void) hv_stores(msg_hv, "text", msg_sv);
+    (void) hv_stores(msg_hv, "warn_categories",  category_sv);
+    (void) hv_stores(msg_hv, "flag_bit", flag_bit_sv);
+
+    return msg_hv;
+}
+
  /*
  =for apidoc uvoffuni_to_utf8_flags
  
  /*
  =for apidoc uvoffuni_to_utf8_flags
  
@@ -116,31 +142,56 @@ For details, see the description for L</uvchr_to_utf8_flags>.
  =cut
  */
  
  =cut
  */
  
+U8 *
+Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
+{
+    PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
+
+    return uvoffuni_to_utf8_flags_msgs(d, uv, flags, NULL);
+}
+
  /* All these formats take a single UV code point argument */
  const char surrogate_cp_format[] = "UTF-16 surrogate U+%04" UVXf;
  const char nonchar_cp_format[]   = "Unicode non-character U+%04" UVXf
                                     " is not recommended for open interchange";
  const char super_cp_format[]     = "Code point 0x%" UVXf " is not Unicode,"
                                     " may not be portable";
  /* All these formats take a single UV code point argument */
  const char surrogate_cp_format[] = "UTF-16 surrogate U+%04" UVXf;
  const char nonchar_cp_format[]   = "Unicode non-character U+%04" UVXf
                                     " is not recommended for open interchange";
  const char super_cp_format[]     = "Code point 0x%" UVXf " is not Unicode,"
                                     " may not be portable";
-const char above_31_bit_cp_format[] = "Code point 0x%" UVXf " is not"
-                                      " Unicode, and not portable";
+const char perl_extended_cp_format[] = "Code point 0x%" UVXf " is not"        \
+                                       " Unicode, requires a Perl extension," \
+                                       " and so is not portable";
  
  
-#define HANDLE_UNICODE_SURROGATE(uv, flags)                         \
+#define HANDLE_UNICODE_SURROGATE(uv, flags, msgs)                   \
      STMT_START {                                                    \
          if (flags & UNICODE_WARN_SURROGATE) {                       \
      STMT_START {                                                    \
          if (flags & UNICODE_WARN_SURROGATE) {                       \
-            Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE),        \
-                                   surrogate_cp_format, uv);        \
+            U32 category = packWARN(WARN_SURROGATE);                \
+            const char * format = surrogate_cp_format;              \
+            if (msgs) {                                             \
+                *msgs = new_msg_hv(Perl_form(aTHX_ format, uv),     \
+                                   category,                        \
+                                   UNICODE_GOT_SURROGATE);          \
+            }                                                       \
+            else {                                                  \
+                Perl_ck_warner_d(aTHX_ category, format, uv);       \
+            }                                                       \
          }                                                           \
          if (flags & UNICODE_DISALLOW_SURROGATE) {                   \
              return NULL;                                            \
          }                                                           \
      } STMT_END;
  
          }                                                           \
          if (flags & UNICODE_DISALLOW_SURROGATE) {                   \
              return NULL;                                            \
          }                                                           \
      } STMT_END;
  
-#define HANDLE_UNICODE_NONCHAR(uv, flags)                           \
+#define HANDLE_UNICODE_NONCHAR(uv, flags, msgs)                     \
      STMT_START {                                                    \
          if (flags & UNICODE_WARN_NONCHAR) {                         \
      STMT_START {                                                    \
          if (flags & UNICODE_WARN_NONCHAR) {                         \
-            Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR),          \
-                                  nonchar_cp_format, uv);          \
+            U32 category = packWARN(WARN_NONCHAR);                  \
+            const char * format = nonchar_cp_format;                \
+            if (msgs) {                                             \
+                *msgs = new_msg_hv(Perl_form(aTHX_ format, uv),     \
+                                   category,                        \
+                                   UNICODE_GOT_NONCHAR);            \
+            }                                                       \
+            else {                                                  \
+                Perl_ck_warner_d(aTHX_ category, format, uv);       \
+            }                                                       \
          }                                                           \
          if (flags & UNICODE_DISALLOW_NONCHAR) {                     \
              return NULL;                                            \
          }                                                           \
          if (flags & UNICODE_DISALLOW_NONCHAR) {                     \
              return NULL;                                            \
@@ -153,10 +204,62 @@ const char above_31_bit_cp_format[] = "Code point 0x%" UVXf " is not"
  #define MARK    UTF_CONTINUATION_MARK
  #define MASK    UTF_CONTINUATION_MASK
  
  #define MARK    UTF_CONTINUATION_MARK
  #define MASK    UTF_CONTINUATION_MASK
  
+/*
+=for apidoc uvchr_to_utf8_flags_msgs
+
+THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
+
+Most code should use C<L</uvchr_to_utf8_flags>()> rather than call this directly.
+
+This function is for code that wants any warning and/or error messages to be
+returned to the caller rather than be displayed.  All messages that would have
+been displayed if all lexcial warnings are enabled will be returned.
+
+It is just like C<L</uvchr_to_utf8_flags>> but it takes an extra parameter
+placed after all the others, C<msgs>.  If this parameter is 0, this function
+behaves identically to C<L</uvchr_to_utf8_flags>>.  Otherwise, C<msgs> should
+be a pointer to an C<HV *> variable, in which this function creates a new HV to
+contain any appropriate messages.  The hash has three key-value pairs, as
+follows:
+
+=over 4
+
+=item C<text>
+
+The text of the message as a C<SVpv>.
+
+=item C<warn_categories>
+
+The warning category (or categories) packed into a C<SVuv>.
+
+=item C<flag>
+
+A single flag bit associated with this message, in a C<SVuv>.
+The bit corresponds to some bit in the C<*errors> return value,
+such as C<UNICODE_GOT_SURROGATE>.
+
+=back
+
+It's important to note that specifying this parameter as non-null will cause
+any warnings this function would otherwise generate to be suppressed, and
+instead be placed in C<*msgs>.  The caller can check the lexical warnings state
+(or not) when choosing what to do with the returned messages.
+
+The caller, of course, is responsible for freeing any returned HV.
+
+=cut
+*/
+
+/* Undocumented; we don't want people using this.  Instead they should use
+ * uvchr_to_utf8_flags_msgs() */
  U8 *
  U8 *
-Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
+Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV uv, const UV flags, HV** msgs)
  {
  {
-    PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
+    PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS_MSGS;
+
+    if (msgs) {
+        *msgs = NULL;
+    }
  
      if (OFFUNI_IS_INVARIANT(uv)) {
         *d++ = LATIN1_TO_NATIVE(uv);
  
      if (OFFUNI_IS_INVARIANT(uv)) {
         *d++ = LATIN1_TO_NATIVE(uv);
@@ -188,10 +291,10 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
              if (UNLIKELY(   UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)
                           || UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
              {
              if (UNLIKELY(   UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)
                           || UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
              {
-                HANDLE_UNICODE_NONCHAR(uv, flags);
+                HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
              }
              else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
              }
              else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
-                HANDLE_UNICODE_SURROGATE(uv, flags);
+                HANDLE_UNICODE_SURROGATE(uv, flags, msgs);
              }
          }
  #endif
              }
          }
  #endif
@@ -206,33 +309,44 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
       * performance hit on these high EBCDIC code points. */
  
      if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
       * performance hit on these high EBCDIC code points. */
  
      if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
-        if (   UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
-            && ckWARN_d(WARN_DEPRECATED))
-        {
-            Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
-                        cp_above_legal_max, uv, MAX_NON_DEPRECATED_CP);
+        if (UNLIKELY(uv > MAX_EXTERNALLY_LEGAL_CP)) {
+            Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_EXTERNALLY_LEGAL_CP);
          }
          if (       (flags & UNICODE_WARN_SUPER)
          }
          if (       (flags & UNICODE_WARN_SUPER)
-            || (   UNICODE_IS_ABOVE_31_BIT(uv)
-                && (flags & UNICODE_WARN_ABOVE_31_BIT)))
+            || (   (flags & UNICODE_WARN_PERL_EXTENDED)
+                && UNICODE_IS_PERL_EXTENDED(uv)))
          {
          {
-            Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
+            const char * format = super_cp_format;
+            U32 category = packWARN(WARN_NON_UNICODE);
+            U32 flag = UNICODE_GOT_SUPER;
+
+            /* Choose the more dire applicable warning */
+            if (UNICODE_IS_PERL_EXTENDED(uv)) {
+                format = perl_extended_cp_format;
+                if (flags & (UNICODE_WARN_PERL_EXTENDED
+                            |UNICODE_DISALLOW_PERL_EXTENDED))
+                {
+                    flag = UNICODE_GOT_PERL_EXTENDED;
+                }
+            }
  
  
-              /* Choose the more dire applicable warning */
-              (UNICODE_IS_ABOVE_31_BIT(uv))
-              ? above_31_bit_cp_format
-              : super_cp_format,
-             uv);
+            if (msgs) {
+                *msgs = new_msg_hv(Perl_form(aTHX_ format, uv),
+                                   category, flag);
+            }
+            else {
+                Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE), format, uv);
+            }
          }
          if (       (flags & UNICODE_DISALLOW_SUPER)
          }
          if (       (flags & UNICODE_DISALLOW_SUPER)
-            || (   UNICODE_IS_ABOVE_31_BIT(uv)
-                && (flags & UNICODE_DISALLOW_ABOVE_31_BIT)))
+            || (   (flags & UNICODE_DISALLOW_PERL_EXTENDED)
+                &&  UNICODE_IS_PERL_EXTENDED(uv)))
          {
              return NULL;
          }
      }
      else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) {
          {
              return NULL;
          }
      }
      else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) {
-        HANDLE_UNICODE_NONCHAR(uv, flags);
+        HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
      }
  
      /* Test for and handle 4-byte result.   In the test immediately below, the
      }
  
      /* Test for and handle 4-byte result.   In the test immediately below, the
@@ -251,10 +365,10 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
                     characters.  The end-plane non-characters for EBCDIC were
                     handled just above */
          if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv))) {
                     characters.  The end-plane non-characters for EBCDIC were
                     handled just above */
          if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv))) {
-            HANDLE_UNICODE_NONCHAR(uv, flags);
+            HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
          }
          else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
          }
          else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
-            HANDLE_UNICODE_SURROGATE(uv, flags);
+            HANDLE_UNICODE_SURROGATE(uv, flags, msgs);
          }
  #endif
  
          }
  #endif
  
@@ -294,9 +408,8 @@ is the recommended wide native character-aware way of saying
  
      *(d++) = uv;
  
  
      *(d++) = uv;
  
-This function accepts any UV as input, but very high code points (above
-C<IV_MAX> on the platform)  will raise a deprecation warning.  This is
-typically 0x7FFF_FFFF in a 32-bit word.
+This function accepts any code point from 0..C<IV_MAX> as input.
+C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
  
  It is possible to forbid or warn on non-Unicode code points, or those that may
  be problematic by using L</uvchr_to_utf8_flags>.
  
  It is possible to forbid or warn on non-Unicode code points, or those that may
  be problematic by using L</uvchr_to_utf8_flags>.
@@ -331,9 +444,8 @@ This is the Unicode-aware way of saying
  
      *(d++) = uv;
  
  
      *(d++) = uv;
  
-If C<flags> is 0, this function accepts any UV as input, but very high code
-points (above C<IV_MAX> for the platform)  will raise a deprecation warning.
-This is typically 0x7FFF_FFFF in a 32-bit word.
+If C<flags> is 0, this function accepts any code point from 0..C<IV_MAX> as
+input.  C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
  
  Specifying C<flags> can further restrict what is allowed and not warned on, as
  follows:
  
  Specifying C<flags> can further restrict what is allowed and not warned on, as
  follows:
@@ -362,30 +474,25 @@ defined in
  L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
  See L<perlunicode/Noncharacter code points>.
  
  L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
  See L<perlunicode/Noncharacter code points>.
  
-Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
-so using them is more problematic than other above-Unicode code points.  Perl
-invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
-likely that non-Perl languages will not be able to read files that contain
-these that written by the perl interpreter; nor would Perl understand files
-written by something that uses a different extension.  For these reasons, there
-is a separate set of flags that can warn and/or disallow these extremely high
-code points, even if other above-Unicode ones are accepted.  These are the
-C<UNICODE_WARN_ABOVE_31_BIT> and C<UNICODE_DISALLOW_ABOVE_31_BIT> flags.  These
-are entirely independent from the deprecation warning for code points above
-C<IV_MAX>.  On 32-bit machines, it will eventually be forbidden to have any
-code point that needs more than 31 bits to represent.  When that happens,
-effectively the C<UNICODE_DISALLOW_ABOVE_31_BIT> flag will always be set on
-32-bit machines.  (Of course C<UNICODE_DISALLOW_SUPER> will treat all
-above-Unicode code points, including these, as malformations; and
-C<UNICODE_WARN_SUPER> warns on these.)
-
-On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
-extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
-than on ASCII.  Prior to that, code points 2**31 and higher were simply
-unrepresentable, and a different, incompatible method was used to represent
-code points between 2**30 and 2**31 - 1.  The flags C<UNICODE_WARN_ABOVE_31_BIT>
-and C<UNICODE_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
-platforms, warning and disallowing 2**31 and higher.
+Extremely high code points were never specified in any standard, and require an
+extension to UTF-8 to express, which Perl does.  It is likely that programs
+written in something other than Perl would not be able to read files that
+contain these; nor would Perl understand files written by something that uses a
+different extension.  For these reasons, there is a separate set of flags that
+can warn and/or disallow these extremely high code points, even if other
+above-Unicode ones are accepted.  They are the C<UNICODE_WARN_PERL_EXTENDED>
+and C<UNICODE_DISALLOW_PERL_EXTENDED> flags.  For more information see
+L</C<UTF8_GOT_PERL_EXTENDED>>.  Of course C<UNICODE_DISALLOW_SUPER> will
+treat all above-Unicode code points, including these, as malformations.  (Note
+that the Unicode standard considers anything above 0x10FFFF to be illegal, but
+there are standards predating it that allow up to 0x7FFF_FFFF (2**31 -1))
+
+A somewhat misleadingly named synonym for C<UNICODE_WARN_PERL_EXTENDED> is
+retained for backward compatibility: C<UNICODE_WARN_ABOVE_31_BIT>.  Similarly,
+C<UNICODE_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
+C<UNICODE_DISALLOW_PERL_EXTENDED>.  The names are misleading because on EBCDIC
+platforms,these flags can apply to code points that actually do fit in 31 bits.
+The new names accurately describe the situation in all cases.
  
  =cut
  */
  
  =cut
  */
@@ -399,8 +506,12 @@ Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
      return uvchr_to_utf8_flags(d, uv, flags);
  }
  
      return uvchr_to_utf8_flags(d, uv, flags);
  }
  
-PERL_STATIC_INLINE bool
-S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e)
+#ifndef UV_IS_QUAD
+
+STATIC int
+S_is_utf8_cp_above_31_bits(const U8 * const s,
+                           const U8 * const e,
+                           const bool consider_overlongs)
  {
      /* Returns TRUE if the first code point represented by the Perl-extended-
       * UTF-8-encoded string starting at 's', and looking no further than 'e -
  {
      /* Returns TRUE if the first code point represented by the Perl-extended-
       * UTF-8-encoded string starting at 's', and looking no further than 'e -
@@ -412,166 +523,170 @@ S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e)
       * the final ones necessary for the complete representation may be beyond
       * 'e - 1'.
       *
       * the final ones necessary for the complete representation may be beyond
       * 'e - 1'.
       *
-     * The function assumes that the sequence is well-formed UTF-8 as far as it
-     * goes, and is for a UTF-8 variant code point.  If the sequence is
-     * incomplete, the function returns FALSE if there is any well-formed
-     * UTF-8 byte sequence that can complete it in such a way that a code point
-     * < 2**31 is produced; otherwise it returns TRUE.
-     *
-     * Getting this exactly right is slightly tricky, and has to be done in
-     * several places in this file, so is centralized here.  It is based on the
-     * following table:
+     * The function also can handle the case where the input is an overlong
+     * sequence.  If 'consider_overlongs' is 0, the function assumes the
+     * input is not overlong, without checking, and will return based on that
+     * assumption.  If this parameter is 1, the function will go to the trouble
+     * of figuring out if it actually evaluates to above or below 31 bits.
       *
       *
-     * U+7FFFFFFF (2 ** 31 - 1)
-     *      ASCII: \xFD\xBF\xBF\xBF\xBF\xBF
-     *   IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73
-     *    IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72
-     *   POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75
-     *         I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF
-     * U+80000000 (2 ** 31):
-     *      ASCII: \xFE\x82\x80\x80\x80\x80\x80
-     *              [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10  11  12  13
-     *   IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
-     *    IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
-     *   POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
-     *         I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0
+     * The sequence is otherwise assumed to be well-formed, without checking.
       */
  
       */
  
-#ifdef EBCDIC
-
-    /* [0] is start byte  [1] [2] [3] [4] [5] [6] [7] */
-    const U8 prefix[] = "\x41\x41\x41\x41\x41\x41\x42";
-    const STRLEN prefix_len = sizeof(prefix) - 1;
      const STRLEN len = e - s;
      const STRLEN len = e - s;
-    const STRLEN cmp_len = MIN(prefix_len, len - 1);
+    int is_overlong;
  
  
-#else
+    PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS;
  
  
-    PERL_UNUSED_ARG(e);
+    assert(! UTF8_IS_INVARIANT(*s) && e > s);
  
  
-#endif
-
-    PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS;
+#ifdef EBCDIC
  
  
-    assert(! UTF8_IS_INVARIANT(*s));
+    PERL_UNUSED_ARG(consider_overlongs);
  
  
-#ifndef EBCDIC
+    /* On the EBCDIC code pages we handle, only the native start byte 0xFE can
+     * mean a 32-bit or larger code point (0xFF is an invariant).  0xFE can
+     * also be the start byte for a 31-bit code point; we need at least 2
+     * bytes, and maybe up through 8 bytes, to determine that.  (It can also be
+     * the start byte for an overlong sequence, but for 30-bit or smaller code
+     * points, so we don't have to worry about overlongs on EBCDIC.) */
+    if (*s != 0xFE) {
+        return 0;
+    }
  
  
-    /* Technically, a start byte of FE can be for a code point that fits into
-     * 31 bytes, but not for well-formed UTF-8: doing that requires an overlong
-     * malformation. */
-    return (*s >= 0xFE);
+    if (len == 1) {
+        return -1;
+    }
  
  #else
  
  
  #else
  
-    /* On the EBCDIC code pages we handle, only the native start byte 0xFE can
-     * mean a 32-bit or larger code point (0xFF is an invariant).  For 0xFE, we
-     * need at least 2 bytes, and maybe up through 8 bytes, to be sure that the
-     * value is above 31 bits. */
-    if (*s != 0xFE || len == 1) {
-        return FALSE;
+    /* On ASCII, FE and FF are the only start bytes that can evaluate to
+     * needing more than 31 bits. */
+    if (LIKELY(*s < 0xFE)) {
+        return 0;
      }
  
      }
  
-    /* Note that in UTF-EBCDIC, the two lowest possible continuation bytes are
-     * \x41 and \x42. */
-    return cBOOL(memGT(s + 1, prefix, cmp_len));
-
-#endif
+    /* What we have left are FE and FF.  Both of these require more than 31
+     * bits unless they are for overlongs. */
+    if (! consider_overlongs) {
+        return 1;
+    }
  
  
-}
+    /* Here, we have FE or FF.  If the input isn't overlong, it evaluates to
+     * above 31 bits.  But we need more than one byte to discern this, so if
+     * passed just the start byte, it could be an overlong evaluating to
+     * smaller */
+    if (len == 1) {
+        return -1;
+    }
  
  
-/* Anything larger than this will overflow the word if it were converted into a UV */
-#if defined(UV_IS_QUAD)
-#  ifdef EBCDIC     /* Actually is I8 */
-#   define HIGHEST_REPRESENTABLE_UTF8                                       \
-                "\xFF\xAF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
-#  else
-#   define HIGHEST_REPRESENTABLE_UTF8                                       \
-                "\xFF\x80\x8F\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
-#  endif
-#else   /* 32-bit */
-#  ifdef EBCDIC
-#   define HIGHEST_REPRESENTABLE_UTF8                                       \
-                "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3\xBF\xBF\xBF\xBF\xBF\xBF"
-#  else
-#   define HIGHEST_REPRESENTABLE_UTF8  "\xFE\x83\xBF\xBF\xBF\xBF\xBF"
-#  endif
-#endif
+    /* Having excluded len==1, and knowing that FE and FF are both valid start
+     * bytes, we can call the function below to see if the sequence is
+     * overlong.  (We don't need the full generality of the called function,
+     * but for these huge code points, speed shouldn't be a consideration, and
+     * the compiler does have enough information, since it's static to this
+     * file, to optimize to just the needed parts.) */
+    is_overlong = is_utf8_overlong_given_start_byte_ok(s, len);
  
  
-PERL_STATIC_INLINE bool
-S_does_utf8_overflow(const U8 * const s, const U8 * e)
-{
-    const U8 *x;
-    const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
+    /* If it isn't overlong, more than 31 bits are required. */
+    if (is_overlong == 0) {
+        return 1;
+    }
  
  
-#if ! defined(UV_IS_QUAD) && ! defined(EBCDIC)
+    /* If it is indeterminate if it is overlong, return that */
+    if (is_overlong < 0) {
+        return -1;
+    }
  
  
-    const STRLEN len = e - s;
+    /* Here is overlong.  Such a sequence starting with FE is below 31 bits, as
+     * the max it can be is 2**31 - 1 */
+    if (*s == 0xFE) {
+        return 0;
+    }
  
  #endif
  
  
  #endif
  
-    /* Returns a boolean as to if this UTF-8 string would overflow a UV on this
-     * platform, that is if it represents a code point larger than the highest
-     * representable code point.  (For ASCII platforms, we could use memcmp()
-     * because we don't have to convert each byte to I8, but it's very rare
-     * input indeed that would approach overflow, so the loop below will likely
-     * only get executed once.
-     *
-     * 'e' must not be beyond a full character.  If it is less than a full
-     * character, the function returns FALSE if there is any input beyond 'e'
-     * that could result in a non-overflowing code point */
+    /* Here, ASCII and EBCDIC rejoin:
+    *  On ASCII:   We have an overlong sequence starting with FF
+    *  On EBCDIC:  We have a sequence starting with FE. */
  
  
-    PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
-    assert(s <= e && s + UTF8SKIP(s) >= e);
+    {   /* For C89, use a block so the declaration can be close to its use */
  
  
-#if ! defined(UV_IS_QUAD) && ! defined(EBCDIC)
+#ifdef EBCDIC
  
  
-    /* On 32 bit ASCII machines, many overlongs that start with FF don't
-     * overflow */
+        /* U+7FFFFFFF (2 ** 31 - 1)
+         *              [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10  11  12  13
+         *   IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73
+         *    IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72
+         *   POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75
+         *         I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF
+         * U+80000000 (2 ** 31):
+         *   IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
+         *    IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
+         *   POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
+         *         I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0
+         *
+         * and since we know that *s = \xfe, any continuation sequcence
+         * following it that is gt the below is above 31 bits
+                                                [0] [1] [2] [3] [4] [5] [6] */
+        const U8 conts_for_highest_30_bit[] = "\x41\x41\x41\x41\x41\x41\x42";
  
  
-    if (isFF_OVERLONG(s, len)) {
-        const U8 max_32_bit_overlong[] = "\xFF\x80\x80\x80\x80\x80\x80\x84";
-        return memGE(s, max_32_bit_overlong,
-                                MIN(len, sizeof(max_32_bit_overlong) - 1));
-    }
+#else
  
  
-#endif
+        /* FF overlong for U+7FFFFFFF (2 ** 31 - 1)
+         *      ASCII: \xFF\x80\x80\x80\x80\x80\x80\x81\xBF\xBF\xBF\xBF\xBF
+         * FF overlong for U+80000000 (2 ** 31):
+         *      ASCII: \xFF\x80\x80\x80\x80\x80\x80\x82\x80\x80\x80\x80\x80
+         * and since we know that *s = \xff, any continuation sequcence
+         * following it that is gt the below is above 30 bits
+                                                [0] [1] [2] [3] [4] [5] [6] */
+        const U8 conts_for_highest_30_bit[] = "\x80\x80\x80\x80\x80\x80\x81";
  
  
-    for (x = s; x < e; x++, y++) {
  
  
-            /* If this byte is larger than the corresponding highest UTF-8
-                * byte, it overflows */
-            if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) {
-                return TRUE;
-            }
-
-            /* If not the same as this byte, it must be smaller, doesn't
-                * overflow */
-            if (LIKELY(NATIVE_UTF8_TO_I8(*x) != *y)) {
-                return FALSE;
-            }
+#endif
+        const STRLEN conts_len = sizeof(conts_for_highest_30_bit) - 1;
+        const STRLEN cmp_len = MIN(conts_len, len - 1);
+
+        /* Now compare the continuation bytes in s with the ones we have
+         * compiled in that are for the largest 30 bit code point.  If we have
+         * enough bytes available to determine the answer, or the bytes we do
+         * have differ from them, we can compare the two to get a definitive
+         * answer (Note that in UTF-EBCDIC, the two lowest possible
+         * continuation bytes are \x41 and \x42.) */
+        if (cmp_len >= conts_len || memNE(s + 1,
+                                          conts_for_highest_30_bit,
+                                          cmp_len))
+        {
+            return cBOOL(memGT(s + 1, conts_for_highest_30_bit, cmp_len));
          }
  
          }
  
-    /* Got to the end and all bytes are the same.  If the input is a whole
-     * character, it doesn't overflow.  And if it is a partial character,
-     * there's not enough information to tell, so assume doesn't overflow */
-    return FALSE;
+        /* Here, all the bytes we have are the same as the highest 30-bit code
+         * point, but we are missing so many bytes that we can't make the
+         * determination */
+        return -1;
+    }
  }
  
  }
  
-PERL_STATIC_INLINE bool
+#endif
+
+PERL_STATIC_INLINE int
  S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
  {
  S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
  {
-    /* Overlongs can occur whenever the number of continuation bytes
-     * changes.  That means whenever the number of leading 1 bits in a start
-     * byte increases from the next lower start byte.  That happens for start
-     * bytes C0, E0, F0, F8, FC, FE, and FF.  On modern perls, the following
-     * illegal start bytes have already been excluded, so don't need to be
-     * tested here;
+    /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
+     * 's' + 'len' - 1 is an overlong.  It returns 1 if it is an overlong; 0 if
+     * it isn't, and -1 if there isn't enough information to tell.  This last
+     * return value can happen if the sequence is incomplete, missing some
+     * trailing bytes that would form a complete character.  If there are
+     * enough bytes to make a definitive decision, this function does so.
+     * Usually 2 bytes sufficient.
+     *
+     * Overlongs can occur whenever the number of continuation bytes changes.
+     * That means whenever the number of leading 1 bits in a start byte
+     * increases from the next lower start byte.  That happens for start bytes
+     * C0, E0, F0, F8, FC, FE, and FF.  On modern perls, the following illegal
+     * start bytes have already been excluded, so don't need to be tested here;
       * ASCII platforms: C0, C1
       * EBCDIC platforms C0, C1, C2, C3, C4, E0
       * ASCII platforms: C0, C1
       * EBCDIC platforms C0, C1, C2, C3, C4, E0
-     *
-     * At least a second byte is required to determine if other sequences will
-     * be an overlong. */
+     */
  
      const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
      const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
  
      const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
      const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
@@ -596,7 +711,7 @@ S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
  #       else
  
      if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
  #       else
  
      if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
-        return TRUE;
+        return 1;
      }
  
  #           define F0_ABOVE_OVERLONG 0x90
      }
  
  #           define F0_ABOVE_OVERLONG 0x90
@@ -612,27 +727,178 @@ S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
          || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
          || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
      {
          || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
          || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
      {
-        return TRUE;
+        return 1;
      }
  
      /* Check for the FF overlong */
      return isFF_OVERLONG(s, len);
  }
  
      }
  
      /* Check for the FF overlong */
      return isFF_OVERLONG(s, len);
  }
  
-PERL_STATIC_INLINE bool
+PERL_STATIC_INLINE int
  S_isFF_OVERLONG(const U8 * const s, const STRLEN len)
  {
  S_isFF_OVERLONG(const U8 * const s, const STRLEN len)
  {
+    /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
+     * 'e' - 1 is an overlong beginning with \xFF.  It returns 1 if it is; 0 if
+     * it isn't, and -1 if there isn't enough information to tell.  This last
+     * return value can happen if the sequence is incomplete, missing some
+     * trailing bytes that would form a complete character.  If there are
+     * enough bytes to make a definitive decision, this function does so. */
+
      PERL_ARGS_ASSERT_ISFF_OVERLONG;
  
      PERL_ARGS_ASSERT_ISFF_OVERLONG;
  
-    /* Check for the FF overlong.  This happens only if all these bytes match;
-     * what comes after them doesn't matter.  See tables in utf8.h,
+    /* To be an FF overlong, all the available bytes must match */
+    if (LIKELY(memNE(s, FF_OVERLONG_PREFIX,
+                     MIN(len, sizeof(FF_OVERLONG_PREFIX) - 1))))
+    {
+        return 0;
+    }
+
+    /* To be an FF overlong sequence, all the bytes in FF_OVERLONG_PREFIX must
+     * be there; what comes after them doesn't matter.  See tables in utf8.h,
       * utfebcdic.h. */
       * utfebcdic.h. */
+    if (len >= sizeof(FF_OVERLONG_PREFIX) - 1) {
+        return 1;
+    }
  
  
-    return    len >= sizeof(FF_OVERLONG_PREFIX) - 1
-           && UNLIKELY(memEQ(s, FF_OVERLONG_PREFIX,
-                                            sizeof(FF_OVERLONG_PREFIX) - 1));
+    /* The missing bytes could cause the result to go one way or the other, so
+     * the result is indeterminate */
+    return -1;
  }
  
  }
  
+#if defined(UV_IS_QUAD) /* These assume IV_MAX is 2**63-1 */
+#  ifdef EBCDIC     /* Actually is I8 */
+#   define HIGHEST_REPRESENTABLE_UTF8                                       \
+                "\xFF\xA7\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
+#  else
+#   define HIGHEST_REPRESENTABLE_UTF8                                       \
+                "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
+#  endif
+#endif
+
+PERL_STATIC_INLINE int
+S_does_utf8_overflow(const U8 * const s,
+                     const U8 * e,
+                     const bool consider_overlongs)
+{
+    /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
+     * 'e' - 1 would overflow an IV on this platform; that is if it represents
+     * a code point larger than the highest representable code point.  It
+     * returns 1 if it does overflow; 0 if it doesn't, and -1 if there isn't
+     * enough information to tell.  This last return value can happen if the
+     * sequence is incomplete, missing some trailing bytes that would form a
+     * complete character.  If there are enough bytes to make a definitive
+     * decision, this function does so.
+     *
+     * If 'consider_overlongs' is TRUE, the function checks for the possibility
+     * that the sequence is an overlong that doesn't overflow.  Otherwise, it
+     * assumes the sequence is not an overlong.  This can give different
+     * results only on ASCII 32-bit platforms.
+     *
+     * (For ASCII platforms, we could use memcmp() because we don't have to
+     * convert each byte to I8, but it's very rare input indeed that would
+     * approach overflow, so the loop below will likely only get executed once.)
+     *
+     * 'e' - 1 must not be beyond a full character. */
+
+
+    PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
+    assert(s <= e && s + UTF8SKIP(s) >= e);
+
+#if ! defined(UV_IS_QUAD)
+
+    return is_utf8_cp_above_31_bits(s, e, consider_overlongs);
+
+#else
+
+    PERL_UNUSED_ARG(consider_overlongs);
+
+    {
+        const STRLEN len = e - s;
+        const U8 *x;
+        const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
+
+        for (x = s; x < e; x++, y++) {
+
+            if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) == *y)) {
+                continue;
+            }
+
+            /* If this byte is larger than the corresponding highest UTF-8
+             * byte, the sequence overflow; otherwise the byte is less than,
+             * and so the sequence doesn't overflow */
+            return NATIVE_UTF8_TO_I8(*x) > *y;
+
+        }
+
+        /* Got to the end and all bytes are the same.  If the input is a whole
+         * character, it doesn't overflow.  And if it is a partial character,
+         * there's not enough information to tell */
+        if (len < sizeof(HIGHEST_REPRESENTABLE_UTF8) - 1) {
+            return -1;
+        }
+
+        return 0;
+    }
+
+#endif
+
+}
+
+#if 0
+
+/* This is the portions of the above function that deal with UV_MAX instead of
+ * IV_MAX.  They are left here in case we want to combine them so that internal
+ * uses can have larger code points.  The only logic difference is that the
+ * 32-bit EBCDIC platform is treate like the 64-bit, and the 32-bit ASCII has
+ * different logic.
+ */
+
+/* Anything larger than this will overflow the word if it were converted into a UV */
+#if defined(UV_IS_QUAD)
+#  ifdef EBCDIC     /* Actually is I8 */
+#   define HIGHEST_REPRESENTABLE_UTF8                                       \
+                "\xFF\xAF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
+#  else
+#   define HIGHEST_REPRESENTABLE_UTF8                                       \
+                "\xFF\x80\x8F\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
+#  endif
+#else   /* 32-bit */
+#  ifdef EBCDIC
+#   define HIGHEST_REPRESENTABLE_UTF8                                       \
+                "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3\xBF\xBF\xBF\xBF\xBF\xBF"
+#  else
+#   define HIGHEST_REPRESENTABLE_UTF8  "\xFE\x83\xBF\xBF\xBF\xBF\xBF"
+#  endif
+#endif
+
+#if ! defined(UV_IS_QUAD) && ! defined(EBCDIC)
+
+    /* On 32 bit ASCII machines, many overlongs that start with FF don't
+     * overflow */
+    if (consider_overlongs && isFF_OVERLONG(s, len) > 0) {
+
+        /* To be such an overlong, the first bytes of 's' must match
+         * FF_OVERLONG_PREFIX, which is "\xff\x80\x80\x80\x80\x80\x80".  If we
+         * don't have any additional bytes available, the sequence, when
+         * completed might or might not fit in 32 bits.  But if we have that
+         * next byte, we can tell for sure.  If it is <= 0x83, then it does
+         * fit. */
+        if (len <= sizeof(FF_OVERLONG_PREFIX) - 1) {
+            return -1;
+        }
+
+        return s[sizeof(FF_OVERLONG_PREFIX) - 1] > 0x83;
+    }
+
+/* Starting with the #else, the rest of the function is identical except
+ *      1.  we need to move the 'len' declaration to be global to the function
+ *      2.  the endif move to just after the UNUSED_ARG.
+ * An empty endif is given just below to satisfy the preprocessor
+ */
+#endif
+
+#endif
+
  #undef F0_ABOVE_OVERLONG
  #undef F8_ABOVE_OVERLONG
  #undef FC_ABOVE_OVERLONG
  #undef F0_ABOVE_OVERLONG
  #undef F8_ABOVE_OVERLONG
  #undef FC_ABOVE_OVERLONG
@@ -680,7 +946,7 @@ Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
      PERL_ARGS_ASSERT__IS_UTF8_CHAR_HELPER;
  
      assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
      PERL_ARGS_ASSERT__IS_UTF8_CHAR_HELPER;
  
      assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
-                          |UTF8_DISALLOW_ABOVE_31_BIT)));
+                          |UTF8_DISALLOW_PERL_EXTENDED)));
      assert(! UTF8_IS_INVARIANT(*s));
  
      /* A variant char must begin with a start byte */
      assert(! UTF8_IS_INVARIANT(*s));
  
      /* A variant char must begin with a start byte */
@@ -730,10 +996,12 @@ Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
  #  define IS_UTF8_2_BYTE_SURROGATE(s0, s1)       ((s0) == 0xF1              \
                                                         /* B6 and B7 */      \
                                                && ((s1) & 0xFE ) == 0xB6)
  #  define IS_UTF8_2_BYTE_SURROGATE(s0, s1)       ((s0) == 0xF1              \
                                                         /* B6 and B7 */      \
                                                && ((s1) & 0xFE ) == 0xB6)
+#  define isUTF8_PERL_EXTENDED(s)   (*s == I8_TO_NATIVE_UTF8(0xFF))
  #else
  #  define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER  0xF5
  #  define IS_UTF8_2_BYTE_SUPER(s0, s1)           ((s0) == 0xF4 && (s1) >= 0x90)
  #  define IS_UTF8_2_BYTE_SURROGATE(s0, s1)       ((s0) == 0xED && (s1) >= 0xA0)
  #else
  #  define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER  0xF5
  #  define IS_UTF8_2_BYTE_SUPER(s0, s1)           ((s0) == 0xF4 && (s1) >= 0x90)
  #  define IS_UTF8_2_BYTE_SURROGATE(s0, s1)       ((s0) == 0xED && (s1) >= 0xA0)
+#  define isUTF8_PERL_EXTENDED(s)   (*s >= 0xFE)
  #endif
  
          if (  (flags & UTF8_DISALLOW_SUPER)
  #endif
  
          if (  (flags & UTF8_DISALLOW_SUPER)
@@ -742,10 +1010,10 @@ Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
              return 0;           /* Above Unicode */
          }
  
              return 0;           /* Above Unicode */
          }
  
-        if (   (flags & UTF8_DISALLOW_ABOVE_31_BIT)
-            &&  UNLIKELY(is_utf8_cp_above_31_bits(s, e)))
+        if (   (flags & UTF8_DISALLOW_PERL_EXTENDED)
+            &&  UNLIKELY(isUTF8_PERL_EXTENDED(s)))
          {
          {
-            return 0;           /* Above 31 bits */
+            return 0;
          }
  
          if (len > 1) {
          }
  
          if (len > 1) {
@@ -780,13 +1048,16 @@ Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
  
      /* Here is syntactically valid.  Next, make sure this isn't the start of an
       * overlong. */
  
      /* Here is syntactically valid.  Next, make sure this isn't the start of an
       * overlong. */
-    if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len)) {
+    if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len) > 0) {
          return 0;
      }
  
      /* And finally, that the code point represented fits in a word on this
       * platform */
          return 0;
      }
  
      /* And finally, that the code point represented fits in a word on this
       * platform */
-    if (does_utf8_overflow(s, e)) {
+    if (0 < does_utf8_overflow(s, e,
+                               0 /* Don't consider overlongs */
+                              ))
+    {
          return 0;
      }
  
          return 0;
      }
  
@@ -794,10 +1065,10 @@ Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
  }
  
  char *
  }
  
  char *
-Perl__byte_dump_string(pTHX_ const U8 * s, const STRLEN len, const bool format)
+Perl__byte_dump_string(pTHX_ const U8 * const start, const STRLEN len, const bool format)
  {
      /* Returns a mortalized C string that is a displayable copy of the 'len'
  {
      /* Returns a mortalized C string that is a displayable copy of the 'len'
-     * bytes starting at 's'.  'format' gives how to display each byte.
+     * bytes starting at 'start'.  'format' gives how to display each byte.
       * Currently, there are only two formats, so it is currently a bool:
       *      0   \xab
       *      1    ab         (that is a space between two hex digit bytes)
       * Currently, there are only two formats, so it is currently a bool:
       *      0   \xab
       *      1    ab         (that is a space between two hex digit bytes)
@@ -805,7 +1076,8 @@ Perl__byte_dump_string(pTHX_ const U8 * s, const STRLEN len, const bool format)
  
      const STRLEN output_len = 4 * len + 1;  /* 4 bytes per each input, plus a
                                                 trailing NUL */
  
      const STRLEN output_len = 4 * len + 1;  /* 4 bytes per each input, plus a
                                                 trailing NUL */
-    const U8 * const e = s + len;
+    const U8 * s = start;
+    const U8 * const e = start + len;
      char * output;
      char * d;
  
      char * output;
      char * d;
  
@@ -815,12 +1087,14 @@ Perl__byte_dump_string(pTHX_ const U8 * s, const STRLEN len, const bool format)
      SAVEFREEPV(output);
  
      d = output;
      SAVEFREEPV(output);
  
      d = output;
-    for (; s < e; s++) {
+    for (s = start; s < e; s++) {
          const unsigned high_nibble = (*s & 0xF0) >> 4;
          const unsigned low_nibble =  (*s & 0x0F);
  
          if (format) {
          const unsigned high_nibble = (*s & 0xF0) >> 4;
          const unsigned low_nibble =  (*s & 0x0F);
  
          if (format) {
-            *d++ = ' ';
+            if (s > start) {
+                *d++ = ' ';
+            }
          }
          else {
              *d++ = '\\';
          }
          else {
              *d++ = '\\';
@@ -954,35 +1228,29 @@ a malformation and raise a warning, specify both the WARN and DISALLOW flags.
  (But note that warnings are not raised if lexically disabled nor if
  C<UTF8_CHECK_ONLY> is also specified.)
  
  (But note that warnings are not raised if lexically disabled nor if
  C<UTF8_CHECK_ONLY> is also specified.)
  
-It is now deprecated to have very high code points (above C<IV_MAX> on the
-platforms) and this function will raise a deprecation warning for these (unless
-such warnings are turned off).  This value is typically 0x7FFF_FFFF (2**31 -1)
-in a 32-bit word.
-
-Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
-so using them is more problematic than other above-Unicode code points.  Perl
-invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
-likely that non-Perl languages will not be able to read files that contain
-these; nor would Perl understand files
-written by something that uses a different extension.  For these reasons, there
-is a separate set of flags that can warn and/or disallow these extremely high
-code points, even if other above-Unicode ones are accepted.  These are the
-C<UTF8_WARN_ABOVE_31_BIT> and C<UTF8_DISALLOW_ABOVE_31_BIT> flags.  These
-are entirely independent from the deprecation warning for code points above
-C<IV_MAX>.  On 32-bit machines, it will eventually be forbidden to have any
-code point that needs more than 31 bits to represent.  When that happens,
-effectively the C<UTF8_DISALLOW_ABOVE_31_BIT> flag will always be set on
-32-bit machines.  (Of course C<UTF8_DISALLOW_SUPER> will treat all
-above-Unicode code points, including these, as malformations; and
-C<UTF8_WARN_SUPER> warns on these.)
+Extremely high code points were never specified in any standard, and require an
+extension to UTF-8 to express, which Perl does.  It is likely that programs
+written in something other than Perl would not be able to read files that
+contain these; nor would Perl understand files written by something that uses a
+different extension.  For these reasons, there is a separate set of flags that
+can warn and/or disallow these extremely high code points, even if other
+above-Unicode ones are accepted.  They are the C<UTF8_WARN_PERL_EXTENDED> and
+C<UTF8_DISALLOW_PERL_EXTENDED> flags.  For more information see
+L</C<UTF8_GOT_PERL_EXTENDED>>.  Of course C<UTF8_DISALLOW_SUPER> will treat all
+above-Unicode code points, including these, as malformations.
+(Note that the Unicode standard considers anything above 0x10FFFF to be
+illegal, but there are standards predating it that allow up to 0x7FFF_FFFF
+(2**31 -1))
+
+A somewhat misleadingly named synonym for C<UTF8_WARN_PERL_EXTENDED> is
+retained for backward compatibility: C<UTF8_WARN_ABOVE_31_BIT>.  Similarly,
+C<UTF8_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
+C<UTF8_DISALLOW_PERL_EXTENDED>.  The names are misleading because these flags
+can apply to code points that actually do fit in 31 bits.  This happens on
+EBCDIC platforms, and sometimes when the L<overlong
+malformation|/C<UTF8_GOT_LONG>> is also present.  The new names accurately
+describe the situation in all cases.
  
  
-On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
-extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
-than on ASCII.  Prior to that, code points 2**31 and higher were simply
-unrepresentable, and a different, incompatible method was used to represent
-code points between 2**30 and 2**31 - 1.  The flags C<UTF8_WARN_ABOVE_31_BIT>
-and C<UTF8_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
-platforms, warning and disallowing 2**31 and higher.
  
  All other code points corresponding to Unicode characters, including private
  use and those yet to be assigned, are never considered malformed and never
  
  All other code points corresponding to Unicode characters, including private
  use and those yet to be assigned, are never considered malformed and never
@@ -1004,6 +1272,102 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s,
      return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL);
  }
  
      return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL);
  }
  
+/* The tables below come from http://bjoern.hoehrmann.de/utf-8/decoder/dfa/,
+ * which requires this copyright notice */
+
+/* Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#if 0
+static U8 utf8d_C9[] = {
+  /* The first part of the table maps bytes to character classes that
+   * to reduce the size of the transition table and create bitmasks. */
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-1F*/
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-3F*/
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-5F*/
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-7F*/
+   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, /*-9F*/
+   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /*-BF*/
+   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /*-DF*/
+  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, /*-FF*/
+
+  /* The second part is a transition table that maps a combination
+   * of a state of the automaton and a character class to a state. */
+   0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
+  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
+  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
+  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
+  12,36,12,12,12,12,12,12,12,12,12,12
+};
+
+#endif
+
+#ifndef EBCDIC
+
+/* This is a version of the above table customized for Perl that doesn't
+ * exclude surrogates and accepts start bytes up through F7 (representing
+ * 2**21 - 1). */
+static U8 dfa_tab_for_perl[] = {
+    /* The first part of the table maps bytes to character classes to reduce
+     * the size of the transition table and create bitmasks. */
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-1F*/
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-3F*/
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-5F*/
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-7F*/
+   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, /*-9F*/
+   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /*-BF*/
+   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /*-DF*/
+  10,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 11,4,4,4,4,4,4,4,8,8,8,8,8,8,8,8, /*-FF*/
+
+  /* The second part is a transition table that maps a combination
+   * of a state of the automaton and a character class to a state. */
+   0,12,24,36,96,12,12,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,/*23*/
+  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,/*47*/
+  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,/*71*/
+  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,/*95*/
+  12,36,12,12,12,12,12,36,12,36,12,12 /* 96- 107 */
+
+ /* The customization was to repurpose the surrogates type '4' to instead be
+  * for start bytes F1-F7.  Types 5 and 6 are now unused, and their entries in
+  * the transition part of the table are set to 12, so are illegal.
+  *
+  * To do higher code points would require expansion and some rearrangement of
+  * the table.  The type '1' entries for continuation bytes 80-8f would have to
+  * be split into several types, because they aren't treated uniformly for
+  * higher start bytes, since overlongs for F8 are 80-87; FC: 80-83; and FE:
+  * 80-81.  We start needing to worry about overflow if FE is included.
+  * Ignoring, FE and FF, we could use type 5 for F9-FB, and 6 for FD (remember
+  * from the web site that these are used to right shift).  FE would
+  * necessarily be type 7; and FF, type 8.  And new states would have to be
+  * created for F8 and FC (and FE and FF if used), so quite a bit of work would
+  * be involved.
+  *
+  * XXX Better would be to customize the table so that the noncharacters are
+  * excluded.  This again is non trivial, but doing so would simplify the code
+  * that uses this, and might make it small enough to make it inlinable */
+};
+
+#endif
+
  /*
  
  =for apidoc utf8n_to_uvchr_error
  /*
  
  =for apidoc utf8n_to_uvchr_error
@@ -1012,7 +1376,8 @@ THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
  Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
  
  This function is for code that needs to know what the precise malformation(s)
  Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
  
  This function is for code that needs to know what the precise malformation(s)
-are when an error is found.
+are when an error is found.  If you also need to know the generated warning
+messages, use L</utf8n_to_uvchr_msgs>() instead.
  
  It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after
  all the others, C<errors>.  If this parameter is 0, this function behaves
  
  It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after
  all the others, C<errors>.  If this parameter is 0, this function behaves
@@ -1026,12 +1391,36 @@ exceptions are noted:
  
  =over 4
  
  
  =over 4
  
-=item C<UTF8_GOT_ABOVE_31_BIT>
+=item C<UTF8_GOT_PERL_EXTENDED>
  
  
-The code point represented by the input UTF-8 sequence occupies more than 31
-bits.
-This bit is set only if the input C<flags> parameter contains either the
-C<UTF8_DISALLOW_ABOVE_31_BIT> or the C<UTF8_WARN_ABOVE_31_BIT> flags.
+The input sequence is not standard UTF-8, but a Perl extension.  This bit is
+set only if the input C<flags> parameter contains either the
+C<UTF8_DISALLOW_PERL_EXTENDED> or the C<UTF8_WARN_PERL_EXTENDED> flags.
+
+Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
+and so some extension must be used to express them.  Perl uses a natural
+extension to UTF-8 to represent the ones up to 2**36-1, and invented a further
+extension to represent even higher ones, so that any code point that fits in a
+64-bit word can be represented.  Text using these extensions is not likely to
+be portable to non-Perl code.  We lump both of these extensions together and
+refer to them as Perl extended UTF-8.  There exist other extensions that people
+have invented, incompatible with Perl's.
+
+On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
+extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
+than on ASCII.  Prior to that, code points 2**31 and higher were simply
+unrepresentable, and a different, incompatible method was used to represent
+code points between 2**30 and 2**31 - 1.
+
+On both platforms, ASCII and EBCDIC, C<UTF8_GOT_PERL_EXTENDED> is set if
+Perl extended UTF-8 is used.
+
+In earlier Perls, this bit was named C<UTF8_GOT_ABOVE_31_BIT>, which you still
+may use for backward compatibility.  That name is misleading, as this flag may
+be set when the code point actually does fit in 31 bits.  This happens on
+EBCDIC platforms, and sometimes when the L<overlong
+malformation|/C<UTF8_GOT_LONG>> is also present.  The new name accurately
+describes the situation in all cases.
  
  =item C<UTF8_GOT_CONTINUATION>
  
  
  =item C<UTF8_GOT_CONTINUATION>
  
@@ -1065,7 +1454,7 @@ in a position where only a continuation type one should be.
  =item C<UTF8_GOT_OVERFLOW>
  
  The input sequence was malformed in that it is for a code point that is not
  =item C<UTF8_GOT_OVERFLOW>
  
  The input sequence was malformed in that it is for a code point that is not
-representable in the number of bits available in a UV on the current platform.
+representable in the number of bits available in an IV on the current platform.
  
  =item C<UTF8_GOT_SHORT>
  
  
  =item C<UTF8_GOT_SHORT>
  
@@ -1093,21 +1482,87 @@ To do your own error handling, call this function with the C<UTF8_CHECK_ONLY>
  flag to suppress any warnings, and then examine the C<*errors> return.
  
  =cut
  flag to suppress any warnings, and then examine the C<*errors> return.
  
  =cut
+
+Also implemented as a macro in utf8.h
  */
  
  UV
  Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
  */
  
  UV
  Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
-                                STRLEN curlen,
-                                STRLEN *retlen,
-                                const U32 flags,
-                                U32 * errors)
+                          STRLEN curlen,
+                          STRLEN *retlen,
+                          const U32 flags,
+                          U32 * errors)
+{
+    PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
+
+    return utf8n_to_uvchr_msgs(s, curlen, retlen, flags, errors, NULL);
+}
+
+/*
+
+=for apidoc utf8n_to_uvchr_msgs
+
+THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
+Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
+
+This function is for code that needs to know what the precise malformation(s)
+are when an error is found, and wants the corresponding warning and/or error
+messages to be returned to the caller rather than be displayed.  All messages
+that would have been displayed if all lexcial warnings are enabled will be
+returned.
+
+It is just like C<L</utf8n_to_uvchr_error>> but it takes an extra parameter
+placed after all the others, C<msgs>.  If this parameter is 0, this function
+behaves identically to C<L</utf8n_to_uvchr_error>>.  Otherwise, C<msgs> should
+be a pointer to an C<AV *> variable, in which this function creates a new AV to
+contain any appropriate messages.  The elements of the array are ordered so
+that the first message that would have been displayed is in the 0th element,
+and so on.  Each element is a hash with three key-value pairs, as follows:
+
+=over 4
+
+=item C<text>
+
+The text of the message as a C<SVpv>.
+
+=item C<warn_categories>
+
+The warning category (or categories) packed into a C<SVuv>.
+
+=item C<flag>
+
+A single flag bit associated with this message, in a C<SVuv>.
+The bit corresponds to some bit in the C<*errors> return value,
+such as C<UTF8_GOT_LONG>.
+
+=back
+
+It's important to note that specifying this parameter as non-null will cause
+any warnings this function would otherwise generate to be suppressed, and
+instead be placed in C<*msgs>.  The caller can check the lexical warnings state
+(or not) when choosing what to do with the returned messages.
+
+If the flag C<UTF8_CHECK_ONLY> is passed, no warnings are generated, and hence
+no AV is created.
+
+The caller, of course, is responsible for freeing any returned AV.
+
+=cut
+*/
+
+UV
+Perl_utf8n_to_uvchr_msgs(pTHX_ const U8 *s,
+                               STRLEN curlen,
+                               STRLEN *retlen,
+                               const U32 flags,
+                               U32 * errors,
+                               AV ** msgs)
  {
      const U8 * const s0 = s;
  {
      const U8 * const s0 = s;
-    U8 * send = NULL;           /* (initialized to silence compilers' wrong
-                                   warning) */
+    const U8 * send = s0 + curlen;
      U32 possible_problems = 0;  /* A bit is set here for each potential problem
                                     found as we go along */
      U32 possible_problems = 0;  /* A bit is set here for each potential problem
                                     found as we go along */
-    UV uv = *s;
+    UV uv = (UV) -1;
      STRLEN expectlen   = 0;     /* How long should this sequence be?
                                     (initialized to silence compilers' wrong
                                     warning) */
      STRLEN expectlen   = 0;     /* How long should this sequence be?
                                     (initialized to silence compilers' wrong
                                     warning) */
@@ -1119,13 +1574,13 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
       * too short one.  Otherwise the first two are set to 's0' and 'send', and
       * the third not used at all */
      U8 * adjusted_s0 = (U8 *) s0;
       * too short one.  Otherwise the first two are set to 's0' and 'send', and
       * the third not used at all */
      U8 * adjusted_s0 = (U8 *) s0;
-    U8 * adjusted_send = NULL;  /* (Initialized to silence compilers' wrong
-                                   warning) */
      U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
                                              routine; see [perl #130921] */
      UV uv_so_far = 0;   /* (Initialized to silence compilers' wrong warning) */
  
      U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
                                              routine; see [perl #130921] */
      UV uv_so_far = 0;   /* (Initialized to silence compilers' wrong warning) */
  
-    PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
+    UV state = 0;
+
+    PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
  
      if (errors) {
          *errors = 0;
  
      if (errors) {
          *errors = 0;
@@ -1179,10 +1634,54 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
      }
  
      /* An invariant is trivially well-formed */
      }
  
      /* An invariant is trivially well-formed */
-    if (UTF8_IS_INVARIANT(uv)) {
-       return uv;
+    if (UTF8_IS_INVARIANT(*s0)) {
+       return *s0;
      }
  
      }
  
+#ifndef EBCDIC
+
+    /* Measurements show that this dfa is somewhat faster than the regular code
+     * below, so use it first, dropping down for the non-normal cases. */
+
+#  define PERL_UTF8_DECODE_REJECT 12
+
+    while (s < send && LIKELY(state != PERL_UTF8_DECODE_REJECT)) {
+        UV type = dfa_tab_for_perl[*s];
+
+        if (state != 0) {
+            uv = (*s & 0x3fu) | (uv << UTF_ACCUMULATION_SHIFT);
+            state = dfa_tab_for_perl[256 + state + type];
+        }
+        else {
+            uv = (0xff >> type) & (*s);
+            state = dfa_tab_for_perl[256 + type];
+        }
+
+        if (state == 0) {
+
+            /* If this could be a code point that the flags don't allow (the first
+            * surrogate is the first such possible one), delve further, but we already
+            * have calculated 'uv' */
+            if (  (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+                           |UTF8_WARN_ILLEGAL_INTERCHANGE))
+                && uv >= UNICODE_SURROGATE_FIRST)
+            {
+                curlen = s + 1 - s0;
+                goto got_uv;
+            }
+
+            return uv;
+        }
+
+        s++;
+    }
+
+    /* Here, is some sort of failure.  Use the full mechanism */
+
+    uv = *s0;
+
+#endif
+
      /* A continuation character can't start a valid sequence */
      if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
         possible_problems |= UTF8_GOT_CONTINUATION;
      /* A continuation character can't start a valid sequence */
      if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
         possible_problems |= UTF8_GOT_CONTINUATION;
@@ -1203,16 +1702,13 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
  
      /* Setup the loop end point, making sure to not look past the end of the
       * input string, and flag it as too short if the size isn't big enough. */
  
      /* Setup the loop end point, making sure to not look past the end of the
       * input string, and flag it as too short if the size isn't big enough. */
-    send = (U8*) s0;
      if (UNLIKELY(curlen < expectlen)) {
          possible_problems |= UTF8_GOT_SHORT;
          avail_len = curlen;
      if (UNLIKELY(curlen < expectlen)) {
          possible_problems |= UTF8_GOT_SHORT;
          avail_len = curlen;
-        send += curlen;
      }
      else {
      }
      else {
-        send += expectlen;
+        send = (U8*) s0 + expectlen;
      }
      }
-    adjusted_send = send;
  
      /* Now, loop through the remaining bytes in the character's sequence,
       * accumulating each into the working value as we go. */
  
      /* Now, loop through the remaining bytes in the character's sequence,
       * accumulating each into the working value as we go. */
@@ -1249,8 +1745,12 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
          uv = UNICODE_REPLACEMENT;
      }
  
          uv = UNICODE_REPLACEMENT;
      }
  
-    /* Check for overflow. */
-    if (UNLIKELY(does_utf8_overflow(s0, send))) {
+    /* Check for overflow.  The algorithm requires us to not look past the end
+     * of the current character, even if partial, so the upper limit is 's' */
+    if (UNLIKELY(0 < does_utf8_overflow(s0, s,
+                                         1 /* Do consider overlongs */
+                                        )))
+    {
          possible_problems |= UTF8_GOT_OVERFLOW;
          uv = UNICODE_REPLACEMENT;
      }
          possible_problems |= UTF8_GOT_OVERFLOW;
          uv = UNICODE_REPLACEMENT;
      }
@@ -1264,8 +1764,8 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
          || (       UNLIKELY(possible_problems)
              && (   UNLIKELY(! UTF8_IS_START(*s0))
                  || (   curlen > 1
          || (       UNLIKELY(possible_problems)
              && (   UNLIKELY(! UTF8_IS_START(*s0))
                  || (   curlen > 1
-                    && UNLIKELY(is_utf8_overlong_given_start_byte_ok(s0,
-                                                                send - s0))))))
+                    && UNLIKELY(0 < is_utf8_overlong_given_start_byte_ok(s0,
+                                                                s - s0))))))
      {
          possible_problems |= UTF8_GOT_LONG;
  
      {
          possible_problems |= UTF8_GOT_LONG;
  
@@ -1297,10 +1797,12 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
              }
  
              adjusted_s0 = temp_char_buf;
              }
  
              adjusted_s0 = temp_char_buf;
-            adjusted_send = uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
+            (void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
          }
      }
  
          }
      }
  
+  got_uv:
+
      /* Here, we have found all the possible problems, except for when the input
       * is for a problematic code point not allowed by the input parameters. */
  
      /* Here, we have found all the possible problems, except for when the input
       * is for a problematic code point not allowed by the input parameters. */
  
@@ -1316,22 +1818,16 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                             * and we deal with those in the overflow handling
                             * code */
                  && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))
                             * and we deal with those in the overflow handling
                             * code */
                  && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))
-                && isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)))
+                && (   isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)
+                    || UNLIKELY(isUTF8_PERL_EXTENDED(s0)))))
         && ((flags & ( UTF8_DISALLOW_NONCHAR
                        |UTF8_DISALLOW_SURROGATE
                        |UTF8_DISALLOW_SUPER
         && ((flags & ( UTF8_DISALLOW_NONCHAR
                        |UTF8_DISALLOW_SURROGATE
                        |UTF8_DISALLOW_SUPER
-                      |UTF8_DISALLOW_ABOVE_31_BIT
+                      |UTF8_DISALLOW_PERL_EXTENDED
                       |UTF8_WARN_NONCHAR
                        |UTF8_WARN_SURROGATE
                        |UTF8_WARN_SUPER
                       |UTF8_WARN_NONCHAR
                        |UTF8_WARN_SURROGATE
                        |UTF8_WARN_SUPER
-                      |UTF8_WARN_ABOVE_31_BIT))
-                   /* In case of a malformation, 'uv' is not valid, and has
-                    * been changed to something in the Unicode range.
-                    * Currently we don't output a deprecation message if there
-                    * is already a malformation, so we don't have to special
-                    * case the test immediately below */
-            || (   UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
-                && ckWARN_d(WARN_DEPRECATED))))
+                      |UTF8_WARN_PERL_EXTENDED))))
      {
          /* If there were no malformations, or the only malformation is an
           * overlong, 'uv' is valid */
      {
          /* If there were no malformations, or the only malformation is an
           * overlong, 'uv' is valid */
@@ -1396,32 +1892,32 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
       *                      end, based on how many bytes the start byte tells
       *                      us should be in it, but no further than s0 +
       *                      avail_len
       *                      end, based on how many bytes the start byte tells
       *                      us should be in it, but no further than s0 +
       *                      avail_len
-     * adjusted_s0          normally is the same as s0, but in case of an
-     *                      overlong for which the UTF-8 matters below, it is
-     *                      the first byte of the shortest form representation
-     *                      of the input.
-     * adjusted_send        normally is the same as 'send', but if adjusted_s0
-     *                      is set to something other than s0, this points one
-     *                      beyond its end
       */
  
      if (UNLIKELY(possible_problems)) {
          bool disallowed = FALSE;
          const U32 orig_problems = possible_problems;
  
       */
  
      if (UNLIKELY(possible_problems)) {
          bool disallowed = FALSE;
          const U32 orig_problems = possible_problems;
  
+        if (msgs) {
+            *msgs = NULL;
+        }
+
          while (possible_problems) { /* Handle each possible problem */
              UV pack_warn = 0;
              char * message = NULL;
          while (possible_problems) { /* Handle each possible problem */
              UV pack_warn = 0;
              char * message = NULL;
+            U32 this_flag_bit = 0;
  
              /* Each 'if' clause handles one problem.  They are ordered so that
               * the first ones' messages will be displayed before the later
  
              /* Each 'if' clause handles one problem.  They are ordered so that
               * the first ones' messages will be displayed before the later
-             * ones; this is kinda in decreasing severity order */
+             * ones; this is kinda in decreasing severity order.  But the
+             * overlong must come last, as it changes 'uv' looked at by the
+             * others */
              if (possible_problems & UTF8_GOT_OVERFLOW) {
  
                  /* Overflow means also got a super and are using Perl's
                   * extended UTF-8, but we handle all three cases here */
                  possible_problems
              if (possible_problems & UTF8_GOT_OVERFLOW) {
  
                  /* Overflow means also got a super and are using Perl's
                   * extended UTF-8, but we handle all three cases here */
                  possible_problems
-                  &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_ABOVE_31_BIT);
+                  &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_PERL_EXTENDED);
                  *errors |= UTF8_GOT_OVERFLOW;
  
                  /* But the API says we flag all errors found */
                  *errors |= UTF8_GOT_OVERFLOW;
  
                  /* But the API says we flag all errors found */
@@ -1429,24 +1925,22 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                      *errors |= UTF8_GOT_SUPER;
                  }
                  if (flags
                      *errors |= UTF8_GOT_SUPER;
                  }
                  if (flags
-                        & (UTF8_WARN_ABOVE_31_BIT|UTF8_DISALLOW_ABOVE_31_BIT))
+                        & (UTF8_WARN_PERL_EXTENDED|UTF8_DISALLOW_PERL_EXTENDED))
                  {
                  {
-                    *errors |= UTF8_GOT_ABOVE_31_BIT;
+                    *errors |= UTF8_GOT_PERL_EXTENDED;
                  }
  
                  /* Disallow if any of the three categories say to */
                  if ( ! (flags &   UTF8_ALLOW_OVERFLOW)
                      || (flags & ( UTF8_DISALLOW_SUPER
                  }
  
                  /* Disallow if any of the three categories say to */
                  if ( ! (flags &   UTF8_ALLOW_OVERFLOW)
                      || (flags & ( UTF8_DISALLOW_SUPER
-                                 |UTF8_DISALLOW_ABOVE_31_BIT)))
+                                 |UTF8_DISALLOW_PERL_EXTENDED)))
                  {
                      disallowed = TRUE;
                  }
  
                  {
                      disallowed = TRUE;
                  }
  
-                /* Likewise, warn if any say to, plus if deprecation warnings
-                 * are on, because this code point is above IV_MAX */
-                if (      ckWARN_d(WARN_DEPRECATED)
-                    || ! (flags & UTF8_ALLOW_OVERFLOW)
-                    ||   (flags & (UTF8_WARN_SUPER|UTF8_WARN_ABOVE_31_BIT)))
+                /* Likewise, warn if any say to */
+                if (  ! (flags & UTF8_ALLOW_OVERFLOW)
+                    ||  (flags & (UTF8_WARN_SUPER|UTF8_WARN_PERL_EXTENDED)))
                  {
  
                      /* The warnings code explicitly says it doesn't handle the
                  {
  
                      /* The warnings code explicitly says it doesn't handle the
@@ -1456,16 +1950,17 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                       * necessarily do so in the future.  We output (only) the
                       * most dire warning */
                      if (! (flags & UTF8_CHECK_ONLY)) {
                       * necessarily do so in the future.  We output (only) the
                       * most dire warning */
                      if (! (flags & UTF8_CHECK_ONLY)) {
-                        if (ckWARN_d(WARN_UTF8)) {
+                        if (msgs || ckWARN_d(WARN_UTF8)) {
                              pack_warn = packWARN(WARN_UTF8);
                          }
                              pack_warn = packWARN(WARN_UTF8);
                          }
-                        else if (ckWARN_d(WARN_NON_UNICODE)) {
+                        else if (msgs || ckWARN_d(WARN_NON_UNICODE)) {
                              pack_warn = packWARN(WARN_NON_UNICODE);
                          }
                          if (pack_warn) {
                              message = Perl_form(aTHX_ "%s: %s (overflows)",
                                              malformed_text,
                                              _byte_dump_string(s0, curlen, 0));
                              pack_warn = packWARN(WARN_NON_UNICODE);
                          }
                          if (pack_warn) {
                              message = Perl_form(aTHX_ "%s: %s (overflows)",
                                              malformed_text,
                                              _byte_dump_string(s0, curlen, 0));
+                            this_flag_bit = UTF8_GOT_OVERFLOW;
                          }
                      }
                  }
                          }
                      }
                  }
@@ -1482,10 +1977,13 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                      assert(0);
  
                      disallowed = TRUE;
                      assert(0);
  
                      disallowed = TRUE;
-                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+                    if (  (msgs
+                        || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
+                    {
                          pack_warn = packWARN(WARN_UTF8);
                          message = Perl_form(aTHX_ "%s (empty string)",
                                                     malformed_text);
                          pack_warn = packWARN(WARN_UTF8);
                          message = Perl_form(aTHX_ "%s (empty string)",
                                                     malformed_text);
+                        this_flag_bit = UTF8_GOT_EMPTY;
                      }
                  }
              }
                      }
                  }
              }
@@ -1495,13 +1993,16 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
  
                  if (! (flags & UTF8_ALLOW_CONTINUATION)) {
                      disallowed = TRUE;
  
                  if (! (flags & UTF8_ALLOW_CONTINUATION)) {
                      disallowed = TRUE;
-                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+                    if ((   msgs
+                         || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
+                    {
                          pack_warn = packWARN(WARN_UTF8);
                          message = Perl_form(aTHX_
                                  "%s: %s (unexpected continuation byte 0x%02x,"
                                  " with no preceding start byte)",
                                  malformed_text,
                                  _byte_dump_string(s0, 1, 0), *s0);
                          pack_warn = packWARN(WARN_UTF8);
                          message = Perl_form(aTHX_
                                  "%s: %s (unexpected continuation byte 0x%02x,"
                                  " with no preceding start byte)",
                                  malformed_text,
                                  _byte_dump_string(s0, 1, 0), *s0);
+                        this_flag_bit = UTF8_GOT_CONTINUATION;
                      }
                  }
              }
                      }
                  }
              }
@@ -1511,7 +2012,9 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
  
                  if (! (flags & UTF8_ALLOW_SHORT)) {
                      disallowed = TRUE;
  
                  if (! (flags & UTF8_ALLOW_SHORT)) {
                      disallowed = TRUE;
-                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+                    if ((   msgs
+                         || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
+                    {
                          pack_warn = packWARN(WARN_UTF8);
                          message = Perl_form(aTHX_
                               "%s: %s (too short; %d byte%s available, need %d)",
                          pack_warn = packWARN(WARN_UTF8);
                          message = Perl_form(aTHX_
                               "%s: %s (too short; %d byte%s available, need %d)",
@@ -1520,6 +2023,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                               (int)avail_len,
                               avail_len == 1 ? "" : "s",
                               (int)expectlen);
                               (int)avail_len,
                               avail_len == 1 ? "" : "s",
                               (int)expectlen);
+                        this_flag_bit = UTF8_GOT_SHORT;
                      }
                  }
  
                      }
                  }
  
@@ -1530,73 +2034,23 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
  
                  if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
                      disallowed = TRUE;
  
                  if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
                      disallowed = TRUE;
-                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+                    if ((   msgs
+                         || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
+                    {
  
                          /* If we don't know for sure that the input length is
                           * valid, avoid as much as possible reading past the
                           * end of the buffer */
                          int printlen = (flags & _UTF8_NO_CONFIDENCE_IN_CURLEN)
  
                          /* If we don't know for sure that the input length is
                           * valid, avoid as much as possible reading past the
                           * end of the buffer */
                          int printlen = (flags & _UTF8_NO_CONFIDENCE_IN_CURLEN)
-                                       ? s - s0
-                                       : send - s0;
-                        pack_warn = packWARN(WARN_UTF8);
-                        message = Perl_form(aTHX_ "%s",
-                            unexpected_non_continuation_text(s0,
-                                                            printlen,
-                                                            s - s0,
-                                                            (int) expectlen));
-                    }
-                }
-            }
-            else if (possible_problems & UTF8_GOT_LONG) {
-                possible_problems &= ~UTF8_GOT_LONG;
-                *errors |= UTF8_GOT_LONG;
-
-                if (flags & UTF8_ALLOW_LONG) {
-
-                    /* We don't allow the actual overlong value, unless the
-                     * special extra bit is also set */
-                    if (! (flags & (   UTF8_ALLOW_LONG_AND_ITS_VALUE
-                                    & ~UTF8_ALLOW_LONG)))
-                    {
-                        uv = UNICODE_REPLACEMENT;
-                    }
-                }
-                else {
-                    disallowed = TRUE;
-
-                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+                                       ? s - s0
+                                       : send - s0;
                          pack_warn = packWARN(WARN_UTF8);
                          pack_warn = packWARN(WARN_UTF8);
-
-                        /* These error types cause 'uv' to be something that
-                         * isn't what was intended, so can't use it in the
-                         * message.  The other error types either can't
-                         * generate an overlong, or else the 'uv' is valid */
-                        if (orig_problems &
-                                        (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
-                        {
-                            message = Perl_form(aTHX_
-                                    "%s: %s (any UTF-8 sequence that starts"
-                                    " with \"%s\" is overlong which can and"
-                                    " should be represented with a"
-                                    " different, shorter sequence)",
-                                    malformed_text,
-                                    _byte_dump_string(s0, send - s0, 0),
-                                    _byte_dump_string(s0, curlen, 0));
-                        }
-                        else {
-                            U8 tmpbuf[UTF8_MAXBYTES+1];
-                            const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
-                                                                        uv, 0);
-                            message = Perl_form(aTHX_
-                                "%s: %s (overlong; instead use %s to represent"
-                                " U+%0*" UVXf ")",
-                                malformed_text,
-                                _byte_dump_string(s0, curlen, 0),
-                                _byte_dump_string(tmpbuf, e - tmpbuf, 0),
-                                ((uv < 256) ? 2 : 4), /* Field width of 2 for
-                                                         small code points */
-                                uv);
-                        }
+                        message = Perl_form(aTHX_ "%s",
+                            unexpected_non_continuation_text(s0,
+                                                            printlen,
+                                                            s - s0,
+                                                            (int) expectlen));
+                        this_flag_bit = UTF8_GOT_NON_CONTINUATION;
                      }
                  }
              }
                      }
                  }
              }
@@ -1607,7 +2061,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                      *errors |= UTF8_GOT_SURROGATE;
  
                      if (   ! (flags & UTF8_CHECK_ONLY)
                      *errors |= UTF8_GOT_SURROGATE;
  
                      if (   ! (flags & UTF8_CHECK_ONLY)
-                        && ckWARN_d(WARN_SURROGATE))
+                        && (msgs || ckWARN_d(WARN_SURROGATE)))
                      {
                          pack_warn = packWARN(WARN_SURROGATE);
  
                      {
                          pack_warn = packWARN(WARN_SURROGATE);
  
@@ -1622,6 +2076,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                          else {
                              message = Perl_form(aTHX_ surrogate_cp_format, uv);
                          }
                          else {
                              message = Perl_form(aTHX_ surrogate_cp_format, uv);
                          }
+                        this_flag_bit = UTF8_GOT_SURROGATE;
                      }
                  }
  
                      }
                  }
  
@@ -1637,7 +2092,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                      *errors |= UTF8_GOT_SUPER;
  
                      if (   ! (flags & UTF8_CHECK_ONLY)
                      *errors |= UTF8_GOT_SUPER;
  
                      if (   ! (flags & UTF8_CHECK_ONLY)
-                        && ckWARN_d(WARN_NON_UNICODE))
+                        && (msgs || ckWARN_d(WARN_NON_UNICODE)))
                      {
                          pack_warn = packWARN(WARN_NON_UNICODE);
  
                      {
                          pack_warn = packWARN(WARN_NON_UNICODE);
  
@@ -1651,51 +2106,49 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                          else {
                              message = Perl_form(aTHX_ super_cp_format, uv);
                          }
                          else {
                              message = Perl_form(aTHX_ super_cp_format, uv);
                          }
+                        this_flag_bit = UTF8_GOT_SUPER;
                      }
                  }
  
                      }
                  }
  
-                /* The maximum code point ever specified by a standard was
-                 * 2**31 - 1.  Anything larger than that is a Perl extension
-                 * that very well may not be understood by other applications
-                 * (including earlier perl versions on EBCDIC platforms).  We
-                 * test for these after the regular SUPER ones, and before
-                 * possibly bailing out, so that the slightly more dire warning
-                 * will override the regular one. */
-                if (   (flags & (UTF8_WARN_ABOVE_31_BIT
-                                |UTF8_WARN_SUPER
-                                |UTF8_DISALLOW_ABOVE_31_BIT))
-                    && (   (   UNLIKELY(orig_problems & UTF8_GOT_TOO_SHORT)
-                            && UNLIKELY(is_utf8_cp_above_31_bits(
-                                                adjusted_s0,
-                                                adjusted_send)))
-                        || (   LIKELY(! (orig_problems & UTF8_GOT_TOO_SHORT))
-                            && UNLIKELY(UNICODE_IS_ABOVE_31_BIT(uv)))))
-                {
+                /* Test for Perl's extended UTF-8 after the regular SUPER ones,
+                 * and before possibly bailing out, so that the more dire
+                 * warning will override the regular one. */
+                if (UNLIKELY(isUTF8_PERL_EXTENDED(s0))) {
                      if (  ! (flags & UTF8_CHECK_ONLY)
                      if (  ! (flags & UTF8_CHECK_ONLY)
-                        &&  (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_WARN_SUPER))
-                        &&  ckWARN_d(WARN_UTF8))
+                        &&  (flags & (UTF8_WARN_PERL_EXTENDED|UTF8_WARN_SUPER))
+                        &&  (msgs || ckWARN_d(WARN_NON_UNICODE)))
                      {
                      {
-                        pack_warn = packWARN(WARN_UTF8);
+                        pack_warn = packWARN(WARN_NON_UNICODE);
  
  
-                        if (orig_problems & UTF8_GOT_TOO_SHORT) {
+                        /* If it is an overlong that evaluates to a code point
+                         * that doesn't have to use the Perl extended UTF-8, it
+                         * still used it, and so we output a message that
+                         * doesn't refer to the code point.  The same is true
+                         * if there was a SHORT malformation where the code
+                         * point is not valid.  In that case, 'uv' will have
+                         * been set to the REPLACEMENT CHAR, and the message
+                         * below without the code point in it will be selected
+                         * */
+                        if (UNICODE_IS_PERL_EXTENDED(uv)) {
                              message = Perl_form(aTHX_
                              message = Perl_form(aTHX_
-                                        "Any UTF-8 sequence that starts with"
-                                        " \"%s\" is for a non-Unicode code"
-                                        " point, and is not portable",
-                                        _byte_dump_string(s0, curlen, 0));
+                                            perl_extended_cp_format, uv);
                          }
                          else {
                              message = Perl_form(aTHX_
                          }
                          else {
                              message = Perl_form(aTHX_
-                                            above_31_bit_cp_format, uv);
+                                        "Any UTF-8 sequence that starts with"
+                                        " \"%s\" is a Perl extension, and"
+                                        " so is not portable",
+                                        _byte_dump_string(s0, curlen, 0));
                          }
                          }
+                        this_flag_bit = UTF8_GOT_PERL_EXTENDED;
                      }
  
                      }
  
-                    if (flags & ( UTF8_WARN_ABOVE_31_BIT
-                                 |UTF8_DISALLOW_ABOVE_31_BIT))
+                    if (flags & ( UTF8_WARN_PERL_EXTENDED
+                                 |UTF8_DISALLOW_PERL_EXTENDED))
                      {
                      {
-                        *errors |= UTF8_GOT_ABOVE_31_BIT;
+                        *errors |= UTF8_GOT_PERL_EXTENDED;
  
  
-                        if (flags & UTF8_DISALLOW_ABOVE_31_BIT) {
+                        if (flags & UTF8_DISALLOW_PERL_EXTENDED) {
                              disallowed = TRUE;
                          }
                      }
                              disallowed = TRUE;
                          }
                      }
@@ -1705,21 +2158,6 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                      *errors |= UTF8_GOT_SUPER;
                      disallowed = TRUE;
                  }
                      *errors |= UTF8_GOT_SUPER;
                      disallowed = TRUE;
                  }
-
-                /* The deprecated warning overrides any non-deprecated one.  If
-                 * there are other problems, a deprecation message is not
-                 * really helpful, so don't bother to raise it in that case.
-                 * This also keeps the code from having to handle the case
-                 * where 'uv' is not valid. */
-                if (   ! (orig_problems
-                                    & (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
-                    && UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
-                    && ckWARN_d(WARN_DEPRECATED))
-                {
-                    message = Perl_form(aTHX_ cp_above_legal_max,
-                                              uv, MAX_NON_DEPRECATED_CP);
-                    pack_warn = packWARN(WARN_DEPRECATED);
-                }
              }
              else if (possible_problems & UTF8_GOT_NONCHAR) {
                  possible_problems &= ~UTF8_GOT_NONCHAR;
              }
              else if (possible_problems & UTF8_GOT_NONCHAR) {
                  possible_problems &= ~UTF8_GOT_NONCHAR;
@@ -1728,7 +2166,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                      *errors |= UTF8_GOT_NONCHAR;
  
                      if (  ! (flags & UTF8_CHECK_ONLY)
                      *errors |= UTF8_GOT_NONCHAR;
  
                      if (  ! (flags & UTF8_CHECK_ONLY)
-                        && ckWARN_d(WARN_NONCHAR))
+                        && (msgs || ckWARN_d(WARN_NONCHAR)))
                      {
                          /* The code above should have guaranteed that we don't
                           * get here with errors other than overlong */
                      {
                          /* The code above should have guaranteed that we don't
                           * get here with errors other than overlong */
@@ -1737,6 +2175,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
  
                          pack_warn = packWARN(WARN_NONCHAR);
                          message = Perl_form(aTHX_ nonchar_cp_format, uv);
  
                          pack_warn = packWARN(WARN_NONCHAR);
                          message = Perl_form(aTHX_ nonchar_cp_format, uv);
+                        this_flag_bit = UTF8_GOT_NONCHAR;
                      }
                  }
  
                      }
                  }
  
@@ -1744,12 +2183,89 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                      disallowed = TRUE;
                      *errors |= UTF8_GOT_NONCHAR;
                  }
                      disallowed = TRUE;
                      *errors |= UTF8_GOT_NONCHAR;
                  }
+            }
+            else if (possible_problems & UTF8_GOT_LONG) {
+                possible_problems &= ~UTF8_GOT_LONG;
+                *errors |= UTF8_GOT_LONG;
+
+                if (flags & UTF8_ALLOW_LONG) {
+
+                    /* We don't allow the actual overlong value, unless the
+                     * special extra bit is also set */
+                    if (! (flags & (   UTF8_ALLOW_LONG_AND_ITS_VALUE
+                                    & ~UTF8_ALLOW_LONG)))
+                    {
+                        uv = UNICODE_REPLACEMENT;
+                    }
+                }
+                else {
+                    disallowed = TRUE;
+
+                    if ((   msgs
+                         || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
+                    {
+                        pack_warn = packWARN(WARN_UTF8);
+
+                        /* These error types cause 'uv' to be something that
+                         * isn't what was intended, so can't use it in the
+                         * message.  The other error types either can't
+                         * generate an overlong, or else the 'uv' is valid */
+                        if (orig_problems &
+                                        (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
+                        {
+                            message = Perl_form(aTHX_
+                                    "%s: %s (any UTF-8 sequence that starts"
+                                    " with \"%s\" is overlong which can and"
+                                    " should be represented with a"
+                                    " different, shorter sequence)",
+                                    malformed_text,
+                                    _byte_dump_string(s0, send - s0, 0),
+                                    _byte_dump_string(s0, curlen, 0));
+                        }
+                        else {
+                            U8 tmpbuf[UTF8_MAXBYTES+1];
+                            const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
+                                                                        uv, 0);
+                            /* Don't use U+ for non-Unicode code points, which
+                             * includes those in the Latin1 range */
+                            const char * preface = (    uv > PERL_UNICODE_MAX
+#ifdef EBCDIC
+                                                     || uv <= 0xFF
+#endif
+                                                    )
+                                                   ? "0x"
+                                                   : "U+";
+                            message = Perl_form(aTHX_
+                                "%s: %s (overlong; instead use %s to represent"
+                                " %s%0*" UVXf ")",
+                                malformed_text,
+                                _byte_dump_string(s0, send - s0, 0),
+                                _byte_dump_string(tmpbuf, e - tmpbuf, 0),
+                                preface,
+                                ((uv < 256) ? 2 : 4), /* Field width of 2 for
+                                                         small code points */
+                                UNI_TO_NATIVE(uv));
+                        }
+                        this_flag_bit = UTF8_GOT_LONG;
+                    }
+                }
              } /* End of looking through the possible flags */
  
              /* Display the message (if any) for the problem being handled in
               * this iteration of the loop */
              if (message) {
              } /* End of looking through the possible flags */
  
              /* Display the message (if any) for the problem being handled in
               * this iteration of the loop */
              if (message) {
-                if (PL_op)
+                if (msgs) {
+                    assert(this_flag_bit);
+
+                    if (*msgs == NULL) {
+                        *msgs = newAV();
+                    }
+
+                    av_push(*msgs, newRV_noinc((SV*) new_msg_hv(message,
+                                                                pack_warn,
+                                                                this_flag_bit)));
+                }
+                else if (PL_op)
                      Perl_warner(aTHX_ pack_warn, "%s in %s", message,
                                                   OP_DESC(PL_op));
                  else
                      Perl_warner(aTHX_ pack_warn, "%s in %s", message,
                                                   OP_DESC(PL_op));
                  else
@@ -1791,9 +2307,6 @@ the next possible position in C<s> that could begin a non-malformed character.
  See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
  returned.
  
  See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
  returned.
  
-Code points above the platform's C<IV_MAX> will raise a deprecation warning,
-unless those are turned off.
-
  =cut
  
  Also implemented as a macro in utf8.h
  =cut
  
  Also implemented as a macro in utf8.h
@@ -1818,7 +2331,9 @@ Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
  
  Only in very rare circumstances should code need to be dealing in Unicode
  (as opposed to native) code points.  In those few cases, use
  
  Only in very rare circumstances should code need to be dealing in Unicode
  (as opposed to native) code points.  In those few cases, use
-C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>> instead.
+C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>> instead.  If you
+are not absolutely sure this is one of those cases, then assume it isn't and
+use plain C<utf8_to_uvchr_buf> instead.
  
  Returns the Unicode (not-native) code point of the first character in the
  string C<s> which
  
  Returns the Unicode (not-native) code point of the first character in the
  string C<s> which
@@ -1833,9 +2348,6 @@ is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
  next possible position in C<s> that could begin a non-malformed character.
  See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
  
  next possible position in C<s> that could begin a non-malformed character.
  See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
  
-Code points above the platform's C<IV_MAX> will raise a deprecation warning,
-unless those are turned off.
-
  =cut
  */
  
  =cut
  */
  
@@ -1846,16 +2358,18 @@ Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
  
      assert(send > s);
  
  
      assert(send > s);
  
-    /* Call the low level routine, asking for checks */
      return NATIVE_TO_UNI(utf8_to_uvchr_buf(s, send, retlen));
  }
  
  /*
  =for apidoc utf8_length
  
      return NATIVE_TO_UNI(utf8_to_uvchr_buf(s, send, retlen));
  }
  
  /*
  =for apidoc utf8_length
  
-Return the length of the UTF-8 char encoded string C<s> in characters.
-Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
-up past C<e>, croaks.
+Returns the number of characters in the sequence of UTF-8-encoded bytes starting
+at C<s> and ending at the byte just before C<e>.  If <s> and <e> point to the
+same place, it returns 0 with no warning raised.
+
+If C<e E<lt> s> or if the scan would end up past C<e>, it raises a UTF8 warning
+and returns the number of valid characters.
  
  =cut
  */
  
  =cut
  */
@@ -2040,7 +2554,9 @@ C<*lenp> are unchanged, and the return value is the original C<s>.
  
  Otherwise, C<*is_utf8p> is set to 0, and the return value is a pointer to a
  newly created string containing a downgraded copy of C<s>, and whose length is
  
  Otherwise, C<*is_utf8p> is set to 0, and the return value is a pointer to a
  newly created string containing a downgraded copy of C<s>, and whose length is
-returned in C<*lenp>, updated.  The new string is C<NUL>-terminated.
+returned in C<*lenp>, updated.  The new string is C<NUL>-terminated.  The
+caller is responsible for arranging for the memory used by this string to get
+freed.
  
  Upon successful return, the number of variants in the string can be computed by
  having saved the value of C<*lenp> before the call, and subtracting the
  
  Upon successful return, the number of variants in the string can be computed by
  having saved the value of C<*lenp> before the call, and subtracting the
@@ -2155,8 +2671,8 @@ Perl_bytes_from_utf8_loc(const U8 *s, STRLEN *lenp, bool *is_utf8p, const U8** f
      }
  
    finish_and_return:
      }
  
    finish_and_return:
-        *d = '\0';
-        *lenp = d - converted_start;
+    *d = '\0';
+    *lenp = d - converted_start;
  
      /* Trim unused space */
      Renew(converted_start, *lenp + 1, U8);
  
      /* Trim unused space */
      Renew(converted_start, *lenp + 1, U8);
@@ -2170,7 +2686,8 @@ Perl_bytes_from_utf8_loc(const U8 *s, STRLEN *lenp, bool *is_utf8p, const U8** f
  Converts a string C<s> of length C<*lenp> bytes from the native encoding into
  UTF-8.
  Returns a pointer to the newly-created string, and sets C<*lenp> to
  Converts a string C<s> of length C<*lenp> bytes from the native encoding into
  UTF-8.
  Returns a pointer to the newly-created string, and sets C<*lenp> to
-reflect the new length in bytes.
+reflect the new length in bytes.  The caller is responsible for arranging for
+the memory used by this string to get freed.
  
  Upon successful return, the number of variants in the string can be computed by
  having saved the value of C<*lenp> before the call, and subtracting it from the
  
  Upon successful return, the number of variants in the string can be computed by
  having saved the value of C<*lenp> before the call, and subtracting it from the
@@ -2202,16 +2719,30 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
          append_utf8_from_native_byte(*s, &d);
          s++;
      }
          append_utf8_from_native_byte(*s, &d);
          s++;
      }
+
      *d = '\0';
      *lenp = d-dst;
      *d = '\0';
      *lenp = d-dst;
+
+    /* Trim unused space */
+    Renew(dst, *lenp + 1, U8);
+
      return dst;
  }
  
  /*
      return dst;
  }
  
  /*
- * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
+ * Convert native (big-endian) UTF-16 to UTF-8.  For reversed (little-endian),
+ * use utf16_to_utf8_reversed().
+ *
+ * UTF-16 requires 2 bytes for every code point below 0x10000; otherwise 4 bytes.
+ * UTF-8 requires 1-3 bytes for every code point below 0x1000; otherwise 4 bytes.
+ * UTF-EBCDIC requires 1-4 bytes for every code point below 0x1000; otherwise 4-5 bytes.
   *
   *
- * Destination must be pre-extended to 3/2 source.  Do not use in-place.
- * We optimize for native, for obvious reasons. */
+ * These functions don't check for overflow.  The worst case is every code
+ * point in the input is 2 bytes, and requires 4 bytes on output.  (If the code
+ * is never going to run in EBCDIC, it is 2 bytes requiring 3 on output.)  Therefore the
+ * destination must be pre-extended to 2 times the source length.
+ *
+ * Do not use in-place.  We optimize for native, for obvious reasons. */
  
  U8*
  Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
  
  U8*
  Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
@@ -2239,10 +2770,12 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
             *d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv));
             continue;
         }
             *d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv));
             continue;
         }
+
  #define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
  #define LAST_HIGH_SURROGATE  0xDBFF
  #define FIRST_LOW_SURROGATE  0xDC00
  #define LAST_LOW_SURROGATE   UNICODE_SURROGATE_LAST
  #define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
  #define LAST_HIGH_SURROGATE  0xDBFF
  #define FIRST_LOW_SURROGATE  0xDC00
  #define LAST_LOW_SURROGATE   UNICODE_SURROGATE_LAST
+#define FIRST_IN_PLANE1      0x10000
  
          /* This assumes that most uses will be in the first Unicode plane, not
           * needing surrogates */
  
          /* This assumes that most uses will be in the first Unicode plane, not
           * needing surrogates */
@@ -2261,13 +2794,13 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
                  }
                 p += 2;
                 uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
                  }
                 p += 2;
                 uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
-                                       + (low - FIRST_LOW_SURROGATE) + 0x10000;
+                                + (low - FIRST_LOW_SURROGATE) + FIRST_IN_PLANE1;
             }
         }
  #ifdef EBCDIC
          d = uvoffuni_to_utf8_flags(d, uv, 0);
  #else
             }
         }
  #ifdef EBCDIC
          d = uvoffuni_to_utf8_flags(d, uv, 0);
  #else
-       if (uv < 0x10000) {
+       if (uv < FIRST_IN_PLANE1) {
             *d++ = (U8)(( uv >> 12)         | 0xe0);
             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
             *d++ = (U8)(( uv        & 0x3f) | 0x80);
             *d++ = (U8)(( uv >> 12)         | 0xe0);
             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
             *d++ = (U8)(( uv        & 0x3f) | 0x80);
@@ -2312,9 +2845,7 @@ Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
  bool
  Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
  {
  bool
  Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
  {
-    U8 tmpbuf[UTF8_MAXBYTES+1];
-    uvchr_to_utf8(tmpbuf, c);
-    return _is_utf8_FOO_with_len(classnum, tmpbuf, tmpbuf + sizeof(tmpbuf));
+    return _invlist_contains_cp(PL_XPosix_ptrs[classnum], c);
  }
  
  /* Internal function so we can deprecate the external one, and call
  }
  
  /* Internal function so we can deprecate the external one, and call
@@ -2327,23 +2858,21 @@ Perl__is_utf8_idstart(pTHX_ const U8 *p)
  
      if (*p == '_')
         return TRUE;
  
      if (*p == '_')
         return TRUE;
-    return is_utf8_common(p, &PL_utf8_idstart, "IdStart", NULL);
+    return is_utf8_common(p, NULL,
+                          "This is buggy if this gets used",
+                          PL_utf8_idstart);
  }
  
  bool
  Perl__is_uni_perl_idcont(pTHX_ UV c)
  {
  }
  
  bool
  Perl__is_uni_perl_idcont(pTHX_ UV c)
  {
-    U8 tmpbuf[UTF8_MAXBYTES+1];
-    uvchr_to_utf8(tmpbuf, c);
-    return _is_utf8_perl_idcont_with_len(tmpbuf, tmpbuf + sizeof(tmpbuf));
+    return _invlist_contains_cp(PL_utf8_perl_idcont, c);
  }
  
  bool
  Perl__is_uni_perl_idstart(pTHX_ UV c)
  {
  }
  
  bool
  Perl__is_uni_perl_idstart(pTHX_ UV c)
  {
-    U8 tmpbuf[UTF8_MAXBYTES+1];
-    uvchr_to_utf8(tmpbuf, c);
-    return _is_utf8_perl_idstart_with_len(tmpbuf, tmpbuf + sizeof(tmpbuf));
+    return _invlist_contains_cp(PL_utf8_perl_idstart, c);
  }
  
  UV
  }
  
  UV
@@ -2404,27 +2933,72 @@ Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp,
      return converted;
  }
  
      return converted;
  }
  
+/* If compiled on an early Unicode version, there may not be auxiliary tables
+ * */
+#ifndef HAS_UC_AUX_TABLES
+#  define UC_AUX_TABLE_ptrs     NULL
+#  define UC_AUX_TABLE_lengths  NULL
+#endif
+#ifndef HAS_TC_AUX_TABLES
+#  define TC_AUX_TABLE_ptrs     NULL
+#  define TC_AUX_TABLE_lengths  NULL
+#endif
+#ifndef HAS_LC_AUX_TABLES
+#  define LC_AUX_TABLE_ptrs     NULL
+#  define LC_AUX_TABLE_lengths  NULL
+#endif
+#ifndef HAS_CF_AUX_TABLES
+#  define CF_AUX_TABLE_ptrs     NULL
+#  define CF_AUX_TABLE_lengths  NULL
+#endif
+#ifndef HAS_UC_AUX_TABLES
+#  define UC_AUX_TABLE_ptrs     NULL
+#  define UC_AUX_TABLE_lengths  NULL
+#endif
+
  /* Call the function to convert a UTF-8 encoded character to the specified case.
   * Note that there may be more than one character in the result.
  /* Call the function to convert a UTF-8 encoded character to the specified case.
   * Note that there may be more than one character in the result.
- * INP is a pointer to the first byte of the input character
- * OUTP will be set to the first byte of the string of changed characters.  It
+ * 's' is a pointer to the first byte of the input character
+ * 'd' will be set to the first byte of the string of changed characters.  It
   *     needs to have space for UTF8_MAXBYTES_CASE+1 bytes
   *     needs to have space for UTF8_MAXBYTES_CASE+1 bytes
- * LENP will be set to the length in bytes of the string of changed characters
+ * 'lenp' will be set to the length in bytes of the string of changed characters
   *
   * The functions return the ordinal of the first character in the string of
   *
   * The functions return the ordinal of the first character in the string of
- * OUTP */
+ * 'd' */
  #define CALL_UPPER_CASE(uv, s, d, lenp)                                     \
  #define CALL_UPPER_CASE(uv, s, d, lenp)                                     \
-                _to_utf8_case(uv, s, d, lenp, &PL_utf8_toupper, "ToUc", "")
+                _to_utf8_case(uv, s, d, lenp, PL_utf8_toupper,              \
+                                              Uppercase_Mapping_invmap,     \
+                                              UC_AUX_TABLE_ptrs,            \
+                                              UC_AUX_TABLE_lengths,         \
+                                              "uppercase")
  #define CALL_TITLE_CASE(uv, s, d, lenp)                                     \
  #define CALL_TITLE_CASE(uv, s, d, lenp)                                     \
-                _to_utf8_case(uv, s, d, lenp, &PL_utf8_totitle, "ToTc", "")
+                _to_utf8_case(uv, s, d, lenp, PL_utf8_totitle,              \
+                                              Titlecase_Mapping_invmap,     \
+                                              TC_AUX_TABLE_ptrs,            \
+                                              TC_AUX_TABLE_lengths,         \
+                                              "titlecase")
  #define CALL_LOWER_CASE(uv, s, d, lenp)                                     \
  #define CALL_LOWER_CASE(uv, s, d, lenp)                                     \
-                _to_utf8_case(uv, s, d, lenp, &PL_utf8_tolower, "ToLc", "")
+                _to_utf8_case(uv, s, d, lenp, PL_utf8_tolower,              \
+                                              Lowercase_Mapping_invmap,     \
+                                              LC_AUX_TABLE_ptrs,            \
+                                              LC_AUX_TABLE_lengths,         \
+                                              "lowercase")
+
  
  /* This additionally has the input parameter 'specials', which if non-zero will
   * cause this to use the specials hash for folding (meaning get full case
   * folding); otherwise, when zero, this implies a simple case fold */
  #define CALL_FOLD_CASE(uv, s, d, lenp, specials)                            \
  
  /* This additionally has the input parameter 'specials', which if non-zero will
   * cause this to use the specials hash for folding (meaning get full case
   * folding); otherwise, when zero, this implies a simple case fold */
  #define CALL_FOLD_CASE(uv, s, d, lenp, specials)                            \
-_to_utf8_case(uv, s, d, lenp, &PL_utf8_tofold, "ToCf", (specials) ? "" : NULL)
+        (specials)                                                          \
+        ?  _to_utf8_case(uv, s, d, lenp, PL_utf8_tofold,                    \
+                                          Case_Folding_invmap,              \
+                                          CF_AUX_TABLE_ptrs,                \
+                                          CF_AUX_TABLE_lengths,             \
+                                          "foldcase")                       \
+        : _to_utf8_case(uv, s, d, lenp, PL_utf8_tosimplefold,               \
+                                         Simple_Case_Folding_invmap,        \
+                                         NULL, NULL,                        \
+                                         "foldcase")
  
  UV
  Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
  
  UV
  Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
@@ -2501,8 +3075,7 @@ Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
  }
  
  UV
  }
  
  UV
-Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp,
-                           const unsigned int flags)
+Perl__to_fold_latin1(const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
  {
      /* Corresponds to to_lower_latin1(); <flags> bits meanings:
       *     FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
  {
      /* Corresponds to to_lower_latin1(); <flags> bits meanings:
       *     FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
@@ -2514,7 +3087,6 @@ Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp,
      UV converted;
  
      PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
      UV converted;
  
      PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
-    PERL_UNUSED_CONTEXT;
  
      assert (! (flags & FOLD_FLAGS_LOCALE));
  
  
      assert (! (flags & FOLD_FLAGS_LOCALE));
  
@@ -2577,12 +3149,13 @@ Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
      PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
  
      if (flags & FOLD_FLAGS_LOCALE) {
      PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
  
      if (flags & FOLD_FLAGS_LOCALE) {
-        /* Treat a UTF-8 locale as not being in locale at all */
+        /* Treat a UTF-8 locale as not being in locale at all, except for
+         * potentially warning */
+        _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
          if (IN_UTF8_CTYPE_LOCALE) {
              flags &= ~FOLD_FLAGS_LOCALE;
          }
          else {
          if (IN_UTF8_CTYPE_LOCALE) {
              flags &= ~FOLD_FLAGS_LOCALE;
          }
          else {
-            _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
              goto needs_full_generality;
          }
      }
              goto needs_full_generality;
          }
      }
@@ -2639,6 +3212,12 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
          NOT_REACHED; /* NOTREACHED */
      }
  
          NOT_REACHED; /* NOTREACHED */
      }
  
+    if (invlist) {
+        return _invlist_contains_cp(invlist, valid_utf8_to_uvchr(p, NULL));
+    }
+
+    assert(swash);
+
      if (!*swash) {
          U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
          *swash = _core_swash_init("utf8",
      if (!*swash) {
          U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
          *swash = _core_swash_init("utf8",
@@ -2674,6 +3253,12 @@ S_is_utf8_common_with_len(pTHX_ const U8 *const p, const U8 * const e,
          NOT_REACHED; /* NOTREACHED */
      }
  
          NOT_REACHED; /* NOTREACHED */
      }
  
+    if (invlist) {
+        return _invlist_contains_cp(invlist, valid_utf8_to_uvchr(p, NULL));
+    }
+
+    assert(swash);
+
      if (!*swash) {
          U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
          *swash = _core_swash_init("utf8",
      if (!*swash) {
          U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
          *swash = _core_swash_init("utf8",
@@ -2759,8 +3344,8 @@ Perl__is_utf8_FOO(pTHX_       U8   classnum,
              case _CC_CASED:
  
                  return is_utf8_common(p,
              case _CC_CASED:
  
                  return is_utf8_common(p,
-                                      &PL_utf8_swash_ptrs[classnum],
-                                      swash_property_names[classnum],
+                                      NULL,
+                                      "This is buggy if this gets used",
                                        PL_XPosix_ptrs[classnum]);
  
              case _CC_SPACE:
                                        PL_XPosix_ptrs[classnum]);
  
              case _CC_SPACE:
@@ -2776,19 +3361,13 @@ Perl__is_utf8_FOO(pTHX_       U8   classnum,
              case _CC_VERTSPACE:
                  return is_VERTWS_high(p);
              case _CC_IDFIRST:
              case _CC_VERTSPACE:
                  return is_VERTWS_high(p);
              case _CC_IDFIRST:
-                if (! PL_utf8_perl_idstart) {
-                    PL_utf8_perl_idstart
-                                = _new_invlist_C_array(_Perl_IDStart_invlist);
-                }
-                return is_utf8_common(p, &PL_utf8_perl_idstart,
-                                      "_Perl_IDStart", NULL);
+                return is_utf8_common(p, NULL,
+                                      "This is buggy if this gets used",
+                                      PL_utf8_perl_idstart);
              case _CC_IDCONT:
              case _CC_IDCONT:
-                if (! PL_utf8_perl_idcont) {
-                    PL_utf8_perl_idcont
-                                = _new_invlist_C_array(_Perl_IDCont_invlist);
-                }
-                return is_utf8_common(p, &PL_utf8_perl_idcont,
-                                      "_Perl_IDCont", NULL);
+                return is_utf8_common(p, NULL,
+                                      "This is buggy if this gets used",
+                                      PL_utf8_perl_idcont);
          }
      }
  
          }
      }
  
@@ -2827,27 +3406,19 @@ Perl__is_utf8_FOO_with_len(pTHX_ const U8 classnum, const U8 *p,
  {
      PERL_ARGS_ASSERT__IS_UTF8_FOO_WITH_LEN;
  
  {
      PERL_ARGS_ASSERT__IS_UTF8_FOO_WITH_LEN;
  
-    assert(classnum < _FIRST_NON_SWASH_CC);
-
-    return is_utf8_common_with_len(p,
-                                   e,
-                                   &PL_utf8_swash_ptrs[classnum],
-                                   swash_property_names[classnum],
+    return is_utf8_common_with_len(p, e, NULL,
+                                   "This is buggy if this gets used",
                                     PL_XPosix_ptrs[classnum]);
  }
  
  bool
  Perl__is_utf8_perl_idstart_with_len(pTHX_ const U8 *p, const U8 * const e)
  {
                                     PL_XPosix_ptrs[classnum]);
  }
  
  bool
  Perl__is_utf8_perl_idstart_with_len(pTHX_ const U8 *p, const U8 * const e)
  {
-    SV* invlist = NULL;
-
      PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART_WITH_LEN;
  
      PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART_WITH_LEN;
  
-    if (! PL_utf8_perl_idstart) {
-        invlist = _new_invlist_C_array(_Perl_IDStart_invlist);
-    }
-    return is_utf8_common_with_len(p, e, &PL_utf8_perl_idstart,
-                                      "_Perl_IDStart", invlist);
+    return is_utf8_common_with_len(p, e, NULL,
+                                   "This is buggy if this gets used",
+                                   PL_utf8_perl_idstart);
  }
  
  bool
  }
  
  bool
@@ -2863,15 +3434,11 @@ Perl__is_utf8_xidstart(pTHX_ const U8 *p)
  bool
  Perl__is_utf8_perl_idcont_with_len(pTHX_ const U8 *p, const U8 * const e)
  {
  bool
  Perl__is_utf8_perl_idcont_with_len(pTHX_ const U8 *p, const U8 * const e)
  {
-    SV* invlist = NULL;
-
      PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT_WITH_LEN;
  
      PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT_WITH_LEN;
  
-    if (! PL_utf8_perl_idcont) {
-        invlist = _new_invlist_C_array(_Perl_IDCont_invlist);
-    }
-    return is_utf8_common_with_len(p, e, &PL_utf8_perl_idcont,
-                                   "_Perl_IDCont", invlist);
+    return is_utf8_common_with_len(p, e, NULL,
+                                   "This is buggy if this gets used",
+                                   PL_utf8_perl_idcont);
  }
  
  bool
  }
  
  bool
@@ -2887,7 +3454,7 @@ Perl__is_utf8_xidcont(pTHX_ const U8 *p)
  {
      PERL_ARGS_ASSERT__IS_UTF8_XIDCONT;
  
  {
      PERL_ARGS_ASSERT__IS_UTF8_XIDCONT;
  
-    return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue", NULL);
+    return is_utf8_common(p, &PL_utf8_xidcont, "XIdContinue", NULL);
  }
  
  bool
  }
  
  bool
@@ -2898,13 +3465,26 @@ Perl__is_utf8_mark(pTHX_ const U8 *p)
      return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL);
  }
  
      return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL);
  }
  
-    /* change namve uv1 to 'from' */
  STATIC UV
  STATIC UV
-S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp,
-               SV **swashp, const char *normal, const char *special)
+S__to_utf8_case(pTHX_ const UV uv1, const U8 *p,
+                      U8* ustrp, STRLEN *lenp,
+                      SV *invlist, const int * const invmap,
+                      const unsigned int * const * const aux_tables,
+                      const U8 * const aux_table_lengths,
+                      const char * const normal)
  {
      STRLEN len = 0;
  
  {
      STRLEN len = 0;
  
+    /* Change the case of code point 'uv1' whose UTF-8 representation (assumed
+     * by this routine to be valid) begins at 'p'.  'normal' is a string to use
+     * to name the new case in any generated messages, as a fallback if the
+     * operation being used is not available.  The new case is given by the
+     * data structures in the remaining arguments.
+     *
+     * On return 'ustrp' points to '*lenp' UTF-8 encoded bytes representing the
+     * entire changed case string, and the return value is the first code point
+     * in that string */
+
      PERL_ARGS_ASSERT__TO_UTF8_CASE;
  
      /* For code points that don't change case, we already know that the output
      PERL_ARGS_ASSERT__TO_UTF8_CASE;
  
      /* For code points that don't change case, we already know that the output
@@ -2969,15 +3549,12 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp,
                   * some others */
                  if (uv1 < 0xFB00) {
                      goto cases_to_self;
                   * some others */
                  if (uv1 < 0xFB00) {
                      goto cases_to_self;
-
                  }
  
                  if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
                  }
  
                  if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
-                    if (   UNLIKELY(uv1 > MAX_NON_DEPRECATED_CP)
-                        && ckWARN_d(WARN_DEPRECATED))
-                    {
-                        Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
-                                cp_above_legal_max, uv1, MAX_NON_DEPRECATED_CP);
+                    if (UNLIKELY(uv1 > MAX_EXTERNALLY_LEGAL_CP)) {
+                        Perl_croak(aTHX_ cp_above_legal_max, uv1,
+                                         MAX_EXTERNALLY_LEGAL_CP);
                      }
                      if (ckWARN_d(WARN_NON_UNICODE)) {
                          const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
                      }
                      if (ckWARN_d(WARN_NON_UNICODE)) {
                          const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
@@ -3001,62 +3578,46 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp,
          }
  
         /* Note that non-characters are perfectly legal, so no warning should
          }
  
         /* Note that non-characters are perfectly legal, so no warning should
-         * be given.  There are so few of them, that it isn't worth the extra
-         * tests to avoid swash creation */
+         * be given. */
      }
  
      }
  
-    if (!*swashp) /* load on-demand */
-         *swashp = _core_swash_init("utf8", normal, &PL_sv_undef,
-                                    4, 0, NULL, NULL);
+    {
+        unsigned int i;
+        const unsigned int * cp_list;
+        U8 * d;
+        SSize_t index = _invlist_search(invlist, uv1);
+        IV base = invmap[index];
  
  
-    if (special) {
-         /* It might be "special" (sometimes, but not always,
-         * a multicharacter mapping) */
-         HV *hv = NULL;
-        SV **svp;
+        /* The data structures are set up so that if 'base' is non-negative,
+         * the case change is 1-to-1; and if 0, the change is to itself */
+        if (base >= 0) {
+            IV lc;
  
  
-        /* If passed in the specials name, use that; otherwise use any
-         * given in the swash */
-         if (*special != '\0') {
-            hv = get_hv(special, 0);
-        }
-        else {
-            svp = hv_fetchs(MUTABLE_HV(SvRV(*swashp)), "SPECIALS", 0);
-            if (svp) {
-                hv = MUTABLE_HV(SvRV(*svp));
+            if (base == 0) {
+                goto cases_to_self;
              }
              }
-        }
-
-        if (hv
-             && (svp = hv_fetch(hv, (const char*)p, UVCHR_SKIP(uv1), FALSE))
-             && (*svp))
-         {
-            const char *s;
-
-             s = SvPV_const(*svp, len);
-             if (len == 1)
-                  /* EIGHTBIT */
-                  len = uvchr_to_utf8(ustrp, *(U8*)s) - ustrp;
-             else {
-                  Copy(s, ustrp, len, U8);
-             }
-        }
-    }
  
  
-    if (!len && *swashp) {
-       const UV uv2 = swash_fetch(*swashp, p, TRUE /* => is UTF-8 */);
+            /* This computes, e.g. lc(H) as 'H - A + a', using the lc table */
+            lc = base + uv1 - invlist_array(invlist)[index];
+            *lenp = uvchr_to_utf8(ustrp, lc) - ustrp;
+            return lc;
+        }
  
  
-        if (uv2) {
-             /* It was "normal" (a single character mapping). */
-             len = uvchr_to_utf8(ustrp, uv2) - ustrp;
-        }
-    }
+        /* Here 'base' is negative.  That means the mapping is 1-to-many, and
+         * requires an auxiliary table look up.  abs(base) gives the index into
+         * a list of such tables which points to the proper aux table.  And a
+         * parallel list gives the length of each corresponding aux table. */
+        cp_list = aux_tables[-base];
  
  
-    if (len) {
-        if (lenp) {
-            *lenp = len;
+        /* Create the string of UTF-8 from the mapped-to code points */
+        d = ustrp;
+        for (i = 0; i < aux_table_lengths[-base]; i++) {
+            d = uvchr_to_utf8(d, cp_list[i]);
          }
          }
-        return valid_utf8_to_uvchr(ustrp, 0);
+        *d = '\0';
+        *lenp = d - ustrp;
+
+        return cp_list[0];
      }
  
      /* Here, there was no mapping defined, which means that the code point maps
      }
  
      /* Here, there was no mapping defined, which means that the code point maps
@@ -3074,6 +3635,67 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp,
  
  }
  
  
  }
  
+Size_t
+Perl__inverse_folds(pTHX_ const UV cp, unsigned int * first_folds_to,
+                          const unsigned int ** remaining_folds_to)
+{
+    /* Returns the count of the number of code points that fold to the input
+     * 'cp' (besides itself).
+     *
+     * If the return is 0, there is nothing else that folds to it, and
+     * '*first_folds_to' is set to 0, and '*remaining_folds_to' is set to NULL.
+     *
+     * If the return is 1, '*first_folds_to' is set to the single code point,
+     * and '*remaining_folds_to' is set to NULL.
+     *
+     * Otherwise, '*first_folds_to' is set to a code point, and
+     * '*remaining_fold_to' is set to an array that contains the others.  The
+     * length of this array is the returned count minus 1.
+     *
+     * The reason for this convolution is to avoid having to deal with
+     * allocating and freeing memory.  The lists are already constructed, so
+     * the return can point to them, but single code points aren't, so would
+     * need to be constructed if we didn't employ something like this API */
+
+    SSize_t index = _invlist_search(PL_utf8_foldclosures, cp);
+    int base = _Perl_IVCF_invmap[index];
+
+    PERL_ARGS_ASSERT__INVERSE_FOLDS;
+
+    if (base == 0) {            /* No fold */
+        *first_folds_to = 0;
+        *remaining_folds_to = NULL;
+        return 0;
+    }
+
+#ifndef HAS_IVCF_AUX_TABLES     /* This Unicode version only has 1-1 folds */
+
+    assert(base > 0);
+
+#else
+
+    if (UNLIKELY(base < 0)) {   /* Folds to more than one character */
+
+        /* The data structure is set up so that the absolute value of 'base' is
+         * an index into a table of pointers to arrays, with the array
+         * corresponding to the index being the list of code points that fold
+         * to 'cp', and the parallel array containing the length of the list
+         * array */
+        *first_folds_to = IVCF_AUX_TABLE_ptrs[-base][0];
+        *remaining_folds_to = IVCF_AUX_TABLE_ptrs[-base] + 1; /* +1 excludes
+                                                                 *first_folds_to
+                                                                */
+        return IVCF_AUX_TABLE_lengths[-base];
+    }
+
+#endif
+
+    /* Only the single code point.  This works like 'fc(G) = G - A + a' */
+    *first_folds_to = base + cp - invlist_array(PL_utf8_foldclosures)[index];
+    *remaining_folds_to = NULL;
+    return 1;
+}
+
  STATIC UV
  S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result,
                                         U8* const ustrp, STRLEN *lenp)
  STATIC UV
  S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result,
                                         U8* const ustrp, STRLEN *lenp)
@@ -3098,7 +3720,7 @@ S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result,
      assert(UTF8_IS_ABOVE_LATIN1(*p));
  
      /* We know immediately if the first character in the string crosses the
      assert(UTF8_IS_ABOVE_LATIN1(*p));
  
      /* We know immediately if the first character in the string crosses the
-     * boundary, so can skip */
+     * boundary, so can skip testing */
      if (result > 255) {
  
         /* Look at every character in the result; if any cross the
      if (result > 255) {
  
         /* Look at every character in the result; if any cross the
@@ -3243,13 +3865,11 @@ S_check_and_deprecate(pTHX_ const U8 *p,
                                 L1_func_extra_param)                          \
                                                                               \
      if (flags & (locale_flags)) {                                            \
                                 L1_func_extra_param)                          \
                                                                               \
      if (flags & (locale_flags)) {                                            \
+        _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                  \
          /* Treat a UTF-8 locale as not being in locale at all */             \
          if (IN_UTF8_CTYPE_LOCALE) {                                          \
              flags &= ~(locale_flags);                                        \
          }                                                                    \
          /* Treat a UTF-8 locale as not being in locale at all */             \
          if (IN_UTF8_CTYPE_LOCALE) {                                          \
              flags &= ~(locale_flags);                                        \
          }                                                                    \
-        else {                                                               \
-            _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                              \
-        }                                                                    \
      }                                                                        \
                                                                               \
      if (UTF8_IS_INVARIANT(*p)) {                                             \
      }                                                                        \
                                                                               \
      if (UTF8_IS_INVARIANT(*p)) {                                             \
@@ -3441,17 +4061,12 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
         if (flags & FOLD_FLAGS_LOCALE) {
  
  #           define LONG_S_T      LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
         if (flags & FOLD_FLAGS_LOCALE) {
  
  #           define LONG_S_T      LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
-            const unsigned int long_s_t_len    = sizeof(LONG_S_T) - 1;
-
  #         ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
  #           define CAP_SHARP_S   LATIN_CAPITAL_LETTER_SHARP_S_UTF8
  
  #         ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
  #           define CAP_SHARP_S   LATIN_CAPITAL_LETTER_SHARP_S_UTF8
  
-            const unsigned int cap_sharp_s_len = sizeof(CAP_SHARP_S) - 1;
-
              /* Special case these two characters, as what normally gets
               * returned under locale doesn't work */
              /* Special case these two characters, as what normally gets
               * returned under locale doesn't work */
-            if (UTF8SKIP(p) == cap_sharp_s_len
-                && memEQ((char *) p, CAP_SHARP_S, cap_sharp_s_len))
+            if (memEQs((char *) p, UTF8SKIP(p), CAP_SHARP_S))
              {
                  /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
                  Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
              {
                  /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
                  Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
@@ -3461,8 +4076,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
              }
              else
  #endif
              }
              else
  #endif
-                 if (UTF8SKIP(p) == long_s_t_len
-                     && memEQ((char *) p, LONG_S_T, long_s_t_len))
+                 if (memEQs((char *) p, UTF8SKIP(p), LONG_S_T))
              {
                  /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
                  Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
              {
                  /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
                  Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
@@ -3481,9 +4095,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
               * 255/256 boundary which is forbidden under /l, and so the code
               * wouldn't catch that they are equivalent (which they are only in
               * this release) */
               * 255/256 boundary which is forbidden under /l, and so the code
               * wouldn't catch that they are equivalent (which they are only in
               * this release) */
-            else if (UTF8SKIP(p) == sizeof(DOTTED_I) - 1
-                     && memEQ((char *) p, DOTTED_I, sizeof(DOTTED_I) - 1))
-            {
+            else if (memEQs((char *) p, UTF8SKIP(p), DOTTED_I)) {
                  /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
                  Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
                                "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
                  /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
                  Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
                                "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
@@ -3671,11 +4283,7 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv,
  
      SV* retval = &PL_sv_undef;
      HV* swash_hv = NULL;
  
      SV* retval = &PL_sv_undef;
      HV* swash_hv = NULL;
-    const int invlist_swash_boundary =
-        (flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST)
-        ? 512    /* Based on some benchmarking, but not extensive, see commit
-                    message */
-        : -1;   /* Never return just an inversion list */
+    const bool use_invlist= (flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST);
  
      assert(listsv != &PL_sv_undef || strNE(name, "") || invlist);
      assert(! invlist || minbits == 1);
  
      assert(listsv != &PL_sv_undef || strNE(name, "") || invlist);
      assert(! invlist || minbits == 1);
@@ -3842,7 +4450,7 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv,
  
                  /* Here, there is no swash already.  Set up a minimal one, if
                   * we are going to return a swash */
  
                  /* Here, there is no swash already.  Set up a minimal one, if
                   * we are going to return a swash */
-                if ((int) _invlist_len(invlist) > invlist_swash_boundary) {
+                if (! use_invlist) {
                      swash_hv = newHV();
                      retval = newRV_noinc(MUTABLE_SV(swash_hv));
                  }
                      swash_hv = newHV();
                      retval = newRV_noinc(MUTABLE_SV(swash_hv));
                  }
@@ -3853,9 +4461,7 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv,
          /* Here, we have computed the union of all the passed-in data.  It may
           * be that there was an inversion list in the swash which didn't get
           * touched; otherwise save the computed one */
          /* Here, we have computed the union of all the passed-in data.  It may
           * be that there was an inversion list in the swash which didn't get
           * touched; otherwise save the computed one */
-       if (! invlist_in_swash_is_valid
-            && (int) _invlist_len(swash_invlist) > invlist_swash_boundary)
-        {
+       if (! invlist_in_swash_is_valid && ! use_invlist) {
             if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist))
              {
                 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
             if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist))
              {
                 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
@@ -3868,8 +4474,7 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv,
          /* The result is immutable.  Forbid attempts to change it. */
          SvREADONLY_on(swash_invlist);
  
          /* The result is immutable.  Forbid attempts to change it. */
          SvREADONLY_on(swash_invlist);
  
-        /* Use the inversion list stand-alone if small enough */
-        if ((int) _invlist_len(swash_invlist) <= invlist_swash_boundary) {
+        if (use_invlist) {
             SvREFCNT_dec(retval);
             if (!swash_invlist_unclaimed)
                 SvREFCNT_inc_simple_void_NN(swash_invlist);
             SvREFCNT_dec(retval);
             if (!swash_invlist_unclaimed)
                 SvREFCNT_inc_simple_void_NN(swash_invlist);
@@ -4505,320 +5110,6 @@ S_swatch_get(pTHX_ SV* swash, UV start, UV span)
      return swatch;
  }
  
      return swatch;
  }
  
-HV*
-Perl__swash_inversion_hash(pTHX_ SV* const swash)
-{
-
-   /* Subject to change or removal.  For use only in regcomp.c and regexec.c
-    * Can't be used on a property that is subject to user override, as it
-    * relies on the value of SPECIALS in the swash which would be set by
-    * utf8_heavy.pl to the hash in the non-overriden file, and hence is not set
-    * for overridden properties
-    *
-    * Returns a hash which is the inversion and closure of a swash mapping.
-    * For example, consider the input lines:
-    * 004B             006B
-    * 004C             006C
-    * 212A             006B
-    *
-    * The returned hash would have two keys, the UTF-8 for 006B and the UTF-8 for
-    * 006C.  The value for each key is an array.  For 006C, the array would
-    * have two elements, the UTF-8 for itself, and for 004C.  For 006B, there
-    * would be three elements in its array, the UTF-8 for 006B, 004B and 212A.
-    *
-    * Note that there are no elements in the hash for 004B, 004C, 212A.  The
-    * keys are only code points that are folded-to, so it isn't a full closure.
-    *
-    * Essentially, for any code point, it gives all the code points that map to
-    * it, or the list of 'froms' for that point.
-    *
-    * Currently it ignores any additions or deletions from other swashes,
-    * looking at just the main body of the swash, and if there are SPECIALS
-    * in the swash, at that hash
-    *
-    * The specials hash can be extra code points, and most likely consists of
-    * maps from single code points to multiple ones (each expressed as a string
-    * of UTF-8 characters).   This function currently returns only 1-1 mappings.
-    * However consider this possible input in the specials hash:
-    * "\xEF\xAC\x85" => "\x{0073}\x{0074}",         # U+FB05 => 0073 0074
-    * "\xEF\xAC\x86" => "\x{0073}\x{0074}",         # U+FB06 => 0073 0074
-    *
-    * Both FB05 and FB06 map to the same multi-char sequence, which we don't
-    * currently handle.  But it also means that FB05 and FB06 are equivalent in
-    * a 1-1 mapping which we should handle, and this relationship may not be in
-    * the main table.  Therefore this function examines all the multi-char
-    * sequences and adds the 1-1 mappings that come out of that.
-    *
-    * XXX This function was originally intended to be multipurpose, but its
-    * only use is quite likely to remain for constructing the inversion of
-    * the CaseFolding (//i) property.  If it were more general purpose for
-    * regex patterns, it would have to do the FB05/FB06 game for simple folds,
-    * because certain folds are prohibited under /iaa and /il.  As an example,
-    * in Unicode 3.0.1 both U+0130 and U+0131 fold to 'i', and hence are both
-    * equivalent under /i.  But under /iaa and /il, the folds to 'i' are
-    * prohibited, so we would not figure out that they fold to each other.
-    * Code could be written to automatically figure this out, similar to the
-    * code that does this for multi-character folds, but this is the only case
-    * where something like this is ever likely to happen, as all the single
-    * char folds to the 0-255 range are now quite settled.  Instead there is a
-    * little special code that is compiled only for this Unicode version.  This
-    * is smaller and didn't require much coding time to do.  But this makes
-    * this routine strongly tied to being used just for CaseFolding.  If ever
-    * it should be generalized, this would have to be fixed */
-
-    U8 *l, *lend;
-    STRLEN lcur;
-    HV *const hv = MUTABLE_HV(SvRV(swash));
-
-    /* The string containing the main body of the table.  This will have its
-     * assertion fail if the swash has been converted to its inversion list */
-    SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
-
-    SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
-    SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
-    SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
-    /*SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);*/
-    const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
-    const STRLEN bits  = SvUV(*bitssvp);
-    const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
-    const UV     none  = SvUV(*nonesvp);
-    SV **specials_p = hv_fetchs(hv, "SPECIALS", 0);
-
-    HV* ret = newHV();
-
-    PERL_ARGS_ASSERT__SWASH_INVERSION_HASH;
-
-    /* Must have at least 8 bits to get the mappings */
-    if (bits != 8 && bits != 16 && bits != 32) {
-       Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %"
-                         UVuf, (UV)bits);
-    }
-
-    if (specials_p) { /* It might be "special" (sometimes, but not always, a
-                       mapping to more than one character */
-
-       /* Construct an inverse mapping hash for the specials */
-       HV * const specials_hv = MUTABLE_HV(SvRV(*specials_p));
-       HV * specials_inverse = newHV();
-       char *char_from; /* the lhs of the map */
-       I32 from_len;   /* its byte length */
-       char *char_to;  /* the rhs of the map */
-       I32 to_len;     /* its byte length */
-       SV *sv_to;      /* and in a sv */
-       AV* from_list;  /* list of things that map to each 'to' */
-
-       hv_iterinit(specials_hv);
-
-       /* The keys are the characters (in UTF-8) that map to the corresponding
-        * UTF-8 string value.  Iterate through the list creating the inverse
-        * list. */
-       while ((sv_to = hv_iternextsv(specials_hv, &char_from, &from_len))) {
-           SV** listp;
-           if (! SvPOK(sv_to)) {
-               Perl_croak(aTHX_ "panic: value returned from hv_iternextsv() "
-                          "unexpectedly is not a string, flags=%lu",
-                          (unsigned long)SvFLAGS(sv_to));
-           }
-           /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %" UVXf ", First char of to is %" UVXf "\n", valid_utf8_to_uvchr((U8*) char_from, 0), valid_utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/
-
-           /* Each key in the inverse list is a mapped-to value, and the key's
-            * hash value is a list of the strings (each in UTF-8) that map to
-            * it.  Those strings are all one character long */
-           if ((listp = hv_fetch(specials_inverse,
-                                   SvPVX(sv_to),
-                                   SvCUR(sv_to), 0)))
-           {
-               from_list = (AV*) *listp;
-           }
-           else { /* No entry yet for it: create one */
-               from_list = newAV();
-               if (! hv_store(specials_inverse,
-                               SvPVX(sv_to),
-                               SvCUR(sv_to),
-                               (SV*) from_list, 0))
-               {
-                   Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
-               }
-           }
-
-           /* Here have the list associated with this 'to' (perhaps newly
-            * created and empty).  Just add to it.  Note that we ASSUME that
-            * the input is guaranteed to not have duplications, so we don't
-            * check for that.  Duplications just slow down execution time. */
-           av_push(from_list, newSVpvn_utf8(char_from, from_len, TRUE));
-       }
-
-       /* Here, 'specials_inverse' contains the inverse mapping.  Go through
-        * it looking for cases like the FB05/FB06 examples above.  There would
-        * be an entry in the hash like
-       *       'st' => [ FB05, FB06 ]
-       * In this example we will create two lists that get stored in the
-       * returned hash, 'ret':
-       *       FB05 => [ FB05, FB06 ]
-       *       FB06 => [ FB05, FB06 ]
-       *
-       * Note that there is nothing to do if the array only has one element.
-       * (In the normal 1-1 case handled below, we don't have to worry about
-       * two lists, as everything gets tied to the single list that is
-       * generated for the single character 'to'.  But here, we are omitting
-       * that list, ('st' in the example), so must have multiple lists.) */
-       while ((from_list = (AV *) hv_iternextsv(specials_inverse,
-                                                &char_to, &to_len)))
-       {
-           if (av_tindex_skip_len_mg(from_list) > 0) {
-               SSize_t i;
-
-               /* We iterate over all combinations of i,j to place each code
-                * point on each list */
-               for (i = 0; i <= av_tindex_skip_len_mg(from_list); i++) {
-                   SSize_t j;
-                   AV* i_list = newAV();
-                   SV** entryp = av_fetch(from_list, i, FALSE);
-                   if (entryp == NULL) {
-                       Perl_croak(aTHX_ "panic: av_fetch() unexpectedly"
-                                         " failed");
-                   }
-                   if (hv_fetch(ret, SvPVX(*entryp), SvCUR(*entryp), FALSE)) {
-                       Perl_croak(aTHX_ "panic: unexpected entry for %s",
-                                                                SvPVX(*entryp));
-                   }
-                   if (! hv_store(ret, SvPVX(*entryp), SvCUR(*entryp),
-                                  (SV*) i_list, FALSE))
-                   {
-                       Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
-                   }
-
-                   /* For DEBUG_U: UV u = valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/
-                   for (j = 0; j <= av_tindex_skip_len_mg(from_list); j++) {
-                       entryp = av_fetch(from_list, j, FALSE);
-                       if (entryp == NULL) {
-                           Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
-                       }
-
-                       /* When i==j this adds itself to the list */
-                       av_push(i_list, newSVuv(utf8_to_uvchr_buf(
-                                       (U8*) SvPVX(*entryp),
-                                       (U8*) SvPVX(*entryp) + SvCUR(*entryp),
-                                       0)));
-                       /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %" UVXf " to list for %" UVXf "\n", __FILE__, __LINE__, valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0), u));*/
-                   }
-               }
-           }
-       }
-       SvREFCNT_dec(specials_inverse); /* done with it */
-    } /* End of specials */
-
-    /* read $swash->{LIST} */
-
-#if    UNICODE_MAJOR_VERSION   == 3         \
-    && UNICODE_DOT_VERSION     == 0         \
-    && UNICODE_DOT_DOT_VERSION == 1
-
-    /* For this version only U+130 and U+131 are equivalent under qr//i.  Add a
-     * rule so that things work under /iaa and /il */
-
-    SV * mod_listsv = sv_mortalcopy(*listsvp);
-    sv_catpv(mod_listsv, "130\t130\t131\n");
-    l = (U8*)SvPV(mod_listsv, lcur);
-
-#else
-
-    l = (U8*)SvPV(*listsvp, lcur);
-
-#endif
-
-    lend = l + lcur;
-
-    /* Go through each input line */
-    while (l < lend) {
-       UV min, max, val;
-       UV inverse;
-       l = swash_scan_list_line(l, lend, &min, &max, &val,
-                                                     cBOOL(octets), typestr);
-       if (l > lend) {
-           break;
-       }
-
-       /* Each element in the range is to be inverted */
-       for (inverse = min; inverse <= max; inverse++) {
-           AV* list;
-           SV** listp;
-           IV i;
-           bool found_key = FALSE;
-           bool found_inverse = FALSE;
-
-           /* The key is the inverse mapping */
-           char key[UTF8_MAXBYTES+1];
-           char* key_end = (char *) uvchr_to_utf8((U8*) key, val);
-           STRLEN key_len = key_end - key;
-
-           /* Get the list for the map */
-           if ((listp = hv_fetch(ret, key, key_len, FALSE))) {
-               list = (AV*) *listp;
-           }
-           else { /* No entry yet for it: create one */
-               list = newAV();
-               if (! hv_store(ret, key, key_len, (SV*) list, FALSE)) {
-                   Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
-               }
-           }
-
-           /* Look through list to see if this inverse mapping already is
-            * listed, or if there is a mapping to itself already */
-           for (i = 0; i <= av_tindex_skip_len_mg(list); i++) {
-               SV** entryp = av_fetch(list, i, FALSE);
-               SV* entry;
-               UV uv;
-               if (entryp == NULL) {
-                   Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
-               }
-               entry = *entryp;
-               uv = SvUV(entry);
-               /*DEBUG_U(PerlIO_printf(Perl_debug_log, "list for %" UVXf " contains %" UVXf "\n", val, uv));*/
-               if (uv == val) {
-                   found_key = TRUE;
-               }
-               if (uv == inverse) {
-                   found_inverse = TRUE;
-               }
-
-               /* No need to continue searching if found everything we are
-                * looking for */
-               if (found_key && found_inverse) {
-                   break;
-               }
-           }
-
-           /* Make sure there is a mapping to itself on the list */
-           if (! found_key) {
-               av_push(list, newSVuv(val));
-               /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %" UVXf " to list for %" UVXf "\n", __FILE__, __LINE__, val, val));*/
-           }
-
-
-           /* Simply add the value to the list */
-           if (! found_inverse) {
-               av_push(list, newSVuv(inverse));
-               /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %" UVXf " to list for %" UVXf "\n", __FILE__, __LINE__, inverse, val));*/
-           }
-
-           /* swatch_get() increments the value of val for each element in the
-            * range.  That makes more compact tables possible.  You can
-            * express the capitalization, for example, of all consecutive
-            * letters with a single line: 0061\t007A\t0041 This maps 0061 to
-            * 0041, 0062 to 0042, etc.  I (khw) have never understood 'none',
-            * and it's not documented; it appears to be used only in
-            * implementing tr//; I copied the semantics from swatch_get(), just
-            * in case */
-           if (!none || val < none) {
-               ++val;
-           }
-       }
-    }
-
-    return ret;
-}
-
  SV*
  Perl__swash_to_invlist(pTHX_ SV* const swash)
  {
  SV*
  Perl__swash_to_invlist(pTHX_ SV* const swash)
  {
@@ -4930,7 +5221,7 @@ Perl__swash_to_invlist(pTHX_ SV* const swash)
           * size based on worst possible case, which is each line in the input
           * creates 2 elements in the inversion list: 1) the beginning of a
           * range in the list; 2) the beginning of a range not in the list.  */
           * size based on worst possible case, which is each line in the input
           * creates 2 elements in the inversion list: 1) the beginning of a
           * range in the list; 2) the beginning of a range not in the list.  */
-        while ((loc = (strchr(loc, '\n'))) != NULL) {
+        while ((loc = (char *) memchr(loc, '\n', lend - (U8 *) loc)) != NULL) {
              elements += 2;
              loc++;
          }
              elements += 2;
              loc++;
          }
@@ -5075,10 +5366,8 @@ Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
      /* May change: warns if surrogates, non-character code points, or
       * non-Unicode code points are in 's' which has length 'len' bytes.
       * Returns TRUE if none found; FALSE otherwise.  The only other validity
      /* May change: warns if surrogates, non-character code points, or
       * non-Unicode code points are in 's' which has length 'len' bytes.
       * Returns TRUE if none found; FALSE otherwise.  The only other validity
-     * check is to make sure that this won't exceed the string's length.
-     *
-     * Code points above the platform's C<IV_MAX> will raise a deprecation
-     * warning, unless those are turned off.  */
+     * check is to make sure that this won't exceed the string's length nor
+     * overflow */
  
      const U8* const e = s + len;
      bool ok = TRUE;
  
      const U8* const e = s + len;
      bool ok = TRUE;
@@ -5094,22 +5383,10 @@ Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
         if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) {
             if (UNLIKELY(UTF8_IS_SUPER(s, e))) {
                  if (   ckWARN_d(WARN_NON_UNICODE)
         if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) {
             if (UNLIKELY(UTF8_IS_SUPER(s, e))) {
                  if (   ckWARN_d(WARN_NON_UNICODE)
-                    || (   ckWARN_d(WARN_DEPRECATED)
-#ifndef UV_IS_QUAD
-                        && UNLIKELY(is_utf8_cp_above_31_bits(s, e))
-#else   /* Below is 64-bit words */
-                        /* 2**63 and up meet these conditions provided we have
-                         * a 64-bit word. */
-#   ifdef EBCDIC
-                        && *s == 0xFE
-                        && NATIVE_UTF8_TO_I8(s[1]) >= 0xA8
-#   else
-                        && *s == 0xFF
-                           /* s[1] being above 0x80 overflows */
-                        && s[2] >= 0x88
-#   endif
-#endif
-                )) {
+                    || UNLIKELY(0 < does_utf8_overflow(s, s + len,
+                                               0 /* Don't consider overlongs */
+                                               )))
+                {
                      /* A side effect of this function will be to warn */
                      (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_SUPER);
                      ok = FALSE;
                      /* A side effect of this function will be to warn */
                      (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_SUPER);
                      ok = FALSE;
@@ -5523,7 +5800,7 @@ Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
  {
      PERL_ARGS_ASSERT_UVUNI_TO_UTF8;
  
  {
      PERL_ARGS_ASSERT_UVUNI_TO_UTF8;
  
-    return Perl_uvoffuni_to_utf8_flags(aTHX_ d, uv, 0);
+    return uvoffuni_to_utf8_flags(d, uv, 0);
  }
  
  /*
  }
  
  /*