Remove trailing '/' from prefix

[perl5.git] / utf8.c
diff --git a/utf8.c b/utf8.c

index a784c54..07e4df7 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -36,10 +36,11 @@
  static const char malformed_text[] = "Malformed UTF-8 character";
  static const char unees[] =
                          "Malformed UTF-8 character (unexpected end of string)";
  static const char malformed_text[] = "Malformed UTF-8 character";
  static const char unees[] =
                          "Malformed UTF-8 character (unexpected end of string)";
-static const char cp_above_legal_max[] =
- "Use of code point 0x%" UVXf " is deprecated; the permissible max is 0x%" UVXf ". This will be fatal in Perl 5.28";
  
  
-#define MAX_NON_DEPRECATED_CP ((UV) (IV_MAX))
+/* Be sure to synchronize this message with the similar one in regcomp.c */
+static const char cp_above_legal_max[] =
+                        "Use of code point 0x%" UVXf " is not allowed; the"
+                        " permissible max is 0x%" UVXf;
  
  /*
  =head1 Unicode Support
  
  /*
  =head1 Unicode Support
@@ -52,6 +53,19 @@ within non-zero characters.
  =cut
  */
  
  =cut
  */
  
+/* helper for Perl__force_out_malformed_utf8_message(). Like
+ * SAVECOMPILEWARNINGS(), but works with PL_curcop rather than
+ * PL_compiling */
+
+static void
+S_restore_cop_warnings(pTHX_ void *p)
+{
+    if (!specialWARN(PL_curcop->cop_warnings))
+        PerlMemShared_free(PL_curcop->cop_warnings);
+    PL_curcop->cop_warnings = (STRLEN*)p;
+}
+
+
  void
  Perl__force_out_malformed_utf8_message(pTHX_
              const U8 *const p,      /* First byte in UTF-8 sequence */
  void
  Perl__force_out_malformed_utf8_message(pTHX_
              const U8 *const p,      /* First byte in UTF-8 sequence */
@@ -83,6 +97,10 @@ Perl__force_out_malformed_utf8_message(pTHX_
  
      PL_dowarn = G_WARN_ALL_ON|G_WARN_ON;
      if (PL_curcop) {
  
      PL_dowarn = G_WARN_ALL_ON|G_WARN_ON;
      if (PL_curcop) {
+        /* this is like SAVECOMPILEWARNINGS() except with PL_curcop rather
+         * than PL_compiling */
+        SAVEDESTRUCTOR_X(S_restore_cop_warnings,
+                (void*)PL_curcop->cop_warnings);
          PL_curcop->cop_warnings = pWARN_ALL;
      }
  
          PL_curcop->cop_warnings = pWARN_ALL;
      }
  
@@ -100,6 +118,29 @@ Perl__force_out_malformed_utf8_message(pTHX_
      }
  }
  
      }
  }
  
+STATIC HV *
+S_new_msg_hv(pTHX_ const char * const message, /* The message text */
+                   U32 categories,  /* Packed warning categories */
+                   U32 flag)        /* Flag associated with this message */
+{
+    /* Creates, populates, and returns an HV* that describes an error message
+     * for the translators between UTF8 and code point */
+
+    SV* msg_sv = newSVpv(message, 0);
+    SV* category_sv = newSVuv(categories);
+    SV* flag_bit_sv = newSVuv(flag);
+
+    HV* msg_hv = newHV();
+
+    PERL_ARGS_ASSERT_NEW_MSG_HV;
+
+    (void) hv_stores(msg_hv, "text", msg_sv);
+    (void) hv_stores(msg_hv, "warn_categories",  category_sv);
+    (void) hv_stores(msg_hv, "flag_bit", flag_bit_sv);
+
+    return msg_hv;
+}
+
  /*
  =for apidoc uvoffuni_to_utf8_flags
  
  /*
  =for apidoc uvoffuni_to_utf8_flags
  
@@ -116,23 +157,56 @@ For details, see the description for L</uvchr_to_utf8_flags>.
  =cut
  */
  
  =cut
  */
  
-#define HANDLE_UNICODE_SURROGATE(uv, flags)                         \
+U8 *
+Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
+{
+    PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
+
+    return uvoffuni_to_utf8_flags_msgs(d, uv, flags, NULL);
+}
+
+/* All these formats take a single UV code point argument */
+const char surrogate_cp_format[] = "UTF-16 surrogate U+%04" UVXf;
+const char nonchar_cp_format[]   = "Unicode non-character U+%04" UVXf
+                                   " is not recommended for open interchange";
+const char super_cp_format[]     = "Code point 0x%" UVXf " is not Unicode,"
+                                   " may not be portable";
+const char perl_extended_cp_format[] = "Code point 0x%" UVXf " is not"        \
+                                       " Unicode, requires a Perl extension," \
+                                       " and so is not portable";
+
+#define HANDLE_UNICODE_SURROGATE(uv, flags, msgs)                   \
      STMT_START {                                                    \
          if (flags & UNICODE_WARN_SURROGATE) {                       \
      STMT_START {                                                    \
          if (flags & UNICODE_WARN_SURROGATE) {                       \
-            Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE),        \
-                                "UTF-16 surrogate U+%04" UVXf, uv); \
+            U32 category = packWARN(WARN_SURROGATE);                \
+            const char * format = surrogate_cp_format;              \
+            if (msgs) {                                             \
+                *msgs = new_msg_hv(Perl_form(aTHX_ format, uv),     \
+                                   category,                        \
+                                   UNICODE_GOT_SURROGATE);          \
+            }                                                       \
+            else {                                                  \
+                Perl_ck_warner_d(aTHX_ category, format, uv);       \
+            }                                                       \
          }                                                           \
          if (flags & UNICODE_DISALLOW_SURROGATE) {                   \
              return NULL;                                            \
          }                                                           \
      } STMT_END;
  
          }                                                           \
          if (flags & UNICODE_DISALLOW_SURROGATE) {                   \
              return NULL;                                            \
          }                                                           \
      } STMT_END;
  
-#define HANDLE_UNICODE_NONCHAR(uv, flags)                           \
+#define HANDLE_UNICODE_NONCHAR(uv, flags, msgs)                     \
      STMT_START {                                                    \
          if (flags & UNICODE_WARN_NONCHAR) {                         \
      STMT_START {                                                    \
          if (flags & UNICODE_WARN_NONCHAR) {                         \
-            Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR),          \
-                "Unicode non-character U+%04" UVXf " is not "      \
-                 "recommended for open interchange", uv);           \
+            U32 category = packWARN(WARN_NONCHAR);                  \
+            const char * format = nonchar_cp_format;                \
+            if (msgs) {                                             \
+                *msgs = new_msg_hv(Perl_form(aTHX_ format, uv),     \
+                                   category,                        \
+                                   UNICODE_GOT_NONCHAR);            \
+            }                                                       \
+            else {                                                  \
+                Perl_ck_warner_d(aTHX_ category, format, uv);       \
+            }                                                       \
          }                                                           \
          if (flags & UNICODE_DISALLOW_NONCHAR) {                     \
              return NULL;                                            \
          }                                                           \
          if (flags & UNICODE_DISALLOW_NONCHAR) {                     \
              return NULL;                                            \
@@ -145,10 +219,62 @@ For details, see the description for L</uvchr_to_utf8_flags>.
  #define MARK    UTF_CONTINUATION_MARK
  #define MASK    UTF_CONTINUATION_MASK
  
  #define MARK    UTF_CONTINUATION_MARK
  #define MASK    UTF_CONTINUATION_MASK
  
+/*
+=for apidoc uvchr_to_utf8_flags_msgs
+
+THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
+
+Most code should use C<L</uvchr_to_utf8_flags>()> rather than call this directly.
+
+This function is for code that wants any warning and/or error messages to be
+returned to the caller rather than be displayed.  All messages that would have
+been displayed if all lexical warnings are enabled will be returned.
+
+It is just like C<L</uvchr_to_utf8_flags>> but it takes an extra parameter
+placed after all the others, C<msgs>.  If this parameter is 0, this function
+behaves identically to C<L</uvchr_to_utf8_flags>>.  Otherwise, C<msgs> should
+be a pointer to an C<HV *> variable, in which this function creates a new HV to
+contain any appropriate messages.  The hash has three key-value pairs, as
+follows:
+
+=over 4
+
+=item C<text>
+
+The text of the message as a C<SVpv>.
+
+=item C<warn_categories>
+
+The warning category (or categories) packed into a C<SVuv>.
+
+=item C<flag_bit>
+
+A single flag bit associated with this message, in a C<SVuv>.
+The bit corresponds to some bit in the C<*errors> return value,
+such as C<UNICODE_GOT_SURROGATE>.
+
+=back
+
+It's important to note that specifying this parameter as non-null will cause
+any warnings this function would otherwise generate to be suppressed, and
+instead be placed in C<*msgs>.  The caller can check the lexical warnings state
+(or not) when choosing what to do with the returned messages.
+
+The caller, of course, is responsible for freeing any returned HV.
+
+=cut
+*/
+
+/* Undocumented; we don't want people using this.  Instead they should use
+ * uvchr_to_utf8_flags_msgs() */
  U8 *
  U8 *
-Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
+Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV uv, const UV flags, HV** msgs)
  {
  {
-    PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
+    PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS_MSGS;
+
+    if (msgs) {
+        *msgs = NULL;
+    }
  
      if (OFFUNI_IS_INVARIANT(uv)) {
         *d++ = LATIN1_TO_NATIVE(uv);
  
      if (OFFUNI_IS_INVARIANT(uv)) {
         *d++ = LATIN1_TO_NATIVE(uv);
@@ -180,10 +306,10 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
              if (UNLIKELY(   UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)
                           || UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
              {
              if (UNLIKELY(   UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)
                           || UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
              {
-                HANDLE_UNICODE_NONCHAR(uv, flags);
+                HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
              }
              else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
              }
              else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
-                HANDLE_UNICODE_SURROGATE(uv, flags);
+                HANDLE_UNICODE_SURROGATE(uv, flags, msgs);
              }
          }
  #endif
              }
          }
  #endif
@@ -198,33 +324,44 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
       * performance hit on these high EBCDIC code points. */
  
      if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
       * performance hit on these high EBCDIC code points. */
  
      if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
-        if (   UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
-            && ckWARN_d(WARN_DEPRECATED))
-        {
-            Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
-                        cp_above_legal_max, uv, MAX_NON_DEPRECATED_CP);
+        if (UNLIKELY(uv > MAX_LEGAL_CP)) {
+            Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_LEGAL_CP);
          }
          }
-        if (   (flags & UNICODE_WARN_SUPER)
-            || (   UNICODE_IS_ABOVE_31_BIT(uv)
-                && (flags & UNICODE_WARN_ABOVE_31_BIT)))
+        if (       (flags & UNICODE_WARN_SUPER)
+            || (   (flags & UNICODE_WARN_PERL_EXTENDED)
+                && UNICODE_IS_PERL_EXTENDED(uv)))
          {
          {
-            Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
+            const char * format = super_cp_format;
+            U32 category = packWARN(WARN_NON_UNICODE);
+            U32 flag = UNICODE_GOT_SUPER;
+
+            /* Choose the more dire applicable warning */
+            if (UNICODE_IS_PERL_EXTENDED(uv)) {
+                format = perl_extended_cp_format;
+                if (flags & (UNICODE_WARN_PERL_EXTENDED
+                            |UNICODE_DISALLOW_PERL_EXTENDED))
+                {
+                    flag = UNICODE_GOT_PERL_EXTENDED;
+                }
+            }
  
  
-              /* Choose the more dire applicable warning */
-              (UNICODE_IS_ABOVE_31_BIT(uv))
-              ? "Code point 0x%" UVXf " is not Unicode, and not portable"
-              : "Code point 0x%" UVXf " is not Unicode, may not be portable",
-             uv);
+            if (msgs) {
+                *msgs = new_msg_hv(Perl_form(aTHX_ format, uv),
+                                   category, flag);
+            }
+            else {
+                Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE), format, uv);
+            }
          }
          }
-        if (flags & UNICODE_DISALLOW_SUPER
-            || (   UNICODE_IS_ABOVE_31_BIT(uv)
-                && (flags & UNICODE_DISALLOW_ABOVE_31_BIT)))
+        if (       (flags & UNICODE_DISALLOW_SUPER)
+            || (   (flags & UNICODE_DISALLOW_PERL_EXTENDED)
+                &&  UNICODE_IS_PERL_EXTENDED(uv)))
          {
              return NULL;
          }
      }
      else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) {
          {
              return NULL;
          }
      }
      else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) {
-        HANDLE_UNICODE_NONCHAR(uv, flags);
+        HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
      }
  
      /* Test for and handle 4-byte result.   In the test immediately below, the
      }
  
      /* Test for and handle 4-byte result.   In the test immediately below, the
@@ -243,10 +380,10 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
                     characters.  The end-plane non-characters for EBCDIC were
                     handled just above */
          if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv))) {
                     characters.  The end-plane non-characters for EBCDIC were
                     handled just above */
          if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv))) {
-            HANDLE_UNICODE_NONCHAR(uv, flags);
+            HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
          }
          else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
          }
          else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
-            HANDLE_UNICODE_SURROGATE(uv, flags);
+            HANDLE_UNICODE_SURROGATE(uv, flags, msgs);
          }
  #endif
  
          }
  #endif
  
@@ -264,8 +401,8 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
         STRLEN len  = OFFUNISKIP(uv);
         U8 *p = d+len-1;
         while (p > d) {
         STRLEN len  = OFFUNISKIP(uv);
         U8 *p = d+len-1;
         while (p > d) {
-           *p-- = I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
-           uv >>= UTF_ACCUMULATION_SHIFT;
+           *p-- = I8_TO_NATIVE_UTF8((uv & MASK) | MARK);
+           uv >>= SHIFT;
         }
         *p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
         return d+len;
         }
         *p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
         return d+len;
@@ -286,9 +423,8 @@ is the recommended wide native character-aware way of saying
  
      *(d++) = uv;
  
  
      *(d++) = uv;
  
-This function accepts any UV as input, but very high code points (above
-C<IV_MAX> on the platform)  will raise a deprecation warning.  This is
-typically 0x7FFF_FFFF in a 32-bit word.
+This function accepts any code point from 0..C<IV_MAX> as input.
+C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
  
  It is possible to forbid or warn on non-Unicode code points, or those that may
  be problematic by using L</uvchr_to_utf8_flags>.
  
  It is possible to forbid or warn on non-Unicode code points, or those that may
  be problematic by using L</uvchr_to_utf8_flags>.
@@ -323,9 +459,8 @@ This is the Unicode-aware way of saying
  
      *(d++) = uv;
  
  
      *(d++) = uv;
  
-If C<flags> is 0, this function accepts any UV as input, but very high code
-points (above C<IV_MAX> for the platform)  will raise a deprecation warning.
-This is typically 0x7FFF_FFFF in a 32-bit word.
+If C<flags> is 0, this function accepts any code point from 0..C<IV_MAX> as
+input.  C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
  
  Specifying C<flags> can further restrict what is allowed and not warned on, as
  follows:
  
  Specifying C<flags> can further restrict what is allowed and not warned on, as
  follows:
@@ -354,30 +489,25 @@ defined in
  L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
  See L<perlunicode/Noncharacter code points>.
  
  L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
  See L<perlunicode/Noncharacter code points>.
  
-Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
-so using them is more problematic than other above-Unicode code points.  Perl
-invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
-likely that non-Perl languages will not be able to read files that contain
-these that written by the perl interpreter; nor would Perl understand files
-written by something that uses a different extension.  For these reasons, there
-is a separate set of flags that can warn and/or disallow these extremely high
-code points, even if other above-Unicode ones are accepted.  These are the
-C<UNICODE_WARN_ABOVE_31_BIT> and C<UNICODE_DISALLOW_ABOVE_31_BIT> flags.  These
-are entirely independent from the deprecation warning for code points above
-C<IV_MAX>.  On 32-bit machines, it will eventually be forbidden to have any
-code point that needs more than 31 bits to represent.  When that happens,
-effectively the C<UNICODE_DISALLOW_ABOVE_31_BIT> flag will always be set on
-32-bit machines.  (Of course C<UNICODE_DISALLOW_SUPER> will treat all
-above-Unicode code points, including these, as malformations; and
-C<UNICODE_WARN_SUPER> warns on these.)
-
-On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
-extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
-than on ASCII.  Prior to that, code points 2**31 and higher were simply
-unrepresentable, and a different, incompatible method was used to represent
-code points between 2**30 and 2**31 - 1.  The flags C<UNICODE_WARN_ABOVE_31_BIT>
-and C<UNICODE_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
-platforms, warning and disallowing 2**31 and higher.
+Extremely high code points were never specified in any standard, and require an
+extension to UTF-8 to express, which Perl does.  It is likely that programs
+written in something other than Perl would not be able to read files that
+contain these; nor would Perl understand files written by something that uses a
+different extension.  For these reasons, there is a separate set of flags that
+can warn and/or disallow these extremely high code points, even if other
+above-Unicode ones are accepted.  They are the C<UNICODE_WARN_PERL_EXTENDED>
+and C<UNICODE_DISALLOW_PERL_EXTENDED> flags.  For more information see
+L</C<UTF8_GOT_PERL_EXTENDED>>.  Of course C<UNICODE_DISALLOW_SUPER> will
+treat all above-Unicode code points, including these, as malformations.  (Note
+that the Unicode standard considers anything above 0x10FFFF to be illegal, but
+there are standards predating it that allow up to 0x7FFF_FFFF (2**31 - 1).)
+
+A somewhat misleadingly named synonym for C<UNICODE_WARN_PERL_EXTENDED> is
+retained for backward compatibility: C<UNICODE_WARN_ABOVE_31_BIT>.  Similarly,
+C<UNICODE_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
+C<UNICODE_DISALLOW_PERL_EXTENDED>.  The names are misleading because on EBCDIC
+platforms, these flags can apply to code points that actually do fit in 31 bits.
+The new names accurately describe the situation in all cases.
  
  =cut
  */
  
  =cut
  */
@@ -391,8 +521,12 @@ Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
      return uvchr_to_utf8_flags(d, uv, flags);
  }
  
      return uvchr_to_utf8_flags(d, uv, flags);
  }
  
-PERL_STATIC_INLINE bool
-S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e)
+#ifndef UV_IS_QUAD
+
+STATIC int
+S_is_utf8_cp_above_31_bits(const U8 * const s,
+                           const U8 * const e,
+                           const bool consider_overlongs)
  {
      /* Returns TRUE if the first code point represented by the Perl-extended-
       * UTF-8-encoded string starting at 's', and looking no further than 'e -
  {
      /* Returns TRUE if the first code point represented by the Perl-extended-
       * UTF-8-encoded string starting at 's', and looking no further than 'e -
@@ -404,165 +538,170 @@ S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e)
       * the final ones necessary for the complete representation may be beyond
       * 'e - 1'.
       *
       * the final ones necessary for the complete representation may be beyond
       * 'e - 1'.
       *
-     * The function assumes that the sequence is well-formed UTF-8 as far as it
-     * goes, and is for a UTF-8 variant code point.  If the sequence is
-     * incomplete, the function returns FALSE if there is any well-formed
-     * UTF-8 byte sequence that can complete it in such a way that a code point
-     * < 2**31 is produced; otherwise it returns TRUE.
-     *
-     * Getting this exactly right is slightly tricky, and has to be done in
-     * several places in this file, so is centralized here.  It is based on the
-     * following table:
+     * The function also can handle the case where the input is an overlong
+     * sequence.  If 'consider_overlongs' is 0, the function assumes the
+     * input is not overlong, without checking, and will return based on that
+     * assumption.  If this parameter is 1, the function will go to the trouble
+     * of figuring out if it actually evaluates to above or below 31 bits.
       *
       *
-     * U+7FFFFFFF (2 ** 31 - 1)
-     *      ASCII: \xFD\xBF\xBF\xBF\xBF\xBF
-     *   IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73
-     *    IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72
-     *   POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75
-     *         I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF
-     * U+80000000 (2 ** 31):
-     *      ASCII: \xFE\x82\x80\x80\x80\x80\x80
-     *              [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10  11  12  13
-     *   IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
-     *    IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
-     *   POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
-     *         I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0
+     * The sequence is otherwise assumed to be well-formed, without checking.
       */
  
       */
  
-#ifdef EBCDIC
-
-    /* [0] is start byte  [1] [2] [3] [4] [5] [6] [7] */
-    const U8 prefix[] = "\x41\x41\x41\x41\x41\x41\x42";
-    const STRLEN prefix_len = sizeof(prefix) - 1;
      const STRLEN len = e - s;
      const STRLEN len = e - s;
-    const STRLEN cmp_len = MIN(prefix_len, len - 1);
+    int is_overlong;
  
  
-#else
-
-    PERL_UNUSED_ARG(e);
+    PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS;
  
  
-#endif
+    assert(! UTF8_IS_INVARIANT(*s) && e > s);
  
  
-    PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS;
+#ifdef EBCDIC
  
  
-    assert(! UTF8_IS_INVARIANT(*s));
+    PERL_UNUSED_ARG(consider_overlongs);
  
  
-#ifndef EBCDIC
+    /* On the EBCDIC code pages we handle, only the native start byte 0xFE can
+     * mean a 32-bit or larger code point (0xFF is an invariant).  0xFE can
+     * also be the start byte for a 31-bit code point; we need at least 2
+     * bytes, and maybe up through 8 bytes, to determine that.  (It can also be
+     * the start byte for an overlong sequence, but for 30-bit or smaller code
+     * points, so we don't have to worry about overlongs on EBCDIC.) */
+    if (*s != 0xFE) {
+        return 0;
+    }
  
  
-    /* Technically, a start byte of FE can be for a code point that fits into
-     * 31 bytes, but not for well-formed UTF-8: doing that requires an overlong
-     * malformation. */
-    return (*s >= 0xFE);
+    if (len == 1) {
+        return -1;
+    }
  
  #else
  
  
  #else
  
-    /* On the EBCDIC code pages we handle, only 0xFE can mean a 32-bit or
-     * larger code point (0xFF is an invariant).  For 0xFE, we need at least 2
-     * bytes, and maybe up through 8 bytes, to be sure if the value is above 31
-     * bits. */
-    if (*s != 0xFE || len == 1) {
-        return FALSE;
+    /* On ASCII, FE and FF are the only start bytes that can evaluate to
+     * needing more than 31 bits. */
+    if (LIKELY(*s < 0xFE)) {
+        return 0;
      }
  
      }
  
-    /* Note that in UTF-EBCDIC, the two lowest possible continuation bytes are
-     * \x41 and \x42. */
-    return cBOOL(memGT(s + 1, prefix, cmp_len));
-
-#endif
+    /* What we have left are FE and FF.  Both of these require more than 31
+     * bits unless they are for overlongs. */
+    if (! consider_overlongs) {
+        return 1;
+    }
  
  
-}
+    /* Here, we have FE or FF.  If the input isn't overlong, it evaluates to
+     * above 31 bits.  But we need more than one byte to discern this, so if
+     * passed just the start byte, it could be an overlong evaluating to
+     * smaller */
+    if (len == 1) {
+        return -1;
+    }
  
  
-/* Anything larger than this will overflow the word if it were converted into a UV */
-#if defined(UV_IS_QUAD)
-#  ifdef EBCDIC     /* Actually is I8 */
-#   define HIGHEST_REPRESENTABLE_UTF8                                       \
-                "\xFF\xAF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
-#  else
-#   define HIGHEST_REPRESENTABLE_UTF8                                       \
-                "\xFF\x80\x8F\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
-#  endif
-#else   /* 32-bit */
-#  ifdef EBCDIC
-#   define HIGHEST_REPRESENTABLE_UTF8                                       \
-                "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3\xBF\xBF\xBF\xBF\xBF\xBF"
-#  else
-#   define HIGHEST_REPRESENTABLE_UTF8  "\xFE\x83\xBF\xBF\xBF\xBF\xBF"
-#  endif
-#endif
+    /* Having excluded len==1, and knowing that FE and FF are both valid start
+     * bytes, we can call the function below to see if the sequence is
+     * overlong.  (We don't need the full generality of the called function,
+     * but for these huge code points, speed shouldn't be a consideration, and
+     * the compiler does have enough information, since it's static to this
+     * file, to optimize to just the needed parts.) */
+    is_overlong = is_utf8_overlong_given_start_byte_ok(s, len);
  
  
-PERL_STATIC_INLINE bool
-S_does_utf8_overflow(const U8 * const s, const U8 * e)
-{
-    const U8 *x;
-    const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
+    /* If it isn't overlong, more than 31 bits are required. */
+    if (is_overlong == 0) {
+        return 1;
+    }
  
  
-#if ! defined(UV_IS_QUAD) && ! defined(EBCDIC)
+    /* If it is indeterminate if it is overlong, return that */
+    if (is_overlong < 0) {
+        return -1;
+    }
  
  
-    const STRLEN len = e - s;
+    /* Here is overlong.  Such a sequence starting with FE is below 31 bits, as
+     * the max it can be is 2**31 - 1 */
+    if (*s == 0xFE) {
+        return 0;
+    }
  
  #endif
  
  
  #endif
  
-    /* Returns a boolean as to if this UTF-8 string would overflow a UV on this
-     * platform, that is if it represents a code point larger than the highest
-     * representable code point.  (For ASCII platforms, we could use memcmp()
-     * because we don't have to convert each byte to I8, but it's very rare
-     * input indeed that would approach overflow, so the loop below will likely
-     * only get executed once.
-     *
-     * 'e' must not be beyond a full character.  If it is less than a full
-     * character, the function returns FALSE if there is any input beyond 'e'
-     * that could result in a non-overflowing code point */
+    /* Here, ASCII and EBCDIC rejoin:
+    *  On ASCII:   We have an overlong sequence starting with FF
+    *  On EBCDIC:  We have a sequence starting with FE. */
  
  
-    PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
-    assert(s <= e && s + UTF8SKIP(s) >= e);
+    {   /* For C89, use a block so the declaration can be close to its use */
  
  
-#if ! defined(UV_IS_QUAD) && ! defined(EBCDIC)
+#ifdef EBCDIC
  
  
-    /* On 32 bit ASCII machines, many overlongs that start with FF don't
-     * overflow */
+        /* U+7FFFFFFF (2 ** 31 - 1)
+         *              [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10  11  12  13
+         *   IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73
+         *    IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72
+         *   POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75
+         *         I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF
+         * U+80000000 (2 ** 31):
+         *   IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
+         *    IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
+         *   POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
+         *         I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0
+         *
+         * and since we know that *s = \xfe, any continuation sequence
+         * following it that is gt the below is above 31 bits
+                                                [0] [1] [2] [3] [4] [5] [6] */
+        const U8 conts_for_highest_30_bit[] = "\x41\x41\x41\x41\x41\x41\x42";
  
  
-    if (isFF_OVERLONG(s, len)) {
-        const U8 max_32_bit_overlong[] = "\xFF\x80\x80\x80\x80\x80\x80\x84";
-        return memGE(s, max_32_bit_overlong,
-                                MIN(len, sizeof(max_32_bit_overlong) - 1));
-    }
+#else
  
  
-#endif
+        /* FF overlong for U+7FFFFFFF (2 ** 31 - 1)
+         *      ASCII: \xFF\x80\x80\x80\x80\x80\x80\x81\xBF\xBF\xBF\xBF\xBF
+         * FF overlong for U+80000000 (2 ** 31):
+         *      ASCII: \xFF\x80\x80\x80\x80\x80\x80\x82\x80\x80\x80\x80\x80
+         * and since we know that *s = \xff, any continuation sequence
+         * following it that is gt the below is above 31 bits
+                                                [0] [1] [2] [3] [4] [5] [6] */
+        const U8 conts_for_highest_30_bit[] = "\x80\x80\x80\x80\x80\x80\x81";
  
  
-    for (x = s; x < e; x++, y++) {
  
  
-        /* If this byte is larger than the corresponding highest UTF-8 byte, it
-         * overflows */
-        if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) {
-            return TRUE;
+#endif
+        const STRLEN conts_len = sizeof(conts_for_highest_30_bit) - 1;
+        const STRLEN cmp_len = MIN(conts_len, len - 1);
+
+        /* Now compare the continuation bytes in s with the ones we have
+         * compiled in that are for the largest 30 bit code point.  If we have
+         * enough bytes available to determine the answer, or the bytes we do
+         * have differ from them, we can compare the two to get a definitive
+         * answer (Note that in UTF-EBCDIC, the two lowest possible
+         * continuation bytes are \x41 and \x42.) */
+        if (cmp_len >= conts_len || memNE(s + 1,
+                                          conts_for_highest_30_bit,
+                                          cmp_len))
+        {
+            return cBOOL(memGT(s + 1, conts_for_highest_30_bit, cmp_len));
          }
  
          }
  
-        /* If not the same as this byte, it must be smaller, doesn't overflow */
-        if (LIKELY(NATIVE_UTF8_TO_I8(*x) != *y)) {
-            return FALSE;
-        }
+        /* Here, all the bytes we have are the same as the highest 30-bit code
+         * point, but we are missing so many bytes that we can't make the
+         * determination */
+        return -1;
      }
      }
-
-    /* Got to the end and all bytes are the same.  If the input is a whole
-     * character, it doesn't overflow.  And if it is a partial character,
-     * there's not enough information to tell, so assume doesn't overflow */
-    return FALSE;
  }
  
  }
  
-PERL_STATIC_INLINE bool
+#endif
+
+PERL_STATIC_INLINE int
  S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
  {
  S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
  {
-    /* Overlongs can occur whenever the number of continuation bytes
-     * changes.  That means whenever the number of leading 1 bits in a start
-     * byte increases from the next lower start byte.  That happens for start
-     * bytes C0, E0, F0, F8, FC, FE, and FF.  On modern perls, the following
-     * illegal start bytes have already been excluded, so don't need to be
-     * tested here;
+    /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
+     * 's' + 'len' - 1 is an overlong.  It returns 1 if it is an overlong; 0 if
+     * it isn't, and -1 if there isn't enough information to tell.  This last
+     * return value can happen if the sequence is incomplete, missing some
+     * trailing bytes that would form a complete character.  If there are
+     * enough bytes to make a definitive decision, this function does so.
+     * Usually 2 bytes are sufficient.
+     *
+     * Overlongs can occur whenever the number of continuation bytes changes.
+     * That means whenever the number of leading 1 bits in a start byte
+     * increases from the next lower start byte.  That happens for start bytes
+     * C0, E0, F0, F8, FC, FE, and FF.  On modern perls, the following illegal
+     * start bytes have already been excluded, so don't need to be tested here;
       * ASCII platforms: C0, C1
       * EBCDIC platforms C0, C1, C2, C3, C4, E0
       * ASCII platforms: C0, C1
       * EBCDIC platforms C0, C1, C2, C3, C4, E0
-     *
-     * At least a second byte is required to determine if other sequences will
-     * be an overlong. */
+     */
  
      const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
      const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
  
      const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
      const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
@@ -587,7 +726,7 @@ S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
  #       else
  
      if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
  #       else
  
      if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
-        return TRUE;
+        return 1;
      }
  
  #           define F0_ABOVE_OVERLONG 0x90
      }
  
  #           define F0_ABOVE_OVERLONG 0x90
@@ -603,27 +742,178 @@ S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
          || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
          || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
      {
          || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
          || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
      {
-        return TRUE;
+        return 1;
      }
  
      /* Check for the FF overlong */
      return isFF_OVERLONG(s, len);
  }
  
      }
  
      /* Check for the FF overlong */
      return isFF_OVERLONG(s, len);
  }
  
-PERL_STATIC_INLINE bool
+PERL_STATIC_INLINE int
  S_isFF_OVERLONG(const U8 * const s, const STRLEN len)
  {
  S_isFF_OVERLONG(const U8 * const s, const STRLEN len)
  {
+    /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
+     * 's' + 'len' - 1 is an overlong beginning with \xFF.  It returns 1 if it
+     * is; 0 if it isn't, and -1 if there isn't enough information to tell.
+     * This last return value can happen if the sequence is incomplete, missing
+     * some trailing bytes that would form a complete character.  If there are
+     * enough bytes to make a definitive decision, this function does so. */
+
      PERL_ARGS_ASSERT_ISFF_OVERLONG;
  
      PERL_ARGS_ASSERT_ISFF_OVERLONG;
  
-    /* Check for the FF overlong.  This happens only if all these bytes match;
-     * what comes after them doesn't matter.  See tables in utf8.h,
+    /* To be an FF overlong, all the available bytes must match */
+    if (LIKELY(memNE(s, FF_OVERLONG_PREFIX,
+                     MIN(len, sizeof(FF_OVERLONG_PREFIX) - 1))))
+    {
+        return 0;
+    }
+
+    /* To be an FF overlong sequence, all the bytes in FF_OVERLONG_PREFIX must
+     * be there; what comes after them doesn't matter.  See tables in utf8.h,
       * utfebcdic.h. */
       * utfebcdic.h. */
+    if (len >= sizeof(FF_OVERLONG_PREFIX) - 1) {
+        return 1;
+    }
+
+    /* The missing bytes could cause the result to go one way or the other, so
+     * the result is indeterminate */
+    return -1;
+}
+
+#if defined(UV_IS_QUAD) /* These assume IV_MAX is 2**63-1 */
+#  ifdef EBCDIC     /* Actually is I8 */
+#   define HIGHEST_REPRESENTABLE_UTF8                                       \
+                "\xFF\xA7\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
+#  else
+#   define HIGHEST_REPRESENTABLE_UTF8                                       \
+                "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
+#  endif
+#endif
+
+PERL_STATIC_INLINE int
+S_does_utf8_overflow(const U8 * const s,
+                     const U8 * e,
+                     const bool consider_overlongs)
+{
+    /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
+     * 'e' - 1 would overflow an IV on this platform; that is if it represents
+     * a code point larger than the highest representable code point.  It
+     * returns 1 if it does overflow; 0 if it doesn't, and -1 if there isn't
+     * enough information to tell.  This last return value can happen if the
+     * sequence is incomplete, missing some trailing bytes that would form a
+     * complete character.  If there are enough bytes to make a definitive
+     * decision, this function does so.
+     *
+     * If 'consider_overlongs' is TRUE, the function checks for the possibility
+     * that the sequence is an overlong that doesn't overflow.  Otherwise, it
+     * assumes the sequence is not an overlong.  This can give different
+     * results only on ASCII 32-bit platforms.
+     *
+     * (For ASCII platforms, we could use memcmp() because we don't have to
+     * convert each byte to I8, but it's very rare input indeed that would
+     * approach overflow, so the loop below will likely only get executed once.)
+     *
+     * 'e' - 1 must not be beyond a full character. */
+
+
+    PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
+    assert(s <= e && s + UTF8SKIP(s) >= e);
+
+#if ! defined(UV_IS_QUAD)
+
+    return is_utf8_cp_above_31_bits(s, e, consider_overlongs);
+
+#else
+
+    PERL_UNUSED_ARG(consider_overlongs);
+
+    {
+        const STRLEN len = e - s;
+        const U8 *x;
+        const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
+
+        for (x = s; x < e; x++, y++) {
+
+            if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) == *y)) {
+                continue;
+            }
+
+            /* If this byte is larger than the corresponding highest UTF-8
+             * byte, the sequence overflows; otherwise the byte is less than,
+             * and so the sequence doesn't overflow */
+            return NATIVE_UTF8_TO_I8(*x) > *y;
+
+        }
+
+        /* Got to the end and all bytes are the same.  If the input is a whole
+         * character, it doesn't overflow.  And if it is a partial character,
+         * there's not enough information to tell */
+        if (len < sizeof(HIGHEST_REPRESENTABLE_UTF8) - 1) {
+            return -1;
+        }
+
+        return 0;
+    }
+
+#endif
  
  
-    return    len >= sizeof(FF_OVERLONG_PREFIX) - 1
-           && UNLIKELY(memEQ(s, FF_OVERLONG_PREFIX,
-                                            sizeof(FF_OVERLONG_PREFIX) - 1));
  }
  
  }
  
+#if 0
+
+/* These are the portions of the above function that deal with UV_MAX instead
+ * of IV_MAX.  They are left here in case we want to combine them so that
+ * internal uses can have larger code points.  The only logic difference is
+ * that the 32-bit EBCDIC platform is treated like the 64-bit, and the 32-bit
+ * ASCII has different logic.
+ */
+
+/* Anything larger than this will overflow the word if it were converted into a UV */
+#if defined(UV_IS_QUAD)
+#  ifdef EBCDIC     /* Actually is I8 */
+#   define HIGHEST_REPRESENTABLE_UTF8                                       \
+                "\xFF\xAF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
+#  else
+#   define HIGHEST_REPRESENTABLE_UTF8                                       \
+                "\xFF\x80\x8F\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
+#  endif
+#else   /* 32-bit */
+#  ifdef EBCDIC
+#   define HIGHEST_REPRESENTABLE_UTF8                                       \
+                "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3\xBF\xBF\xBF\xBF\xBF\xBF"
+#  else
+#   define HIGHEST_REPRESENTABLE_UTF8  "\xFE\x83\xBF\xBF\xBF\xBF\xBF"
+#  endif
+#endif
+
+#if ! defined(UV_IS_QUAD) && ! defined(EBCDIC)
+
+    /* On 32 bit ASCII machines, many overlongs that start with FF don't
+     * overflow */
+    if (consider_overlongs && isFF_OVERLONG(s, len) > 0) {
+
+        /* To be such an overlong, the first bytes of 's' must match
+         * FF_OVERLONG_PREFIX, which is "\xff\x80\x80\x80\x80\x80\x80".  If we
+         * don't have any additional bytes available, the sequence, when
+         * completed might or might not fit in 32 bits.  But if we have that
+         * next byte, we can tell for sure.  If it is <= 0x83, then it does
+         * fit. */
+        if (len <= sizeof(FF_OVERLONG_PREFIX) - 1) {
+            return -1;
+        }
+
+        return s[sizeof(FF_OVERLONG_PREFIX) - 1] > 0x83;
+    }
+
+/* Starting with the #else, the rest of the function is identical except
+ *      1.  we need to move the 'len' declaration to be global to the function
+ *      2.  the endif move to just after the UNUSED_ARG.
+ * An empty endif is given just below to satisfy the preprocessor
+ */
+#endif
+
+#endif
+
  #undef F0_ABOVE_OVERLONG
  #undef F8_ABOVE_OVERLONG
  #undef FC_ABOVE_OVERLONG
  #undef F0_ABOVE_OVERLONG
  #undef F8_ABOVE_OVERLONG
  #undef FC_ABOVE_OVERLONG
@@ -652,8 +942,8 @@ Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
       * return will be larger than 'e - s'.
       *
       * This function assumes that the code point represented is UTF-8 variant.
       * return will be larger than 'e - s'.
       *
       * This function assumes that the code point represented is UTF-8 variant.
-     * The caller should have excluded this possibility before calling this
-     * function.
+     * The caller should have excluded the possibility of it being invariant
+     * before calling this function.
       *
       * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
       * accepted by L</utf8n_to_uvchr>.  If non-zero, this function will return
       *
       * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
       * accepted by L</utf8n_to_uvchr>.  If non-zero, this function will return
@@ -671,7 +961,7 @@ Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
      PERL_ARGS_ASSERT__IS_UTF8_CHAR_HELPER;
  
      assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
      PERL_ARGS_ASSERT__IS_UTF8_CHAR_HELPER;
  
      assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
-                          |UTF8_DISALLOW_ABOVE_31_BIT)));
+                          |UTF8_DISALLOW_PERL_EXTENDED)));
      assert(! UTF8_IS_INVARIANT(*s));
  
      /* A variant char must begin with a start byte */
      assert(! UTF8_IS_INVARIANT(*s));
  
      /* A variant char must begin with a start byte */
@@ -689,17 +979,29 @@ Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
      if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) {
          const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
  
      if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) {
          const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
  
-        /* The code below is derived from this table.  Keep in mind that legal
-         * continuation bytes range between \x80..\xBF for UTF-8, and
-         * \xA0..\xBF for I8.  Anything above those aren't continuation bytes.
-         * Hence, we don't have to test the upper edge because if any of those
-         * are encountered, the sequence is malformed, and will fail elsewhere
-         * in this function.
+        /* Here, we are disallowing some set of largish code points, and the
+         * first byte indicates the sequence is for a code point that could be
+         * in the excluded set.  We generally don't have to look beyond this or
+         * the second byte to see if the sequence is actually for one of the
+         * excluded classes.  The code below is derived from this table:
+         *
           *              UTF-8            UTF-EBCDIC I8
           *   U+D800: \xED\xA0\x80      \xF1\xB6\xA0\xA0      First surrogate
           *   U+DFFF: \xED\xBF\xBF      \xF1\xB7\xBF\xBF      Final surrogate
           * U+110000: \xF4\x90\x80\x80  \xF9\xA2\xA0\xA0\xA0  First above Unicode
           *
           *              UTF-8            UTF-EBCDIC I8
           *   U+D800: \xED\xA0\x80      \xF1\xB6\xA0\xA0      First surrogate
           *   U+DFFF: \xED\xBF\xBF      \xF1\xB7\xBF\xBF      Final surrogate
           * U+110000: \xF4\x90\x80\x80  \xF9\xA2\xA0\xA0\xA0  First above Unicode
           *
+         * Keep in mind that legal continuation bytes range between \x80..\xBF
+         * for UTF-8, and \xA0..\xBF for I8.  Anything above those aren't
+         * continuation bytes.  Hence, we don't have to test the upper edge
+         * because if any of those is encountered, the sequence is malformed,
+         * and would fail elsewhere in this function.
+         *
+         * The code here likewise assumes that there aren't other
+         * malformations; again the function should fail elsewhere because of
+         * these.  For example, an overlong beginning with FC doesn't actually
+         * have to be a super; it could actually represent a small code point,
+         * even U+0000.  But, since overlongs (and other malformations) are
+         * illegal, the function should return FALSE in either case.
           */
  
  #ifdef EBCDIC   /* On EBCDIC, these are actually I8 bytes */
           */
  
  #ifdef EBCDIC   /* On EBCDIC, these are actually I8 bytes */
@@ -709,10 +1011,12 @@ Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
  #  define IS_UTF8_2_BYTE_SURROGATE(s0, s1)       ((s0) == 0xF1              \
                                                         /* B6 and B7 */      \
                                                && ((s1) & 0xFE ) == 0xB6)
  #  define IS_UTF8_2_BYTE_SURROGATE(s0, s1)       ((s0) == 0xF1              \
                                                         /* B6 and B7 */      \
                                                && ((s1) & 0xFE ) == 0xB6)
+#  define isUTF8_PERL_EXTENDED(s)   (*s == I8_TO_NATIVE_UTF8(0xFF))
  #else
  #  define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER  0xF5
  #  define IS_UTF8_2_BYTE_SUPER(s0, s1)           ((s0) == 0xF4 && (s1) >= 0x90)
  #  define IS_UTF8_2_BYTE_SURROGATE(s0, s1)       ((s0) == 0xED && (s1) >= 0xA0)
  #else
  #  define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER  0xF5
  #  define IS_UTF8_2_BYTE_SUPER(s0, s1)           ((s0) == 0xF4 && (s1) >= 0x90)
  #  define IS_UTF8_2_BYTE_SURROGATE(s0, s1)       ((s0) == 0xED && (s1) >= 0xA0)
+#  define isUTF8_PERL_EXTENDED(s)   (*s >= 0xFE)
  #endif
  
          if (  (flags & UTF8_DISALLOW_SUPER)
  #endif
  
          if (  (flags & UTF8_DISALLOW_SUPER)
@@ -721,10 +1025,10 @@ Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
              return 0;           /* Above Unicode */
          }
  
              return 0;           /* Above Unicode */
          }
  
-        if (   (flags & UTF8_DISALLOW_ABOVE_31_BIT)
-            &&  UNLIKELY(is_utf8_cp_above_31_bits(s, e)))
+        if (   (flags & UTF8_DISALLOW_PERL_EXTENDED)
+            &&  UNLIKELY(isUTF8_PERL_EXTENDED(s)))
          {
          {
-            return 0;           /* Above 31 bits */
+            return 0;
          }
  
          if (len > 1) {
          }
  
          if (len > 1) {
@@ -759,13 +1063,16 @@ Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
  
      /* Here is syntactically valid.  Next, make sure this isn't the start of an
       * overlong. */
  
      /* Here is syntactically valid.  Next, make sure this isn't the start of an
       * overlong. */
-    if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len)) {
+    if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len) > 0) {
          return 0;
      }
  
      /* And finally, that the code point represented fits in a word on this
       * platform */
          return 0;
      }
  
      /* And finally, that the code point represented fits in a word on this
       * platform */
-    if (does_utf8_overflow(s, e)) {
+    if (0 < does_utf8_overflow(s, e,
+                               0 /* Don't consider overlongs */
+                              ))
+    {
          return 0;
      }
  
          return 0;
      }
  
@@ -773,10 +1080,10 @@ Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
  }
  
  char *
  }
  
  char *
-Perl__byte_dump_string(pTHX_ const U8 * s, const STRLEN len, const bool format)
+Perl__byte_dump_string(pTHX_ const U8 * const start, const STRLEN len, const bool format)
  {
      /* Returns a mortalized C string that is a displayable copy of the 'len'
  {
      /* Returns a mortalized C string that is a displayable copy of the 'len'
-     * bytes starting at 's'.  'format' gives how to display each byte.
+     * bytes starting at 'start'.  'format' gives how to display each byte.
       * Currently, there are only two formats, so it is currently a bool:
       *      0   \xab
       *      1    ab         (that is a space between two hex digit bytes)
       * Currently, there are only two formats, so it is currently a bool:
       *      0   \xab
       *      1    ab         (that is a space between two hex digit bytes)
@@ -784,7 +1091,8 @@ Perl__byte_dump_string(pTHX_ const U8 * s, const STRLEN len, const bool format)
  
      const STRLEN output_len = 4 * len + 1;  /* 4 bytes per each input, plus a
                                                 trailing NUL */
  
      const STRLEN output_len = 4 * len + 1;  /* 4 bytes per each input, plus a
                                                 trailing NUL */
-    const U8 * const e = s + len;
+    const U8 * s = start;
+    const U8 * const e = start + len;
      char * output;
      char * d;
  
      char * output;
      char * d;
  
@@ -794,12 +1102,14 @@ Perl__byte_dump_string(pTHX_ const U8 * s, const STRLEN len, const bool format)
      SAVEFREEPV(output);
  
      d = output;
      SAVEFREEPV(output);
  
      d = output;
-    for (; s < e; s++) {
+    for (s = start; s < e; s++) {
          const unsigned high_nibble = (*s & 0xF0) >> 4;
          const unsigned low_nibble =  (*s & 0x0F);
  
          if (format) {
          const unsigned high_nibble = (*s & 0xF0) >> 4;
          const unsigned low_nibble =  (*s & 0x0F);
  
          if (format) {
-            *d++ = ' ';
+            if (s > start) {
+                *d++ = ' ';
+            }
          }
          else {
              *d++ = '\\';
          }
          else {
              *d++ = '\\';
@@ -828,7 +1138,7 @@ Perl__byte_dump_string(pTHX_ const U8 * s, const STRLEN len, const bool format)
  PERL_STATIC_INLINE char *
  S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
  
  PERL_STATIC_INLINE char *
  S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
  
-                                         /* How many bytes to print */
+                                         /* Max number of bytes to print */
                                           STRLEN print_len,
  
                                           /* Which one is the non-continuation */
                                           STRLEN print_len,
  
                                           /* Which one is the non-continuation */
@@ -844,6 +1154,8 @@ S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
                                 ? "immediately"
                                 : Perl_form(aTHX_ "%d bytes",
                                                   (int) non_cont_byte_pos);
                                 ? "immediately"
                                 : Perl_form(aTHX_ "%d bytes",
                                                   (int) non_cont_byte_pos);
+    const U8 * x = s + non_cont_byte_pos;
+    const U8 * e = s + print_len;
  
      PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
  
  
      PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
  
@@ -851,10 +1163,20 @@ S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
       * calculated, it's likely faster to pass it; verify under DEBUGGING */
      assert(expect_len == UTF8SKIP(s));
  
       * calculated, it's likely faster to pass it; verify under DEBUGGING */
      assert(expect_len == UTF8SKIP(s));
  
+    /* As a defensive coding measure, don't output anything past a NUL.  Such
+     * bytes shouldn't be in the middle of a malformation, and could mark the
+     * end of the allocated string, and what comes after is undefined */
+    for (; x < e; x++) {
+        if (*x == '\0') {
+            x++;            /* Output this particular NUL */
+            break;
+        }
+    }
+
      return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
                             " %s after start byte 0x%02x; need %d bytes, got %d)",
                             malformed_text,
      return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
                             " %s after start byte 0x%02x; need %d bytes, got %d)",
                             malformed_text,
-                           _byte_dump_string(s, print_len, 0),
+                           _byte_dump_string(s, x - s, 0),
                             *(s + non_cont_byte_pos),
                             where,
                             *s,
                             *(s + non_cont_byte_pos),
                             where,
                             *s,
@@ -882,7 +1204,7 @@ is the next possible position in C<s> that could begin a non-malformed
  character.  Also, if UTF-8 warnings haven't been lexically disabled, a warning
  is raised.  Some UTF-8 input sequences may contain multiple malformations.
  This function tries to find every possible one in each call, so multiple
  character.  Also, if UTF-8 warnings haven't been lexically disabled, a warning
  is raised.  Some UTF-8 input sequences may contain multiple malformations.
  This function tries to find every possible one in each call, so multiple
-warnings can be raised for each sequence.
+warnings can be raised for the same sequence.
  
  Various ALLOW flags can be set in C<flags> to allow (and not warn on)
  individual types of malformations, such as the sequence being overlong (that
  
  Various ALLOW flags can be set in C<flags> to allow (and not warn on)
  individual types of malformations, such as the sequence being overlong (that
@@ -933,35 +1255,29 @@ a malformation and raise a warning, specify both the WARN and DISALLOW flags.
  (But note that warnings are not raised if lexically disabled nor if
  C<UTF8_CHECK_ONLY> is also specified.)
  
  (But note that warnings are not raised if lexically disabled nor if
  C<UTF8_CHECK_ONLY> is also specified.)
  
-It is now deprecated to have very high code points (above C<IV_MAX> on the
-platforms) and this function will raise a deprecation warning for these (unless
-such warnings are turned off).  This value is typically 0x7FFF_FFFF (2**31 -1)
-in a 32-bit word.
+Extremely high code points were never specified in any standard, and require an
+extension to UTF-8 to express, which Perl does.  It is likely that programs
+written in something other than Perl would not be able to read files that
+contain these; nor would Perl understand files written by something that uses a
+different extension.  For these reasons, there is a separate set of flags that
+can warn and/or disallow these extremely high code points, even if other
+above-Unicode ones are accepted.  They are the C<UTF8_WARN_PERL_EXTENDED> and
+C<UTF8_DISALLOW_PERL_EXTENDED> flags.  For more information see
+L</C<UTF8_GOT_PERL_EXTENDED>>.  Of course C<UTF8_DISALLOW_SUPER> will treat all
+above-Unicode code points, including these, as malformations.
+(Note that the Unicode standard considers anything above 0x10FFFF to be
+illegal, but there are standards predating it that allow up to 0x7FFF_FFFF
+(2**31 -1).)
+
+A somewhat misleadingly named synonym for C<UTF8_WARN_PERL_EXTENDED> is
+retained for backward compatibility: C<UTF8_WARN_ABOVE_31_BIT>.  Similarly,
+C<UTF8_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
+C<UTF8_DISALLOW_PERL_EXTENDED>.  The names are misleading because these flags
+can apply to code points that actually do fit in 31 bits.  This happens on
+EBCDIC platforms, and sometimes when the L<overlong
+malformation|/C<UTF8_GOT_LONG>> is also present.  The new names accurately
+describe the situation in all cases.
  
  
-Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
-so using them is more problematic than other above-Unicode code points.  Perl
-invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
-likely that non-Perl languages will not be able to read files that contain
-these; nor would Perl understand files
-written by something that uses a different extension.  For these reasons, there
-is a separate set of flags that can warn and/or disallow these extremely high
-code points, even if other above-Unicode ones are accepted.  These are the
-C<UTF8_WARN_ABOVE_31_BIT> and C<UTF8_DISALLOW_ABOVE_31_BIT> flags.  These
-are entirely independent from the deprecation warning for code points above
-C<IV_MAX>.  On 32-bit machines, it will eventually be forbidden to have any
-code point that needs more than 31 bits to represent.  When that happens,
-effectively the C<UTF8_DISALLOW_ABOVE_31_BIT> flag will always be set on
-32-bit machines.  (Of course C<UTF8_DISALLOW_SUPER> will treat all
-above-Unicode code points, including these, as malformations; and
-C<UTF8_WARN_SUPER> warns on these.)
-
-On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
-extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
-than on ASCII.  Prior to that, code points 2**31 and higher were simply
-unrepresentable, and a different, incompatible method was used to represent
-code points between 2**30 and 2**31 - 1.  The flags C<UTF8_WARN_ABOVE_31_BIT>
-and C<UTF8_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
-platforms, warning and disallowing 2**31 and higher.
  
  All other code points corresponding to Unicode characters, including private
  use and those yet to be assigned, are never considered malformed and never
  
  All other code points corresponding to Unicode characters, including private
  use and those yet to be assigned, are never considered malformed and never
@@ -973,10 +1289,10 @@ Also implemented as a macro in utf8.h
  */
  
  UV
  */
  
  UV
-Perl_utf8n_to_uvchr(pTHX_ const U8 *s,
-                          STRLEN curlen,
-                          STRLEN *retlen,
-                          const U32 flags)
+Perl_utf8n_to_uvchr(const U8 *s,
+                    STRLEN curlen,
+                    STRLEN *retlen,
+                    const U32 flags)
  {
      PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
  
  {
      PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
  
@@ -991,7 +1307,8 @@ THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
  Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
  
  This function is for code that needs to know what the precise malformation(s)
  Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
  
  This function is for code that needs to know what the precise malformation(s)
-are when an error is found.
+are when an error is found.  If you also need to know the generated warning
+messages, use L</utf8n_to_uvchr_msgs>() instead.
  
  It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after
  all the others, C<errors>.  If this parameter is 0, this function behaves
  
  It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after
  all the others, C<errors>.  If this parameter is 0, this function behaves
@@ -1005,12 +1322,36 @@ exceptions are noted:
  
  =over 4
  
  
  =over 4
  
-=item C<UTF8_GOT_ABOVE_31_BIT>
+=item C<UTF8_GOT_PERL_EXTENDED>
  
  
-The code point represented by the input UTF-8 sequence occupies more than 31
-bits.
-This bit is set only if the input C<flags> parameter contains either the
-C<UTF8_DISALLOW_ABOVE_31_BIT> or the C<UTF8_WARN_ABOVE_31_BIT> flags.
+The input sequence is not standard UTF-8, but a Perl extension.  This bit is
+set only if the input C<flags> parameter contains either the
+C<UTF8_DISALLOW_PERL_EXTENDED> or the C<UTF8_WARN_PERL_EXTENDED> flags.
+
+Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
+and so some extension must be used to express them.  Perl uses a natural
+extension to UTF-8 to represent the ones up to 2**36-1, and invented a further
+extension to represent even higher ones, so that any code point that fits in a
+64-bit word can be represented.  Text using these extensions is not likely to
+be portable to non-Perl code.  We lump both of these extensions together and
+refer to them as Perl extended UTF-8.  There exist other extensions that people
+have invented, incompatible with Perl's.
+
+On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
+extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
+than on ASCII.  Prior to that, code points 2**31 and higher were simply
+unrepresentable, and a different, incompatible method was used to represent
+code points between 2**30 and 2**31 - 1.
+
+On both platforms, ASCII and EBCDIC, C<UTF8_GOT_PERL_EXTENDED> is set if
+Perl extended UTF-8 is used.
+
+In earlier Perls, this bit was named C<UTF8_GOT_ABOVE_31_BIT>, which you still
+may use for backward compatibility.  That name is misleading, as this flag may
+be set when the code point actually does fit in 31 bits.  This happens on
+EBCDIC platforms, and sometimes when the L<overlong
+malformation|/C<UTF8_GOT_LONG>> is also present.  The new name accurately
+describes the situation in all cases.
  
  =item C<UTF8_GOT_CONTINUATION>
  
  
  =item C<UTF8_GOT_CONTINUATION>
  
@@ -1039,12 +1380,13 @@ C<UTF8_DISALLOW_NONCHAR> or the C<UTF8_WARN_NONCHAR> flags.
  =item C<UTF8_GOT_NON_CONTINUATION>
  
  The input sequence was malformed in that a non-continuation type byte was found
  =item C<UTF8_GOT_NON_CONTINUATION>
  
  The input sequence was malformed in that a non-continuation type byte was found
-in a position where only a continuation type one should be.
+in a position where only a continuation type one should be.  See also
+L</C<UTF8_GOT_SHORT>>.
  
  =item C<UTF8_GOT_OVERFLOW>
  
  The input sequence was malformed in that it is for a code point that is not
  
  =item C<UTF8_GOT_OVERFLOW>
  
  The input sequence was malformed in that it is for a code point that is not
-representable in the number of bits available in a UV on the current platform.
+representable in the number of bits available in an IV on the current platform.
  
  =item C<UTF8_GOT_SHORT>
  
  
  =item C<UTF8_GOT_SHORT>
  
@@ -1052,6 +1394,34 @@ The input sequence was malformed in that C<curlen> is smaller than required for
  a complete sequence.  In other words, the input is for a partial character
  sequence.
  
  a complete sequence.  In other words, the input is for a partial character
  sequence.
  
+
+C<UTF8_GOT_SHORT> and C<UTF8_GOT_NON_CONTINUATION> both indicate a too short
+sequence.  The difference is that C<UTF8_GOT_NON_CONTINUATION> always indicates
+that there is an error, while C<UTF8_GOT_SHORT> means that an incomplete
+sequence was looked at.   If no other flags are present, it means that the
+sequence was valid as far as it went.  Depending on the application, this could
+mean one of three things:
+
+=over
+
+=item *
+
+The C<curlen> length parameter passed in was too small, and the function was
+prevented from examining all the necessary bytes.
+
+=item *
+
+The buffer being looked at is based on reading data, and the data received so
+far stopped in the middle of a character, so that the next read will
+read the remainder of this character.  (It is up to the caller to deal with the
+split bytes somehow.)
+
+=item *
+
+This is a real error, and the partial sequence is all we're going to get.
+
+=back
+
  =item C<UTF8_GOT_SUPER>
  
  The input sequence was malformed in that it is for a non-Unicode code point;
  =item C<UTF8_GOT_SUPER>
  
  The input sequence was malformed in that it is for a non-Unicode code point;
@@ -1072,39 +1442,148 @@ To do your own error handling, call this function with the C<UTF8_CHECK_ONLY>
  flag to suppress any warnings, and then examine the C<*errors> return.
  
  =cut
  flag to suppress any warnings, and then examine the C<*errors> return.
  
  =cut
+
+Also implemented as a macro in utf8.h
  */
  
  UV
  */
  
  UV
-Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
-                                STRLEN curlen,
-                                STRLEN *retlen,
-                                const U32 flags,
-                                U32 * errors)
+Perl_utf8n_to_uvchr_error(const U8 *s,
+                          STRLEN curlen,
+                          STRLEN *retlen,
+                          const U32 flags,
+                          U32 * errors)
  {
  {
-    const U8 * const s0 = s;
-    U8 * send = NULL;           /* (initialized to silence compilers' wrong
-                                   warning) */
-    U32 possible_problems = 0;  /* A bit is set here for each potential problem
-                                   found as we go along */
-    UV uv = *s;
-    STRLEN expectlen   = 0;     /* How long should this sequence be?
-                                   (initialized to silence compilers' wrong
-                                   warning) */
-    STRLEN avail_len   = 0;     /* When input is too short, gives what that is */
-    U32 discard_errors = 0;     /* Used to save branches when 'errors' is NULL;
-                                   this gets set and discarded */
+    PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
  
  
-    /* The below are used only if there is both an overlong malformation and a
+    return utf8n_to_uvchr_msgs(s, curlen, retlen, flags, errors, NULL);
+}
+
+/*
+
+=for apidoc utf8n_to_uvchr_msgs
+
+THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
+Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
+
+This function is for code that needs to know what the precise malformation(s)
+are when an error is found, and wants the corresponding warning and/or error
+messages to be returned to the caller rather than be displayed.  All messages
+that would have been displayed if all lexical warnings are enabled will be
+returned.
+
+It is just like C<L</utf8n_to_uvchr_error>> but it takes an extra parameter
+placed after all the others, C<msgs>.  If this parameter is 0, this function
+behaves identically to C<L</utf8n_to_uvchr_error>>.  Otherwise, C<msgs> should
+be a pointer to an C<AV *> variable, in which this function creates a new AV to
+contain any appropriate messages.  The elements of the array are ordered so
+that the first message that would have been displayed is in the 0th element,
+and so on.  Each element is a hash with three key-value pairs, as follows:
+
+=over 4
+
+=item C<text>
+
+The text of the message as a C<SVpv>.
+
+=item C<warn_categories>
+
+The warning category (or categories) packed into a C<SVuv>.
+
+=item C<flag>
+
+A single flag bit associated with this message, in a C<SVuv>.
+The bit corresponds to some bit in the C<*errors> return value,
+such as C<UTF8_GOT_LONG>.
+
+=back
+
+It's important to note that specifying this parameter as non-null will cause
+any warnings this function would otherwise generate to be suppressed, and
+instead be placed in C<*msgs>.  The caller can check the lexical warnings state
+(or not) when choosing what to do with the returned messages.
+
+If the flag C<UTF8_CHECK_ONLY> is passed, no warnings are generated, and hence
+no AV is created.
+
+The caller, of course, is responsible for freeing any returned AV.
+
+=cut
+*/
+
+UV
+Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
+                               STRLEN curlen,
+                               STRLEN *retlen,
+                               const U32 flags,
+                               U32 * errors,
+                               AV ** msgs)
+{
+    const U8 * const s0 = s;
+    const U8 * send = s0 + curlen;
+    U32 possible_problems;  /* A bit is set here for each potential problem
+                               found as we go along */
+    UV uv;
+    STRLEN expectlen;     /* How long should this sequence be? */
+    STRLEN avail_len;     /* When input is too short, gives what that is */
+    U32 discard_errors;   /* Used to save branches when 'errors' is NULL; this
+                             gets set and discarded */
+
+    /* The below are used only if there is both an overlong malformation and a
       * too short one.  Otherwise the first two are set to 's0' and 'send', and
       * the third not used at all */
       * too short one.  Otherwise the first two are set to 's0' and 'send', and
       * the third not used at all */
-    U8 * adjusted_s0 = (U8 *) s0;
-    U8 * adjusted_send = NULL;  /* (Initialized to silence compilers' wrong
-                                   warning) */
+    U8 * adjusted_s0;
      U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
                                              routine; see [perl #130921] */
      U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
                                              routine; see [perl #130921] */
-    UV uv_so_far = 0;   /* (Initialized to silence compilers' wrong warning) */
+    UV uv_so_far;
+    dTHX;
  
  
-    PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
+    PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_MSGS_HELPER;
+
+    /* Here, the input is one of: a) malformed; b) a problematic code point
+     * (surrogate, non-unicode, or nonchar); or c) on ASCII platforms, one of
+     * the Hangul syllables that the dfa doesn't properly handle.  Quickly
+     * dispose of the final case. */
+
+#ifndef EBCDIC
+
+    /* Each of the affected Hanguls starts with \xED */
+
+    if (is_HANGUL_ED_utf8_safe(s0, send)) {
+        if (retlen) {
+            *retlen = 3;
+        }
+        if (errors) {
+            *errors = 0;
+        }
+        if (msgs) {
+            *msgs = NULL;
+        }
+
+        return ((0xED & UTF_START_MASK(3)) << (2 * UTF_ACCUMULATION_SHIFT))
+             | ((s0[1] & UTF_CONTINUATION_MASK) << UTF_ACCUMULATION_SHIFT)
+             |  (s0[2] & UTF_CONTINUATION_MASK);
+    }
+
+#endif
+
+    /* In conjunction with the exhaustive tests that can be enabled in
+     * APItest/t/utf8_warn_base.pl, this can make sure the dfa does precisely
+     * what it is intended to do, and that no flaws in it are masked by
+     * dropping down and executing the code below
+    assert(! isUTF8_CHAR(s0, send)
+          || UTF8_IS_SURROGATE(s0, send)
+          || UTF8_IS_SUPER(s0, send)
+          || UTF8_IS_NONCHAR(s0,send));
+    */
+
+    s = s0;
+    uv = *s0;
+    possible_problems = 0;
+    expectlen = 0;
+    avail_len = 0;
+    discard_errors = 0;
+    adjusted_s0 = (U8 *) s0;
+    uv_so_far = 0;
  
      if (errors) {
          *errors = 0;
  
      if (errors) {
          *errors = 0;
@@ -1157,11 +1636,6 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
         *retlen = expectlen;
      }
  
         *retlen = expectlen;
      }
  
-    /* An invariant is trivially well-formed */
-    if (UTF8_IS_INVARIANT(uv)) {
-       return uv;
-    }
-
      /* A continuation character can't start a valid sequence */
      if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
         possible_problems |= UTF8_GOT_CONTINUATION;
      /* A continuation character can't start a valid sequence */
      if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
         possible_problems |= UTF8_GOT_CONTINUATION;
@@ -1182,16 +1656,13 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
  
      /* Setup the loop end point, making sure to not look past the end of the
       * input string, and flag it as too short if the size isn't big enough. */
  
      /* Setup the loop end point, making sure to not look past the end of the
       * input string, and flag it as too short if the size isn't big enough. */
-    send = (U8*) s0;
      if (UNLIKELY(curlen < expectlen)) {
          possible_problems |= UTF8_GOT_SHORT;
          avail_len = curlen;
      if (UNLIKELY(curlen < expectlen)) {
          possible_problems |= UTF8_GOT_SHORT;
          avail_len = curlen;
-        send += curlen;
      }
      else {
      }
      else {
-        send += expectlen;
+        send = (U8*) s0 + expectlen;
      }
      }
-    adjusted_send = send;
  
      /* Now, loop through the remaining bytes in the character's sequence,
       * accumulating each into the working value as we go. */
  
      /* Now, loop through the remaining bytes in the character's sequence,
       * accumulating each into the working value as we go. */
@@ -1228,8 +1699,12 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
          uv = UNICODE_REPLACEMENT;
      }
  
          uv = UNICODE_REPLACEMENT;
      }
  
-    /* Check for overflow */
-    if (UNLIKELY(does_utf8_overflow(s0, send))) {
+    /* Check for overflow.  The algorithm requires us to not look past the end
+     * of the current character, even if partial, so the upper limit is 's' */
+    if (UNLIKELY(0 < does_utf8_overflow(s0, s,
+                                         1 /* Do consider overlongs */
+                                        )))
+    {
          possible_problems |= UTF8_GOT_OVERFLOW;
          uv = UNICODE_REPLACEMENT;
      }
          possible_problems |= UTF8_GOT_OVERFLOW;
          uv = UNICODE_REPLACEMENT;
      }
@@ -1240,15 +1715,22 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
       * overlong */
      if (     (   LIKELY(! possible_problems)
                && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
       * overlong */
      if (     (   LIKELY(! possible_problems)
                && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
-        || (   UNLIKELY(  possible_problems)
+        || (       UNLIKELY(possible_problems)
              && (   UNLIKELY(! UTF8_IS_START(*s0))
                  || (   curlen > 1
              && (   UNLIKELY(! UTF8_IS_START(*s0))
                  || (   curlen > 1
-                    && UNLIKELY(is_utf8_overlong_given_start_byte_ok(s0,
-                                                                send - s0))))))
+                    && UNLIKELY(0 < is_utf8_overlong_given_start_byte_ok(s0,
+                                                                s - s0))))))
      {
          possible_problems |= UTF8_GOT_LONG;
  
      {
          possible_problems |= UTF8_GOT_LONG;
  
-        if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
+        if (   UNLIKELY(   possible_problems & UTF8_GOT_TOO_SHORT)
+
+                          /* The calculation in the 'true' branch of this 'if'
+                           * below won't work if overflows, and isn't needed
+                           * anyway.  Further below we handle all overflow
+                           * cases */
+            &&   LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)))
+        {
              UV min_uv = uv_so_far;
              STRLEN i;
  
              UV min_uv = uv_so_far;
              STRLEN i;
  
@@ -1256,27 +1738,31 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
               * bytes.  There is no single code point it could be for, but there
               * may be enough information present to determine if what we have
               * so far is for an unallowed code point, such as for a surrogate.
               * bytes.  There is no single code point it could be for, but there
               * may be enough information present to determine if what we have
               * so far is for an unallowed code point, such as for a surrogate.
-             * The code below has the intelligence to determine this, but just
-             * for non-overlong UTF-8 sequences.  What we do here is calculate
-             * the smallest code point the input could represent if there were
-             * no too short malformation.  Then we compute and save the UTF-8
-             * for that, which is what the code below looks at instead of the
-             * raw input.  It turns out that the smallest such code point is
-             * all we need. */
+             * The code further below has the intelligence to determine this,
+             * but just for non-overlong UTF-8 sequences.  What we do here is
+             * calculate the smallest code point the input could represent if
+             * there were no too short malformation.  Then we compute and save
+             * the UTF-8 for that, which is what the code below looks at
+             * instead of the raw input.  It turns out that the smallest such
+             * code point is all we need. */
              for (i = curlen; i < expectlen; i++) {
                  min_uv = UTF8_ACCUMULATE(min_uv,
                                       I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK));
              }
  
              adjusted_s0 = temp_char_buf;
              for (i = curlen; i < expectlen; i++) {
                  min_uv = UTF8_ACCUMULATE(min_uv,
                                       I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK));
              }
  
              adjusted_s0 = temp_char_buf;
-            adjusted_send = uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
+            (void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
          }
      }
  
          }
      }
  
-    /* Now check that the input isn't for a problematic code point not allowed
-     * by the input parameters. */
-                                              /* isn't problematic if < this */
-    if (   (   (   LIKELY(! possible_problems) && uv >= UNICODE_SURROGATE_FIRST)
+    /* Here, we have found all the possible problems, except for when the input
+     * is for a problematic code point not allowed by the input parameters. */
+
+                                /* uv is valid for overlongs */
+    if (   (   (      LIKELY(! (possible_problems & ~UTF8_GOT_LONG))
+
+                      /* isn't problematic if < this */
+                   && uv >= UNICODE_SURROGATE_FIRST)
              || (   UNLIKELY(possible_problems)
  
                            /* if overflow, we know without looking further
              || (   UNLIKELY(possible_problems)
  
                            /* if overflow, we know without looking further
@@ -1284,22 +1770,16 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                             * and we deal with those in the overflow handling
                             * code */
                  && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))
                             * and we deal with those in the overflow handling
                             * code */
                  && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))
-                && isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)))
+                && (   isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)
+                    || UNLIKELY(isUTF8_PERL_EXTENDED(s0)))))
         && ((flags & ( UTF8_DISALLOW_NONCHAR
                        |UTF8_DISALLOW_SURROGATE
                        |UTF8_DISALLOW_SUPER
         && ((flags & ( UTF8_DISALLOW_NONCHAR
                        |UTF8_DISALLOW_SURROGATE
                        |UTF8_DISALLOW_SUPER
-                      |UTF8_DISALLOW_ABOVE_31_BIT
+                      |UTF8_DISALLOW_PERL_EXTENDED
                       |UTF8_WARN_NONCHAR
                        |UTF8_WARN_SURROGATE
                        |UTF8_WARN_SUPER
                       |UTF8_WARN_NONCHAR
                        |UTF8_WARN_SURROGATE
                        |UTF8_WARN_SUPER
-                      |UTF8_WARN_ABOVE_31_BIT))
-                   /* In case of a malformation, 'uv' is not valid, and has
-                    * been changed to something in the Unicode range.
-                    * Currently we don't output a deprecation message if there
-                    * is already a malformation, so we don't have to special
-                    * case the test immediately below */
-            || (   UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
-                && ckWARN_d(WARN_DEPRECATED))))
+                      |UTF8_WARN_PERL_EXTENDED))))
      {
          /* If there were no malformations, or the only malformation is an
           * overlong, 'uv' is valid */
      {
          /* If there were no malformations, or the only malformation is an
           * overlong, 'uv' is valid */
@@ -1358,34 +1838,38 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
       *                      some substitute value, typically the REPLACEMENT
       *                      CHARACTER.
       * s0                   points to the first byte of the character
       *                      some substitute value, typically the REPLACEMENT
       *                      CHARACTER.
       * s0                   points to the first byte of the character
-     * send                 points to just after where that (potentially
-     *                      partial) character ends
-     * adjusted_s0          normally is the same as s0, but in case of an
-     *                      overlong for which the UTF-8 matters below, it is
-     *                      the first byte of the shortest form representation
-     *                      of the input.
-     * adjusted_send        normally is the same as 'send', but if adjusted_s0
-     *                      is set to something other than s0, this points one
-     *                      beyond its end
+     * s                    points to just after where we left off processing
+     *                      the character
+     * send                 points to just after where that character should
+     *                      end, based on how many bytes the start byte tells
+     *                      us should be in it, but no further than s0 +
+     *                      avail_len
       */
  
      if (UNLIKELY(possible_problems)) {
          bool disallowed = FALSE;
          const U32 orig_problems = possible_problems;
  
       */
  
      if (UNLIKELY(possible_problems)) {
          bool disallowed = FALSE;
          const U32 orig_problems = possible_problems;
  
+        if (msgs) {
+            *msgs = NULL;
+        }
+
          while (possible_problems) { /* Handle each possible problem */
              UV pack_warn = 0;
              char * message = NULL;
          while (possible_problems) { /* Handle each possible problem */
              UV pack_warn = 0;
              char * message = NULL;
+            U32 this_flag_bit = 0;
  
              /* Each 'if' clause handles one problem.  They are ordered so that
               * the first ones' messages will be displayed before the later
  
              /* Each 'if' clause handles one problem.  They are ordered so that
               * the first ones' messages will be displayed before the later
-             * ones; this is kinda in decreasing severity order */
+             * ones; this is kinda in decreasing severity order.  But the
+             * overlong must come last, as it changes 'uv' looked at by the
+             * others */
              if (possible_problems & UTF8_GOT_OVERFLOW) {
  
              if (possible_problems & UTF8_GOT_OVERFLOW) {
  
-                /* Overflow means also got a super and above 31 bits, but we
-                 * handle all three cases here */
+                /* Overflow means also got a super and are using Perl's
+                 * extended UTF-8, but we handle all three cases here */
                  possible_problems
                  possible_problems
-                  &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_ABOVE_31_BIT);
+                  &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_PERL_EXTENDED);
                  *errors |= UTF8_GOT_OVERFLOW;
  
                  /* But the API says we flag all errors found */
                  *errors |= UTF8_GOT_OVERFLOW;
  
                  /* But the API says we flag all errors found */
@@ -1393,25 +1877,22 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                      *errors |= UTF8_GOT_SUPER;
                  }
                  if (flags
                      *errors |= UTF8_GOT_SUPER;
                  }
                  if (flags
-                        & (UTF8_WARN_ABOVE_31_BIT|UTF8_DISALLOW_ABOVE_31_BIT))
+                        & (UTF8_WARN_PERL_EXTENDED|UTF8_DISALLOW_PERL_EXTENDED))
                  {
                  {
-                    *errors |= UTF8_GOT_ABOVE_31_BIT;
+                    *errors |= UTF8_GOT_PERL_EXTENDED;
                  }
  
                  /* Disallow if any of the three categories say to */
                  }
  
                  /* Disallow if any of the three categories say to */
-                if ( ! (flags & UTF8_ALLOW_OVERFLOW)
+                if ( ! (flags &   UTF8_ALLOW_OVERFLOW)
                      || (flags & ( UTF8_DISALLOW_SUPER
                      || (flags & ( UTF8_DISALLOW_SUPER
-                                 |UTF8_DISALLOW_ABOVE_31_BIT)))
+                                 |UTF8_DISALLOW_PERL_EXTENDED)))
                  {
                      disallowed = TRUE;
                  }
  
                  {
                      disallowed = TRUE;
                  }
  
-
-                /* Likewise, warn if any say to, plus if deprecation warnings
-                 * are on, because this code point is above IV_MAX */
-                if (  ckWARN_d(WARN_DEPRECATED)
-                    || ! (flags & UTF8_ALLOW_OVERFLOW)
-                    ||   (flags & (UTF8_WARN_SUPER|UTF8_WARN_ABOVE_31_BIT)))
+                /* Likewise, warn if any say to */
+                if (  ! (flags & UTF8_ALLOW_OVERFLOW)
+                    ||  (flags & (UTF8_WARN_SUPER|UTF8_WARN_PERL_EXTENDED)))
                  {
  
                      /* The warnings code explicitly says it doesn't handle the
                  {
  
                      /* The warnings code explicitly says it doesn't handle the
@@ -1419,18 +1900,19 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                       * parent-child relationship.  Even if it works now to
                       * raise the warning if either is enabled, it wouldn't
                       * necessarily do so in the future.  We output (only) the
                       * parent-child relationship.  Even if it works now to
                       * raise the warning if either is enabled, it wouldn't
                       * necessarily do so in the future.  We output (only) the
-                     * most dire warning*/
+                     * most dire warning */
                      if (! (flags & UTF8_CHECK_ONLY)) {
                      if (! (flags & UTF8_CHECK_ONLY)) {
-                        if (ckWARN_d(WARN_UTF8)) {
+                        if (msgs || ckWARN_d(WARN_UTF8)) {
                              pack_warn = packWARN(WARN_UTF8);
                          }
                              pack_warn = packWARN(WARN_UTF8);
                          }
-                        else if (ckWARN_d(WARN_NON_UNICODE)) {
+                        else if (msgs || ckWARN_d(WARN_NON_UNICODE)) {
                              pack_warn = packWARN(WARN_NON_UNICODE);
                          }
                          if (pack_warn) {
                              message = Perl_form(aTHX_ "%s: %s (overflows)",
                                              malformed_text,
                                              _byte_dump_string(s0, curlen, 0));
                              pack_warn = packWARN(WARN_NON_UNICODE);
                          }
                          if (pack_warn) {
                              message = Perl_form(aTHX_ "%s: %s (overflows)",
                                              malformed_text,
                                              _byte_dump_string(s0, curlen, 0));
+                            this_flag_bit = UTF8_GOT_OVERFLOW;
                          }
                      }
                  }
                          }
                      }
                  }
@@ -1447,10 +1929,13 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                      assert(0);
  
                      disallowed = TRUE;
                      assert(0);
  
                      disallowed = TRUE;
-                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+                    if (  (msgs
+                        || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
+                    {
                          pack_warn = packWARN(WARN_UTF8);
                          message = Perl_form(aTHX_ "%s (empty string)",
                                                     malformed_text);
                          pack_warn = packWARN(WARN_UTF8);
                          message = Perl_form(aTHX_ "%s (empty string)",
                                                     malformed_text);
+                        this_flag_bit = UTF8_GOT_EMPTY;
                      }
                  }
              }
                      }
                  }
              }
@@ -1460,13 +1945,16 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
  
                  if (! (flags & UTF8_ALLOW_CONTINUATION)) {
                      disallowed = TRUE;
  
                  if (! (flags & UTF8_ALLOW_CONTINUATION)) {
                      disallowed = TRUE;
-                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+                    if ((   msgs
+                         || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
+                    {
                          pack_warn = packWARN(WARN_UTF8);
                          message = Perl_form(aTHX_
                                  "%s: %s (unexpected continuation byte 0x%02x,"
                                  " with no preceding start byte)",
                                  malformed_text,
                                  _byte_dump_string(s0, 1, 0), *s0);
                          pack_warn = packWARN(WARN_UTF8);
                          message = Perl_form(aTHX_
                                  "%s: %s (unexpected continuation byte 0x%02x,"
                                  " with no preceding start byte)",
                                  malformed_text,
                                  _byte_dump_string(s0, 1, 0), *s0);
+                        this_flag_bit = UTF8_GOT_CONTINUATION;
                      }
                  }
              }
                      }
                  }
              }
@@ -1476,15 +1964,18 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
  
                  if (! (flags & UTF8_ALLOW_SHORT)) {
                      disallowed = TRUE;
  
                  if (! (flags & UTF8_ALLOW_SHORT)) {
                      disallowed = TRUE;
-                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+                    if ((   msgs
+                         || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
+                    {
                          pack_warn = packWARN(WARN_UTF8);
                          message = Perl_form(aTHX_
                          pack_warn = packWARN(WARN_UTF8);
                          message = Perl_form(aTHX_
-                                "%s: %s (too short; %d byte%s available, need %d)",
-                                malformed_text,
-                                _byte_dump_string(s0, send - s0, 0),
-                                (int)avail_len,
-                                avail_len == 1 ? "" : "s",
-                                (int)expectlen);
+                             "%s: %s (too short; %d byte%s available, need %d)",
+                             malformed_text,
+                             _byte_dump_string(s0, send - s0, 0),
+                             (int)avail_len,
+                             avail_len == 1 ? "" : "s",
+                             (int)expectlen);
+                        this_flag_bit = UTF8_GOT_SHORT;
                      }
                  }
  
                      }
                  }
  
@@ -1495,7 +1986,9 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
  
                  if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
                      disallowed = TRUE;
  
                  if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
                      disallowed = TRUE;
-                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+                    if ((   msgs
+                         || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
+                    {
  
                          /* If we don't know for sure that the input length is
                           * valid, avoid as much as possible reading past the
  
                          /* If we don't know for sure that the input length is
                           * valid, avoid as much as possible reading past the
@@ -1509,59 +2002,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                                                              printlen,
                                                              s - s0,
                                                              (int) expectlen));
                                                              printlen,
                                                              s - s0,
                                                              (int) expectlen));
-                    }
-                }
-            }
-            else if (possible_problems & UTF8_GOT_LONG) {
-                possible_problems &= ~UTF8_GOT_LONG;
-                *errors |= UTF8_GOT_LONG;
-
-                if (flags & UTF8_ALLOW_LONG) {
-
-                    /* We don't allow the actual overlong value, unless the
-                     * special extra bit is also set */
-                    if (! (flags & (   UTF8_ALLOW_LONG_AND_ITS_VALUE
-                                    & ~UTF8_ALLOW_LONG)))
-                    {
-                        uv = UNICODE_REPLACEMENT;
-                    }
-                }
-                else {
-                    disallowed = TRUE;
-
-                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
-                        pack_warn = packWARN(WARN_UTF8);
-
-                        /* These error types cause 'uv' to be something that
-                         * isn't what was intended, so can't use it in the
-                         * message.  The other error types either can't
-                         * generate an overlong, or else the 'uv' is valid */
-                        if (orig_problems &
-                                        (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
-                        {
-                            message = Perl_form(aTHX_
-                                    "%s: %s (any UTF-8 sequence that starts"
-                                    " with \"%s\" is overlong which can and"
-                                    " should be represented with a"
-                                    " different, shorter sequence)",
-                                    malformed_text,
-                                    _byte_dump_string(s0, send - s0, 0),
-                                    _byte_dump_string(s0, curlen, 0));
-                        }
-                        else {
-                            U8 tmpbuf[UTF8_MAXBYTES+1];
-                            const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
-                                                                        uv, 0);
-                            message = Perl_form(aTHX_
-                                "%s: %s (overlong; instead use %s to represent"
-                                " U+%0*" UVXf ")",
-                                malformed_text,
-                                _byte_dump_string(s0, curlen, 0),
-                                _byte_dump_string(tmpbuf, e - tmpbuf, 0),
-                                ((uv < 256) ? 2 : 4), /* Field width of 2 for
-                                                         small code points */
-                                uv);
-                        }
+                        this_flag_bit = UTF8_GOT_NON_CONTINUATION;
                      }
                  }
              }
                      }
                  }
              }
@@ -1572,7 +2013,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                      *errors |= UTF8_GOT_SURROGATE;
  
                      if (   ! (flags & UTF8_CHECK_ONLY)
                      *errors |= UTF8_GOT_SURROGATE;
  
                      if (   ! (flags & UTF8_CHECK_ONLY)
-                        && ckWARN_d(WARN_SURROGATE))
+                        && (msgs || ckWARN_d(WARN_SURROGATE)))
                      {
                          pack_warn = packWARN(WARN_SURROGATE);
  
                      {
                          pack_warn = packWARN(WARN_SURROGATE);
  
@@ -1585,9 +2026,9 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                                      _byte_dump_string(s0, curlen, 0));
                          }
                          else {
                                      _byte_dump_string(s0, curlen, 0));
                          }
                          else {
-                            message = Perl_form(aTHX_
-                                            "UTF-16 surrogate U+%04" UVXf, uv);
+                            message = Perl_form(aTHX_ surrogate_cp_format, uv);
                          }
                          }
+                        this_flag_bit = UTF8_GOT_SURROGATE;
                      }
                  }
  
                      }
                  }
  
@@ -1603,7 +2044,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                      *errors |= UTF8_GOT_SUPER;
  
                      if (   ! (flags & UTF8_CHECK_ONLY)
                      *errors |= UTF8_GOT_SUPER;
  
                      if (   ! (flags & UTF8_CHECK_ONLY)
-                        && ckWARN_d(WARN_NON_UNICODE))
+                        && (msgs || ckWARN_d(WARN_NON_UNICODE)))
                      {
                          pack_warn = packWARN(WARN_NON_UNICODE);
  
                      {
                          pack_warn = packWARN(WARN_NON_UNICODE);
  
@@ -1615,58 +2056,51 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                                      _byte_dump_string(s0, curlen, 0));
                          }
                          else {
                                      _byte_dump_string(s0, curlen, 0));
                          }
                          else {
-                            message = Perl_form(aTHX_
-                                                "Code point 0x%04" UVXf " is not"
-                                                " Unicode, may not be portable",
-                                                uv);
+                            message = Perl_form(aTHX_ super_cp_format, uv);
                          }
                          }
+                        this_flag_bit = UTF8_GOT_SUPER;
                      }
                  }
  
                      }
                  }
  
-                /* The maximum code point ever specified by a standard was
-                 * 2**31 - 1.  Anything larger than that is a Perl extension
-                 * that very well may not be understood by other applications
-                 * (including earlier perl versions on EBCDIC platforms).  We
-                 * test for these after the regular SUPER ones, and before
-                 * possibly bailing out, so that the slightly more dire warning
-                 * will override the regular one. */
-                if (   (flags & (UTF8_WARN_ABOVE_31_BIT
-                                |UTF8_WARN_SUPER
-                                |UTF8_DISALLOW_ABOVE_31_BIT))
-                    && (   (   UNLIKELY(orig_problems & UTF8_GOT_TOO_SHORT)
-                            && UNLIKELY(is_utf8_cp_above_31_bits(
-                                                                adjusted_s0,
-                                                                adjusted_send)))
-                        || (   LIKELY(! (orig_problems & UTF8_GOT_TOO_SHORT))
-                            && UNLIKELY(UNICODE_IS_ABOVE_31_BIT(uv)))))
-                {
+                /* Test for Perl's extended UTF-8 after the regular SUPER ones,
+                 * and before possibly bailing out, so that the more dire
+                 * warning will override the regular one. */
+                if (UNLIKELY(isUTF8_PERL_EXTENDED(s0))) {
                      if (  ! (flags & UTF8_CHECK_ONLY)
                      if (  ! (flags & UTF8_CHECK_ONLY)
-                        &&  (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_WARN_SUPER))
-                        &&  ckWARN_d(WARN_UTF8))
+                        &&  (flags & (UTF8_WARN_PERL_EXTENDED|UTF8_WARN_SUPER))
+                        &&  (msgs || ckWARN_d(WARN_NON_UNICODE)))
                      {
                      {
-                        pack_warn = packWARN(WARN_UTF8);
+                        pack_warn = packWARN(WARN_NON_UNICODE);
  
  
-                        if (orig_problems & UTF8_GOT_TOO_SHORT) {
+                        /* If it is an overlong that evaluates to a code point
+                         * that doesn't have to use the Perl extended UTF-8, it
+                         * still used it, and so we output a message that
+                         * doesn't refer to the code point.  The same is true
+                         * if there was a SHORT malformation where the code
+                         * point is not valid.  In that case, 'uv' will have
+                         * been set to the REPLACEMENT CHAR, and the message
+                         * below without the code point in it will be selected
+                         * */
+                        if (UNICODE_IS_PERL_EXTENDED(uv)) {
                              message = Perl_form(aTHX_
                              message = Perl_form(aTHX_
-                                        "Any UTF-8 sequence that starts with"
-                                        " \"%s\" is for a non-Unicode code"
-                                        " point, and is not portable",
-                                        _byte_dump_string(s0, curlen, 0));
+                                            perl_extended_cp_format, uv);
                          }
                          else {
                              message = Perl_form(aTHX_
                          }
                          else {
                              message = Perl_form(aTHX_
-                                        "Code point 0x%" UVXf " is not Unicode,"
-                                        " and not portable",
-                                         uv);
+                                        "Any UTF-8 sequence that starts with"
+                                        " \"%s\" is a Perl extension, and"
+                                        " so is not portable",
+                                        _byte_dump_string(s0, curlen, 0));
                          }
                          }
+                        this_flag_bit = UTF8_GOT_PERL_EXTENDED;
                      }
  
                      }
  
-                    if (flags & ( UTF8_WARN_ABOVE_31_BIT
-                                 |UTF8_DISALLOW_ABOVE_31_BIT))
+                    if (flags & ( UTF8_WARN_PERL_EXTENDED
+                                 |UTF8_DISALLOW_PERL_EXTENDED))
                      {
                      {
-                        *errors |= UTF8_GOT_ABOVE_31_BIT;
+                        *errors |= UTF8_GOT_PERL_EXTENDED;
  
  
-                        if (flags & UTF8_DISALLOW_ABOVE_31_BIT) {
+                        if (flags & UTF8_DISALLOW_PERL_EXTENDED) {
                              disallowed = TRUE;
                          }
                      }
                              disallowed = TRUE;
                          }
                      }
@@ -1676,21 +2110,6 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                      *errors |= UTF8_GOT_SUPER;
                      disallowed = TRUE;
                  }
                      *errors |= UTF8_GOT_SUPER;
                      disallowed = TRUE;
                  }
-
-                /* The deprecated warning overrides any non-deprecated one.  If
-                 * there are other problems, a deprecation message is not
-                 * really helpful, so don't bother to raise it in that case.
-                 * This also keeps the code from having to handle the case
-                 * where 'uv' is not valid. */
-                if (   ! (orig_problems
-                                    & (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
-                    && UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
-                    && ckWARN_d(WARN_DEPRECATED))
-                {
-                    message = Perl_form(aTHX_ cp_above_legal_max,
-                                              uv, MAX_NON_DEPRECATED_CP);
-                    pack_warn = packWARN(WARN_DEPRECATED);
-                }
              }
              else if (possible_problems & UTF8_GOT_NONCHAR) {
                  possible_problems &= ~UTF8_GOT_NONCHAR;
              }
              else if (possible_problems & UTF8_GOT_NONCHAR) {
                  possible_problems &= ~UTF8_GOT_NONCHAR;
@@ -1699,7 +2118,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                      *errors |= UTF8_GOT_NONCHAR;
  
                      if (  ! (flags & UTF8_CHECK_ONLY)
                      *errors |= UTF8_GOT_NONCHAR;
  
                      if (  ! (flags & UTF8_CHECK_ONLY)
-                        && ckWARN_d(WARN_NONCHAR))
+                        && (msgs || ckWARN_d(WARN_NONCHAR)))
                      {
                          /* The code above should have guaranteed that we don't
                           * get here with errors other than overlong */
                      {
                          /* The code above should have guaranteed that we don't
                           * get here with errors other than overlong */
@@ -1707,9 +2126,8 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                                          & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR)));
  
                          pack_warn = packWARN(WARN_NONCHAR);
                                          & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR)));
  
                          pack_warn = packWARN(WARN_NONCHAR);
-                        message = Perl_form(aTHX_ "Unicode non-character"
-                                                " U+%04" UVXf " is not recommended"
-                                                " for open interchange", uv);
+                        message = Perl_form(aTHX_ nonchar_cp_format, uv);
+                        this_flag_bit = UTF8_GOT_NONCHAR;
                      }
                  }
  
                      }
                  }
  
@@ -1717,12 +2135,89 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                      disallowed = TRUE;
                      *errors |= UTF8_GOT_NONCHAR;
                  }
                      disallowed = TRUE;
                      *errors |= UTF8_GOT_NONCHAR;
                  }
+            }
+            else if (possible_problems & UTF8_GOT_LONG) {
+                possible_problems &= ~UTF8_GOT_LONG;
+                *errors |= UTF8_GOT_LONG;
+
+                if (flags & UTF8_ALLOW_LONG) {
+
+                    /* We don't allow the actual overlong value, unless the
+                     * special extra bit is also set */
+                    if (! (flags & (   UTF8_ALLOW_LONG_AND_ITS_VALUE
+                                    & ~UTF8_ALLOW_LONG)))
+                    {
+                        uv = UNICODE_REPLACEMENT;
+                    }
+                }
+                else {
+                    disallowed = TRUE;
+
+                    if ((   msgs
+                         || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
+                    {
+                        pack_warn = packWARN(WARN_UTF8);
+
+                        /* These error types cause 'uv' to be something that
+                         * isn't what was intended, so can't use it in the
+                         * message.  The other error types either can't
+                         * generate an overlong, or else the 'uv' is valid */
+                        if (orig_problems &
+                                        (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
+                        {
+                            message = Perl_form(aTHX_
+                                    "%s: %s (any UTF-8 sequence that starts"
+                                    " with \"%s\" is overlong which can and"
+                                    " should be represented with a"
+                                    " different, shorter sequence)",
+                                    malformed_text,
+                                    _byte_dump_string(s0, send - s0, 0),
+                                    _byte_dump_string(s0, curlen, 0));
+                        }
+                        else {
+                            U8 tmpbuf[UTF8_MAXBYTES+1];
+                            const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
+                                                                        uv, 0);
+                            /* Don't use U+ for non-Unicode code points, which
+                             * includes those in the Latin1 range */
+                            const char * preface = (    uv > PERL_UNICODE_MAX
+#ifdef EBCDIC
+                                                     || uv <= 0xFF
+#endif
+                                                    )
+                                                   ? "0x"
+                                                   : "U+";
+                            message = Perl_form(aTHX_
+                                "%s: %s (overlong; instead use %s to represent"
+                                " %s%0*" UVXf ")",
+                                malformed_text,
+                                _byte_dump_string(s0, send - s0, 0),
+                                _byte_dump_string(tmpbuf, e - tmpbuf, 0),
+                                preface,
+                                ((uv < 256) ? 2 : 4), /* Field width of 2 for
+                                                         small code points */
+                                UNI_TO_NATIVE(uv));
+                        }
+                        this_flag_bit = UTF8_GOT_LONG;
+                    }
+                }
              } /* End of looking through the possible flags */
  
              /* Display the message (if any) for the problem being handled in
               * this iteration of the loop */
              if (message) {
              } /* End of looking through the possible flags */
  
              /* Display the message (if any) for the problem being handled in
               * this iteration of the loop */
              if (message) {
-                if (PL_op)
+                if (msgs) {
+                    assert(this_flag_bit);
+
+                    if (*msgs == NULL) {
+                        *msgs = newAV();
+                    }
+
+                    av_push(*msgs, newRV_noinc((SV*) new_msg_hv(message,
+                                                                pack_warn,
+                                                                this_flag_bit)));
+                }
+                else if (PL_op)
                      Perl_warner(aTHX_ pack_warn, "%s in %s", message,
                                                   OP_DESC(PL_op));
                  else
                      Perl_warner(aTHX_ pack_warn, "%s in %s", message,
                                                   OP_DESC(PL_op));
                  else
@@ -1764,9 +2259,6 @@ the next possible position in C<s> that could begin a non-malformed character.
  See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
  returned.
  
  See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
  returned.
  
-Code points above the platform's C<IV_MAX> will raise a deprecation warning,
-unless those are turned off.
-
  =cut
  
  Also implemented as a macro in utf8.h
  =cut
  
  Also implemented as a macro in utf8.h
@@ -1779,10 +2271,7 @@ Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
  {
      PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
  
  {
      PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
  
-    assert(s < send);
-
-    return utf8n_to_uvchr(s, send - s, retlen,
-                     ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+    return _utf8_to_uvchr_buf(s, send, retlen);
  }
  
  /* This is marked as deprecated
  }
  
  /* This is marked as deprecated
@@ -1791,7 +2280,9 @@ Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
  
  Only in very rare circumstances should code need to be dealing in Unicode
  (as opposed to native) code points.  In those few cases, use
  
  Only in very rare circumstances should code need to be dealing in Unicode
  (as opposed to native) code points.  In those few cases, use
-C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>> instead.
+C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>> instead.  If you
+are not absolutely sure this is one of those cases, then assume it isn't and
+use plain C<utf8_to_uvchr_buf> instead.
  
  Returns the Unicode (not-native) code point of the first character in the
  string C<s> which
  
  Returns the Unicode (not-native) code point of the first character in the
  string C<s> which
@@ -1806,9 +2297,6 @@ is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
  next possible position in C<s> that could begin a non-malformed character.
  See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
  
  next possible position in C<s> that could begin a non-malformed character.
  See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
  
-Code points above the platform's C<IV_MAX> will raise a deprecation warning,
-unless those are turned off.
-
  =cut
  */
  
  =cut
  */
  
@@ -1819,16 +2307,18 @@ Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
  
      assert(send > s);
  
  
      assert(send > s);
  
-    /* Call the low level routine, asking for checks */
      return NATIVE_TO_UNI(utf8_to_uvchr_buf(s, send, retlen));
  }
  
  /*
  =for apidoc utf8_length
  
      return NATIVE_TO_UNI(utf8_to_uvchr_buf(s, send, retlen));
  }
  
  /*
  =for apidoc utf8_length
  
-Return the length of the UTF-8 char encoded string C<s> in characters.
-Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
-up past C<e>, croaks.
+Returns the number of characters in the sequence of UTF-8-encoded bytes starting
+at C<s> and ending at the byte just before C<e>.  If C<s> and C<e> point to
+the same place, it returns 0 with no warning raised.
+
+If C<e E<lt> s> or if the scan would end up past C<e>, it raises a UTF8 warning
+and returns the number of valid characters.
  
  =cut
  */
  
  =cut
  */
@@ -1844,14 +2334,14 @@ Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
       * the bitops (especially ~) can create illegal UTF-8.
       * In other words: in Perl UTF-8 is not just for Unicode. */
  
       * the bitops (especially ~) can create illegal UTF-8.
       * In other words: in Perl UTF-8 is not just for Unicode. */
  
-    if (e < s)
+    if (UNLIKELY(e < s))
         goto warn_and_return;
      while (s < e) {
          s += UTF8SKIP(s);
         len++;
      }
  
         goto warn_and_return;
      while (s < e) {
          s += UTF8SKIP(s);
         len++;
      }
  
-    if (e != s) {
+    if (UNLIKELY(e != s)) {
         len--;
          warn_and_return:
         if (PL_op)
         len--;
          warn_and_return:
         if (PL_op)
@@ -1900,10 +2390,10 @@ Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
                     } else {
                          /* diag_listed_as: Malformed UTF-8 character%s */
                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
                     } else {
                          /* diag_listed_as: Malformed UTF-8 character%s */
                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
-                                    "%s %s%s",
-                                    unexpected_non_continuation_text(u - 2, 2, 1, 2),
-                                    PL_op ? " in " : "",
-                                    PL_op ? OP_DESC(PL_op) : "");
+                              "%s %s%s",
+                              unexpected_non_continuation_text(u - 2, 2, 1, 2),
+                              PL_op ? " in " : "",
+                              PL_op ? OP_DESC(PL_op) : "");
                         return -2;
                     }
                 } else {
                         return -2;
                     }
                 } else {
@@ -2013,7 +2503,9 @@ C<*lenp> are unchanged, and the return value is the original C<s>.
  
  Otherwise, C<*is_utf8p> is set to 0, and the return value is a pointer to a
  newly created string containing a downgraded copy of C<s>, and whose length is
  
  Otherwise, C<*is_utf8p> is set to 0, and the return value is a pointer to a
  newly created string containing a downgraded copy of C<s>, and whose length is
-returned in C<*lenp>, updated.  The new string is C<NUL>-terminated.
+returned in C<*lenp>, updated.  The new string is C<NUL>-terminated.  The
+caller is responsible for arranging for the memory used by this string to get
+freed.
  
  Upon successful return, the number of variants in the string can be computed by
  having saved the value of C<*lenp> before the call, and subtracting the
  
  Upon successful return, the number of variants in the string can be computed by
  having saved the value of C<*lenp> before the call, and subtracting the
@@ -2128,8 +2620,8 @@ Perl_bytes_from_utf8_loc(const U8 *s, STRLEN *lenp, bool *is_utf8p, const U8** f
      }
  
    finish_and_return:
      }
  
    finish_and_return:
-        *d = '\0';
-        *lenp = d - converted_start;
+    *d = '\0';
+    *lenp = d - converted_start;
  
      /* Trim unused space */
      Renew(converted_start, *lenp + 1, U8);
  
      /* Trim unused space */
      Renew(converted_start, *lenp + 1, U8);
@@ -2143,7 +2635,8 @@ Perl_bytes_from_utf8_loc(const U8 *s, STRLEN *lenp, bool *is_utf8p, const U8** f
  Converts a string C<s> of length C<*lenp> bytes from the native encoding into
  UTF-8.
  Returns a pointer to the newly-created string, and sets C<*lenp> to
  Converts a string C<s> of length C<*lenp> bytes from the native encoding into
  UTF-8.
  Returns a pointer to the newly-created string, and sets C<*lenp> to
-reflect the new length in bytes.
+reflect the new length in bytes.  The caller is responsible for arranging for
+the memory used by this string to get freed.
  
  Upon successful return, the number of variants in the string can be computed by
  having saved the value of C<*lenp> before the call, and subtracting it from the
  
  Upon successful return, the number of variants in the string can be computed by
  having saved the value of C<*lenp> before the call, and subtracting it from the
@@ -2168,23 +2661,35 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
      PERL_ARGS_ASSERT_BYTES_TO_UTF8;
      PERL_UNUSED_CONTEXT;
  
      PERL_ARGS_ASSERT_BYTES_TO_UTF8;
      PERL_UNUSED_CONTEXT;
  
-    Newx(d, (*lenp) * 2 + 1, U8);
+    /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
+    Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8);
      dst = d;
  
      while (s < send) {
          append_utf8_from_native_byte(*s, &d);
          s++;
      }
      dst = d;
  
      while (s < send) {
          append_utf8_from_native_byte(*s, &d);
          s++;
      }
+
      *d = '\0';
      *lenp = d-dst;
      *d = '\0';
      *lenp = d-dst;
+
      return dst;
  }
  
  /*
      return dst;
  }
  
  /*
- * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
+ * Convert native (big-endian) UTF-16 to UTF-8.  For reversed (little-endian),
+ * use utf16_to_utf8_reversed().
+ *
+ * UTF-16 requires 2 bytes for every code point below 0x10000; otherwise 4 bytes.
+ * UTF-8 requires 1-3 bytes for every code point below 0x10000; otherwise 4 bytes.
+ * UTF-EBCDIC requires 1-4 bytes for every code point below 0x10000; otherwise 4-5 bytes.
   *
   *
- * Destination must be pre-extended to 3/2 source.  Do not use in-place.
- * We optimize for native, for obvious reasons. */
+ * These functions don't check for overflow.  The worst case is every code
+ * point in the input is 2 bytes, and requires 4 bytes on output.  (If the code
+ * is never going to run in EBCDIC, it is 2 bytes requiring 3 on output.)  Therefore the
+ * destination must be pre-extended to 2 times the source length.
+ *
+ * Do not use in-place.  We optimize for native, for obvious reasons. */
  
  U8*
  Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
  
  U8*
  Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
@@ -2195,7 +2700,8 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
      PERL_ARGS_ASSERT_UTF16_TO_UTF8;
  
      if (bytelen & 1)
      PERL_ARGS_ASSERT_UTF16_TO_UTF8;
  
      if (bytelen & 1)
-       Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %" UVuf, (UV)bytelen);
+       Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %" UVuf,
+                                                               (UV)bytelen);
  
      pend = p + bytelen;
  
  
      pend = p + bytelen;
  
@@ -2211,10 +2717,12 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
             *d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv));
             continue;
         }
             *d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv));
             continue;
         }
+
  #define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
  #define LAST_HIGH_SURROGATE  0xDBFF
  #define FIRST_LOW_SURROGATE  0xDC00
  #define LAST_LOW_SURROGATE   UNICODE_SURROGATE_LAST
  #define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
  #define LAST_HIGH_SURROGATE  0xDBFF
  #define FIRST_LOW_SURROGATE  0xDC00
  #define LAST_LOW_SURROGATE   UNICODE_SURROGATE_LAST
+#define FIRST_IN_PLANE1      0x10000
  
          /* This assumes that most uses will be in the first Unicode plane, not
           * needing surrogates */
  
          /* This assumes that most uses will be in the first Unicode plane, not
           * needing surrogates */
@@ -2233,13 +2741,13 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
                  }
                 p += 2;
                 uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
                  }
                 p += 2;
                 uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
-                                       + (low - FIRST_LOW_SURROGATE) + 0x10000;
+                                + (low - FIRST_LOW_SURROGATE) + FIRST_IN_PLANE1;
             }
         }
  #ifdef EBCDIC
          d = uvoffuni_to_utf8_flags(d, uv, 0);
  #else
             }
         }
  #ifdef EBCDIC
          d = uvoffuni_to_utf8_flags(d, uv, 0);
  #else
-       if (uv < 0x10000) {
+       if (uv < FIRST_IN_PLANE1) {
             *d++ = (U8)(( uv >> 12)         | 0xe0);
             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
             *d++ = (U8)(( uv        & 0x3f) | 0x80);
             *d++ = (U8)(( uv >> 12)         | 0xe0);
             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
             *d++ = (U8)(( uv        & 0x3f) | 0x80);
@@ -2284,9 +2792,8 @@ Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
  bool
  Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
  {
  bool
  Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
  {
-    U8 tmpbuf[UTF8_MAXBYTES+1];
-    uvchr_to_utf8(tmpbuf, c);
-    return _is_utf8_FOO_with_len(classnum, tmpbuf, tmpbuf + sizeof(tmpbuf));
+    dVAR;
+    return _invlist_contains_cp(PL_XPosix_ptrs[classnum], c);
  }
  
  /* Internal function so we can deprecate the external one, and call
  }
  
  /* Internal function so we can deprecate the external one, and call
@@ -2295,31 +2802,32 @@ Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
  bool
  Perl__is_utf8_idstart(pTHX_ const U8 *p)
  {
  bool
  Perl__is_utf8_idstart(pTHX_ const U8 *p)
  {
+    dVAR;
+
      PERL_ARGS_ASSERT__IS_UTF8_IDSTART;
  
      if (*p == '_')
         return TRUE;
      PERL_ARGS_ASSERT__IS_UTF8_IDSTART;
  
      if (*p == '_')
         return TRUE;
-    return is_utf8_common(p, &PL_utf8_idstart, "IdStart", NULL);
+    return is_utf8_common(p, PL_utf8_idstart);
  }
  
  bool
  Perl__is_uni_perl_idcont(pTHX_ UV c)
  {
  }
  
  bool
  Perl__is_uni_perl_idcont(pTHX_ UV c)
  {
-    U8 tmpbuf[UTF8_MAXBYTES+1];
-    uvchr_to_utf8(tmpbuf, c);
-    return _is_utf8_perl_idcont_with_len(tmpbuf, tmpbuf + sizeof(tmpbuf));
+    dVAR;
+    return _invlist_contains_cp(PL_utf8_perl_idcont, c);
  }
  
  bool
  Perl__is_uni_perl_idstart(pTHX_ UV c)
  {
  }
  
  bool
  Perl__is_uni_perl_idstart(pTHX_ UV c)
  {
-    U8 tmpbuf[UTF8_MAXBYTES+1];
-    uvchr_to_utf8(tmpbuf, c);
-    return _is_utf8_perl_idstart_with_len(tmpbuf, tmpbuf + sizeof(tmpbuf));
+    dVAR;
+    return _invlist_contains_cp(PL_utf8_perl_idstart, c);
  }
  
  UV
  }
  
  UV
-Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s)
+Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp,
+                                  const char S_or_s)
  {
      /* We have the latin1-range values compiled into the core, so just use
       * those, converting the result to UTF-8.  The only difference between upper
  {
      /* We have the latin1-range values compiled into the core, so just use
       * those, converting the result to UTF-8.  The only difference between upper
@@ -2361,7 +2869,9 @@ Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_
                 return 'S';
  #endif
             default:
                 return 'S';
  #endif
             default:
-               Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
+               Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect"
+                                 " '%c' to map to '%c'",
+                                 c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
                 NOT_REACHED; /* NOTREACHED */
         }
      }
                 NOT_REACHED; /* NOTREACHED */
         }
      }
@@ -2373,22 +2883,72 @@ Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_
      return converted;
  }
  
      return converted;
  }
  
+/* If compiled on an early Unicode version, there may not be auxiliary tables;
+ * the missing ones are defaulted to NULL here */
+#ifndef HAS_UC_AUX_TABLES
+#  define UC_AUX_TABLE_ptrs     NULL
+#  define UC_AUX_TABLE_lengths  NULL
+#endif
+#ifndef HAS_TC_AUX_TABLES
+#  define TC_AUX_TABLE_ptrs     NULL
+#  define TC_AUX_TABLE_lengths  NULL
+#endif
+#ifndef HAS_LC_AUX_TABLES
+#  define LC_AUX_TABLE_ptrs     NULL
+#  define LC_AUX_TABLE_lengths  NULL
+#endif
+#ifndef HAS_CF_AUX_TABLES
+#  define CF_AUX_TABLE_ptrs     NULL
+#  define CF_AUX_TABLE_lengths  NULL
+#endif
+#ifndef HAS_UC_AUX_TABLES
+#  define UC_AUX_TABLE_ptrs     NULL
+#  define UC_AUX_TABLE_lengths  NULL
+#endif
+
  /* Call the function to convert a UTF-8 encoded character to the specified case.
   * Note that there may be more than one character in the result.
  /* Call the function to convert a UTF-8 encoded character to the specified case.
   * Note that there may be more than one character in the result.
- * INP is a pointer to the first byte of the input character
- * OUTP will be set to the first byte of the string of changed characters.  It
+ * 's' is a pointer to the first byte of the input character
+ * 'd' will be set to the first byte of the string of changed characters.  It
   *     needs to have space for UTF8_MAXBYTES_CASE+1 bytes
   *     needs to have space for UTF8_MAXBYTES_CASE+1 bytes
- * LENP will be set to the length in bytes of the string of changed characters
+ * 'lenp' will be set to the length in bytes of the string of changed characters
   *
   *
- * The functions return the ordinal of the first character in the string of OUTP */
-#define CALL_UPPER_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_toupper, "ToUc", "")
-#define CALL_TITLE_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_totitle, "ToTc", "")
-#define CALL_LOWER_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_tolower, "ToLc", "")
+ * The functions return the ordinal of the first character in the string of
+ * 'd' */
+#define CALL_UPPER_CASE(uv, s, d, lenp)                                     \
+                _to_utf8_case(uv, s, d, lenp, PL_utf8_toupper,              \
+                                              Uppercase_Mapping_invmap,     \
+                                              UC_AUX_TABLE_ptrs,            \
+                                              UC_AUX_TABLE_lengths,         \
+                                              "uppercase")
+#define CALL_TITLE_CASE(uv, s, d, lenp)                                     \
+                _to_utf8_case(uv, s, d, lenp, PL_utf8_totitle,              \
+                                              Titlecase_Mapping_invmap,     \
+                                              TC_AUX_TABLE_ptrs,            \
+                                              TC_AUX_TABLE_lengths,         \
+                                              "titlecase")
+#define CALL_LOWER_CASE(uv, s, d, lenp)                                     \
+                _to_utf8_case(uv, s, d, lenp, PL_utf8_tolower,              \
+                                              Lowercase_Mapping_invmap,     \
+                                              LC_AUX_TABLE_ptrs,            \
+                                              LC_AUX_TABLE_lengths,         \
+                                              "lowercase")
+
  
  /* This additionally has the input parameter 'specials', which if non-zero will
   * cause this to use the specials hash for folding (meaning get full case
   * folding); otherwise, when zero, this implies a simple case fold */
  
  /* This additionally has the input parameter 'specials', which if non-zero will
   * cause this to use the specials hash for folding (meaning get full case
   * folding); otherwise, when zero, this implies a simple case fold */
-#define CALL_FOLD_CASE(uv, s, d, lenp, specials) _to_utf8_case(uv, s, d, lenp, &PL_utf8_tofold, "ToCf", (specials) ? "" : NULL)
+#define CALL_FOLD_CASE(uv, s, d, lenp, specials)                            \
+        (specials)                                                          \
+        ?  _to_utf8_case(uv, s, d, lenp, PL_utf8_tofold,                    \
+                                          Case_Folding_invmap,              \
+                                          CF_AUX_TABLE_ptrs,                \
+                                          CF_AUX_TABLE_lengths,             \
+                                          "foldcase")                       \
+        : _to_utf8_case(uv, s, d, lenp, PL_utf8_tosimplefold,               \
+                                         Simple_Case_Folding_invmap,        \
+                                         NULL, NULL,                        \
+                                         "foldcase")
  
  UV
  Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
  
  UV
  Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
@@ -2401,27 +2961,27 @@ Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
       * The ordinal of the first character of the changed version is returned
       * (but note, as explained above, that there may be more.) */
  
       * The ordinal of the first character of the changed version is returned
       * (but note, as explained above, that there may be more.) */
  
+    dVAR;
      PERL_ARGS_ASSERT_TO_UNI_UPPER;
  
      if (c < 256) {
         return _to_upper_title_latin1((U8) c, p, lenp, 'S');
      }
  
      PERL_ARGS_ASSERT_TO_UNI_UPPER;
  
      if (c < 256) {
         return _to_upper_title_latin1((U8) c, p, lenp, 'S');
      }
  
-    uvchr_to_utf8(p, c);
-    return CALL_UPPER_CASE(c, p, p, lenp);
+    return CALL_UPPER_CASE(c, NULL, p, lenp);
  }
  
  UV
  Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
  {
  }
  
  UV
  Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
  {
+    dVAR;
      PERL_ARGS_ASSERT_TO_UNI_TITLE;
  
      if (c < 256) {
         return _to_upper_title_latin1((U8) c, p, lenp, 's');
      }
  
      PERL_ARGS_ASSERT_TO_UNI_TITLE;
  
      if (c < 256) {
         return _to_upper_title_latin1((U8) c, p, lenp, 's');
      }
  
-    uvchr_to_utf8(p, c);
-    return CALL_TITLE_CASE(c, p, p, lenp);
+    return CALL_TITLE_CASE(c, NULL, p, lenp);
  }
  
  STATIC U8
  }
  
  STATIC U8
@@ -2454,18 +3014,18 @@ S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp, const char dummy)
  UV
  Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
  {
  UV
  Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
  {
+    dVAR;
      PERL_ARGS_ASSERT_TO_UNI_LOWER;
  
      if (c < 256) {
         return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ );
      }
  
      PERL_ARGS_ASSERT_TO_UNI_LOWER;
  
      if (c < 256) {
         return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ );
      }
  
-    uvchr_to_utf8(p, c);
-    return CALL_LOWER_CASE(c, p, p, lenp);
+    return CALL_LOWER_CASE(c, NULL, p, lenp);
  }
  
  UV
  }
  
  UV
-Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
+Perl__to_fold_latin1(const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
  {
      /* Corresponds to to_lower_latin1(); <flags> bits meanings:
       *     FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
  {
      /* Corresponds to to_lower_latin1(); <flags> bits meanings:
       *     FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
@@ -2477,7 +3037,6 @@ Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int f
      UV converted;
  
      PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
      UV converted;
  
      PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
-    PERL_UNUSED_CONTEXT;
  
      assert (! (flags & FOLD_FLAGS_LOCALE));
  
  
      assert (! (flags & FOLD_FLAGS_LOCALE));
  
@@ -2537,15 +3096,17 @@ Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
       *     FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
       */
  
       *     FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
       */
  
+    dVAR;
      PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
  
      if (flags & FOLD_FLAGS_LOCALE) {
      PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
  
      if (flags & FOLD_FLAGS_LOCALE) {
-        /* Treat a UTF-8 locale as not being in locale at all */
-        if (IN_UTF8_CTYPE_LOCALE) {
+        /* Treat a non-Turkic UTF-8 locale as not being in locale at all,
+         * except for potentially warning */
+        _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+        if (IN_UTF8_CTYPE_LOCALE && ! PL_in_utf8_turkic_locale) {
              flags &= ~FOLD_FLAGS_LOCALE;
          }
          else {
              flags &= ~FOLD_FLAGS_LOCALE;
          }
          else {
-            _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
              goto needs_full_generality;
          }
      }
              goto needs_full_generality;
          }
      }
@@ -2557,8 +3118,7 @@ Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
  
      /* Here, above 255.  If no special needs, just use the macro */
      if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
  
      /* Here, above 255.  If no special needs, just use the macro */
      if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
-       uvchr_to_utf8(p, c);
-       return CALL_FOLD_CASE(c, p, p, lenp, flags & FOLD_FLAGS_FULL);
+       return CALL_FOLD_CASE(c, NULL, p, lenp, flags & FOLD_FLAGS_FULL);
      }
      else {  /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with
                the special flags. */
      }
      else {  /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with
                the special flags. */
@@ -2566,26 +3126,23 @@ Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
  
        needs_full_generality:
         uvchr_to_utf8(utf8_c, c);
  
        needs_full_generality:
         uvchr_to_utf8(utf8_c, c);
-       return _toFOLD_utf8_flags(utf8_c, utf8_c + sizeof(utf8_c), p, lenp, flags);
+       return _toFOLD_utf8_flags(utf8_c, utf8_c + sizeof(utf8_c),
+                                  p, lenp, flags);
      }
  }
  
  PERL_STATIC_INLINE bool
      }
  }
  
  PERL_STATIC_INLINE bool
-S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
-                const char *const swashname, SV* const invlist)
+S_is_utf8_common(pTHX_ const U8 *const p, SV* const invlist)
  {
      /* returns a boolean giving whether or not the UTF8-encoded character that
  {
      /* returns a boolean giving whether or not the UTF8-encoded character that
-     * starts at <p> is in the swash indicated by <swashname>.  <swash>
-     * contains a pointer to where the swash indicated by <swashname>
-     * is to be stored; which this routine will do, so that future calls will
-     * look at <*swash> and only generate a swash if it is not null.  <invlist>
-     * is NULL or an inversion list that defines the swash.  If not null, it
-     * saves time during initialization of the swash.
+     * starts at <p> is in the inversion list indicated by <invlist>.
       *
       * Note that it is assumed that the buffer length of <p> is enough to
       * contain all the bytes that comprise the character.  Thus, <*p> should
       * have been checked before this call for mal-formedness enough to assure
       *
       * Note that it is assumed that the buffer length of <p> is enough to
       * contain all the bytes that comprise the character.  Thus, <*p> should
       * have been checked before this call for mal-formedness enough to assure
-     * that. */
+     * that.  This function does make sure not to look past any NUL, so it is
+     * safe to use on NUL-terminated C strings */
+    STRLEN len = my_strnlen((char *) p, UTF8SKIP(p));
  
      PERL_ARGS_ASSERT_IS_UTF8_COMMON;
  
  
      PERL_ARGS_ASSERT_IS_UTF8_COMMON;
  
@@ -2594,59 +3151,34 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
       * as far as there being enough bytes available in it to accommodate the
       * character without reading beyond the end, and pass that number on to the
       * validating routine */
       * as far as there being enough bytes available in it to accommodate the
       * character without reading beyond the end, and pass that number on to the
       * validating routine */
-    if (! isUTF8_CHAR(p, p + UTF8SKIP(p))) {
-        _force_out_malformed_utf8_message(p, p + UTF8SKIP(p),
-                                          _UTF8_NO_CONFIDENCE_IN_CURLEN,
+    if (! isUTF8_CHAR(p, p + len)) {
+        _force_out_malformed_utf8_message(p, p + len, _UTF8_NO_CONFIDENCE_IN_CURLEN,
                                            1 /* Die */ );
          NOT_REACHED; /* NOTREACHED */
      }
  
                                            1 /* Die */ );
          NOT_REACHED; /* NOTREACHED */
      }
  
-    if (!*swash) {
-        U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
-        *swash = _core_swash_init("utf8",
-
-                                  /* Only use the name if there is no inversion
-                                   * list; otherwise will go out to disk */
-                                  (invlist) ? "" : swashname,
-
-                                  &PL_sv_undef, 1, 0, invlist, &flags);
-    }
-
-    return swash_fetch(*swash, p, TRUE) != 0;
+    return is_utf8_common_with_len(p, p + len, invlist);
  }
  
  PERL_STATIC_INLINE bool
  }
  
  PERL_STATIC_INLINE bool
-S_is_utf8_common_with_len(pTHX_ const U8 *const p, const U8 * const e, SV **swash,
-                         const char *const swashname, SV* const invlist)
+S_is_utf8_common_with_len(pTHX_ const U8 *const p, const U8 * const e,
+                          SV* const invlist)
  {
      /* returns a boolean giving whether or not the UTF8-encoded character that
  {
      /* returns a boolean giving whether or not the UTF8-encoded character that
-     * starts at <p>, and extending no further than <e - 1> is in the swash
-     * indicated by <swashname>.  <swash> contains a pointer to where the swash
-     * indicated by <swashname> is to be stored; which this routine will do, so
-     * that future calls will look at <*swash> and only generate a swash if it
-     * is not null.  <invlist> is NULL or an inversion list that defines the
-     * swash.  If not null, it saves time during initialization of the swash.
-     */
+     * starts at <p>, and extending no further than <e - 1> is in the inversion
+     * list <invlist>. */
+
+    UV cp = utf8n_to_uvchr(p, e - p, NULL, 0);
  
      PERL_ARGS_ASSERT_IS_UTF8_COMMON_WITH_LEN;
  
  
      PERL_ARGS_ASSERT_IS_UTF8_COMMON_WITH_LEN;
  
-    if (! isUTF8_CHAR(p, e)) {
+    if (cp == 0 && (p >= e || *p != '\0')) {
          _force_out_malformed_utf8_message(p, e, 0, 1);
          NOT_REACHED; /* NOTREACHED */
      }
  
          _force_out_malformed_utf8_message(p, e, 0, 1);
          NOT_REACHED; /* NOTREACHED */
      }
  
-    if (!*swash) {
-        U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
-        *swash = _core_swash_init("utf8",
-
-                                  /* Only use the name if there is no inversion
-                                   * list; otherwise will go out to disk */
-                                  (invlist) ? "" : swashname,
-
-                                  &PL_sv_undef, 1, 0, invlist, &flags);
-    }
-
-    return swash_fetch(*swash, p, TRUE) != 0;
+    assert(invlist);
+    return _invlist_contains_cp(invlist, cp);
  }
  
  STATIC void
  }
  
  STATIC void
@@ -2675,14 +3207,14 @@ S_warn_on_first_deprecated_use(pTHX_ const char * const name,
  
              if (instr(file, "mathoms.c")) {
                  Perl_warner(aTHX_ WARN_DEPRECATED,
  
              if (instr(file, "mathoms.c")) {
                  Perl_warner(aTHX_ WARN_DEPRECATED,
-                            "In %s, line %d, starting in Perl v5.30, %s()"
+                            "In %s, line %d, starting in Perl v5.32, %s()"
                              " will be removed.  Avoid this message by"
                              " converting to use %s().\n",
                              file, line, name, alternative);
              }
              else {
                  Perl_warner(aTHX_ WARN_DEPRECATED,
                              " will be removed.  Avoid this message by"
                              " converting to use %s().\n",
                              file, line, name, alternative);
              }
              else {
                  Perl_warner(aTHX_ WARN_DEPRECATED,
-                            "In %s, line %d, starting in Perl v5.30, %s() will"
+                            "In %s, line %d, starting in Perl v5.32, %s() will"
                              " require an additional parameter.  Avoid this"
                              " message by converting to use %s().\n",
                              file, line, name, alternative);
                              " require an additional parameter.  Avoid this"
                              " message by converting to use %s().\n",
                              file, line, name, alternative);
@@ -2701,6 +3233,7 @@ Perl__is_utf8_FOO(pTHX_       U8   classnum,
                          const char * const file,
                          const unsigned line)
  {
                          const char * const file,
                          const unsigned line)
  {
+    dVAR;
      PERL_ARGS_ASSERT__IS_UTF8_FOO;
  
      warn_on_first_deprecated_use(name, alternative, use_locale, file, line);
      PERL_ARGS_ASSERT__IS_UTF8_FOO;
  
      warn_on_first_deprecated_use(name, alternative, use_locale, file, line);
@@ -2719,10 +3252,7 @@ Perl__is_utf8_FOO(pTHX_       U8   classnum,
              case _CC_GRAPH:
              case _CC_CASED:
  
              case _CC_GRAPH:
              case _CC_CASED:
  
-                return is_utf8_common(p,
-                                      &PL_utf8_swash_ptrs[classnum],
-                                      swash_property_names[classnum],
-                                      PL_XPosix_ptrs[classnum]);
+                return is_utf8_common(p, PL_XPosix_ptrs[classnum]);
  
              case _CC_SPACE:
                  return is_XPERLSPACE_high(p);
  
              case _CC_SPACE:
                  return is_XPERLSPACE_high(p);
@@ -2737,19 +3267,9 @@ Perl__is_utf8_FOO(pTHX_       U8   classnum,
              case _CC_VERTSPACE:
                  return is_VERTWS_high(p);
              case _CC_IDFIRST:
              case _CC_VERTSPACE:
                  return is_VERTWS_high(p);
              case _CC_IDFIRST:
-                if (! PL_utf8_perl_idstart) {
-                    PL_utf8_perl_idstart
-                                = _new_invlist_C_array(_Perl_IDStart_invlist);
-                }
-                return is_utf8_common(p, &PL_utf8_perl_idstart,
-                                      "_Perl_IDStart", NULL);
+                return is_utf8_common(p, PL_utf8_perl_idstart);
              case _CC_IDCONT:
              case _CC_IDCONT:
-                if (! PL_utf8_perl_idcont) {
-                    PL_utf8_perl_idcont
-                                = _new_invlist_C_array(_Perl_IDCont_invlist);
-                }
-                return is_utf8_common(p, &PL_utf8_perl_idcont,
-                                      "_Perl_IDCont", NULL);
+                return is_utf8_common(p, PL_utf8_perl_idcont);
          }
      }
  
          }
      }
  
@@ -2786,86 +3306,88 @@ bool
  Perl__is_utf8_FOO_with_len(pTHX_ const U8 classnum, const U8 *p,
                                                              const U8 * const e)
  {
  Perl__is_utf8_FOO_with_len(pTHX_ const U8 classnum, const U8 *p,
                                                              const U8 * const e)
  {
+    dVAR;
      PERL_ARGS_ASSERT__IS_UTF8_FOO_WITH_LEN;
  
      PERL_ARGS_ASSERT__IS_UTF8_FOO_WITH_LEN;
  
-    assert(classnum < _FIRST_NON_SWASH_CC);
-
-    return is_utf8_common_with_len(p,
-                                   e,
-                                   &PL_utf8_swash_ptrs[classnum],
-                                   swash_property_names[classnum],
-                                   PL_XPosix_ptrs[classnum]);
+    return is_utf8_common_with_len(p, e, PL_XPosix_ptrs[classnum]);
  }
  
  bool
  Perl__is_utf8_perl_idstart_with_len(pTHX_ const U8 *p, const U8 * const e)
  {
  }
  
  bool
  Perl__is_utf8_perl_idstart_with_len(pTHX_ const U8 *p, const U8 * const e)
  {
-    SV* invlist = NULL;
-
+    dVAR;
      PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART_WITH_LEN;
  
      PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART_WITH_LEN;
  
-    if (! PL_utf8_perl_idstart) {
-        invlist = _new_invlist_C_array(_Perl_IDStart_invlist);
-    }
-    return is_utf8_common_with_len(p, e, &PL_utf8_perl_idstart,
-                                      "_Perl_IDStart", invlist);
+    return is_utf8_common_with_len(p, e, PL_utf8_perl_idstart);
  }
  
  bool
  Perl__is_utf8_xidstart(pTHX_ const U8 *p)
  {
  }
  
  bool
  Perl__is_utf8_xidstart(pTHX_ const U8 *p)
  {
+    dVAR;
      PERL_ARGS_ASSERT__IS_UTF8_XIDSTART;
  
      if (*p == '_')
         return TRUE;
      PERL_ARGS_ASSERT__IS_UTF8_XIDSTART;
  
      if (*p == '_')
         return TRUE;
-    return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart", NULL);
+    return is_utf8_common(p, PL_utf8_xidstart);
  }
  
  bool
  Perl__is_utf8_perl_idcont_with_len(pTHX_ const U8 *p, const U8 * const e)
  {
  }
  
  bool
  Perl__is_utf8_perl_idcont_with_len(pTHX_ const U8 *p, const U8 * const e)
  {
-    SV* invlist = NULL;
-
+    dVAR;
      PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT_WITH_LEN;
  
      PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT_WITH_LEN;
  
-    if (! PL_utf8_perl_idcont) {
-        invlist = _new_invlist_C_array(_Perl_IDCont_invlist);
-    }
-    return is_utf8_common_with_len(p, e, &PL_utf8_perl_idcont,
-                                   "_Perl_IDCont", invlist);
+    return is_utf8_common_with_len(p, e, PL_utf8_perl_idcont);
  }
  
  bool
  Perl__is_utf8_idcont(pTHX_ const U8 *p)
  {
  }
  
  bool
  Perl__is_utf8_idcont(pTHX_ const U8 *p)
  {
+    dVAR;
      PERL_ARGS_ASSERT__IS_UTF8_IDCONT;
  
      PERL_ARGS_ASSERT__IS_UTF8_IDCONT;
  
-    return is_utf8_common(p, &PL_utf8_idcont, "IdContinue", NULL);
+    return is_utf8_common(p, PL_utf8_idcont);
  }
  
  bool
  Perl__is_utf8_xidcont(pTHX_ const U8 *p)
  {
  }
  
  bool
  Perl__is_utf8_xidcont(pTHX_ const U8 *p)
  {
+    dVAR;
      PERL_ARGS_ASSERT__IS_UTF8_XIDCONT;
  
      PERL_ARGS_ASSERT__IS_UTF8_XIDCONT;
  
-    return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue", NULL);
+    return is_utf8_common(p, PL_utf8_xidcont);
  }
  
  bool
  Perl__is_utf8_mark(pTHX_ const U8 *p)
  {
  }
  
  bool
  Perl__is_utf8_mark(pTHX_ const U8 *p)
  {
+    dVAR;
      PERL_ARGS_ASSERT__IS_UTF8_MARK;
  
      PERL_ARGS_ASSERT__IS_UTF8_MARK;
  
-    return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL);
+    return is_utf8_common(p, PL_utf8_mark);
  }
  
  }
  
-    /* change namve uv1 to 'from' */
  STATIC UV
  STATIC UV
-S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp,
-               SV **swashp, const char *normal, const char *special)
+S__to_utf8_case(pTHX_ const UV uv1, const U8 *p,
+                      U8* ustrp, STRLEN *lenp,
+                      SV *invlist, const int * const invmap,
+                      const unsigned int * const * const aux_tables,
+                      const U8 * const aux_table_lengths,
+                      const char * const normal)
  {
      STRLEN len = 0;
  
  {
      STRLEN len = 0;
  
+    /* Change the case of code point 'uv1' whose UTF-8 representation (assumed
+     * by this routine to be valid) begins at 'p'.  'normal' is a string to use
+     * to name the new case in any generated messages, as a fallback if the
+     * operation being used is not available.  The new case is given by the
+     * data structures in the remaining arguments.
+     *
+     * On return 'ustrp' points to '*lenp' UTF-8 encoded bytes representing the
+     * entire changed case string, and the return value is the first code point
+     * in that string */
+
      PERL_ARGS_ASSERT__TO_UTF8_CASE;
  
      /* For code points that don't change case, we already know that the output
      PERL_ARGS_ASSERT__TO_UTF8_CASE;
  
      /* For code points that don't change case, we already know that the output
@@ -2920,7 +3442,8 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp,
                      if (ckWARN_d(WARN_SURROGATE)) {
                          const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
                          Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
                      if (ckWARN_d(WARN_SURROGATE)) {
                          const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
                          Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
-                            "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04" UVXf, desc, uv1);
+                            "Operation \"%s\" returns its argument for"
+                            " UTF-16 surrogate U+%04" UVXf, desc, uv1);
                      }
                      goto cases_to_self;
                  }
                      }
                      goto cases_to_self;
                  }
@@ -2929,20 +3452,18 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp,
                   * some others */
                  if (uv1 < 0xFB00) {
                      goto cases_to_self;
                   * some others */
                  if (uv1 < 0xFB00) {
                      goto cases_to_self;
-
                  }
  
                  if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
                  }
  
                  if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
-                    if (   UNLIKELY(uv1 > MAX_NON_DEPRECATED_CP)
-                        && ckWARN_d(WARN_DEPRECATED))
-                    {
-                        Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
-                                cp_above_legal_max, uv1, MAX_NON_DEPRECATED_CP);
+                    if (UNLIKELY(uv1 > MAX_LEGAL_CP)) {
+                        Perl_croak(aTHX_ cp_above_legal_max, uv1,
+                                         MAX_LEGAL_CP);
                      }
                      if (ckWARN_d(WARN_NON_UNICODE)) {
                          const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
                          Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
                      }
                      if (ckWARN_d(WARN_NON_UNICODE)) {
                          const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
                          Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
-                            "Operation \"%s\" returns its argument for non-Unicode code point 0x%04" UVXf, desc, uv1);
+                            "Operation \"%s\" returns its argument for"
+                            " non-Unicode code point 0x%04" UVXf, desc, uv1);
                      }
                      goto cases_to_self;
                  }
                      }
                      goto cases_to_self;
                  }
@@ -2951,8 +3472,8 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp,
                      > HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C))
                  {
  
                      > HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C))
                  {
  
-                    /* As of this writing, this means we avoid swash creation
-                     * for anything beyond low Plane 1 */
+                    /* As of Unicode 10.0, this means we avoid the table lookup
+                     * for anything beyond high Plane 1 (below emojis)  */
                      goto cases_to_self;
                  }
  #endif
                      goto cases_to_self;
                  }
  #endif
@@ -2960,101 +3481,158 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp,
          }
  
         /* Note that non-characters are perfectly legal, so no warning should
          }
  
         /* Note that non-characters are perfectly legal, so no warning should
-         * be given.  There are so few of them, that it isn't worth the extra
-         * tests to avoid swash creation */
+         * be given. */
      }
  
      }
  
-    if (!*swashp) /* load on-demand */
-         *swashp = _core_swash_init("utf8", normal, &PL_sv_undef, 4, 0, NULL, NULL);
+    {
+        unsigned int i;
+        const unsigned int * cp_list;
+        U8 * d;
  
  
-    if (special) {
-         /* It might be "special" (sometimes, but not always,
-         * a multicharacter mapping) */
-         HV *hv = NULL;
-        SV **svp;
+        /* 'index' is guaranteed to be non-negative, as this is an inversion
+         * map that covers all possible inputs.  See [perl #133365] */
+        SSize_t index = _invlist_search(invlist, uv1);
+        IV base = invmap[index];
  
  
-        /* If passed in the specials name, use that; otherwise use any
-         * given in the swash */
-         if (*special != '\0') {
-            hv = get_hv(special, 0);
-        }
-        else {
-            svp = hv_fetchs(MUTABLE_HV(SvRV(*swashp)), "SPECIALS", 0);
-            if (svp) {
-                hv = MUTABLE_HV(SvRV(*svp));
-            }
-        }
+        /* The data structures are set up so that if 'base' is non-negative,
+         * the case change is 1-to-1; and if 0, the change is to itself */
+        if (base >= 0) {
+            IV lc;
  
  
-        if (hv
-             && (svp = hv_fetch(hv, (const char*)p, UVCHR_SKIP(uv1), FALSE))
-             && (*svp))
-         {
-            const char *s;
-
-             s = SvPV_const(*svp, len);
-             if (len == 1)
-                  /* EIGHTBIT */
-                  len = uvchr_to_utf8(ustrp, *(U8*)s) - ustrp;
-             else {
-                  Copy(s, ustrp, len, U8);
-             }
-        }
-    }
+            if (base == 0) {
+                goto cases_to_self;
+            }
  
  
-    if (!len && *swashp) {
-       const UV uv2 = swash_fetch(*swashp, p, TRUE /* => is UTF-8 */);
+            /* This computes, e.g. lc(H) as 'H - A + a', using the lc table */
+            lc = base + uv1 - invlist_array(invlist)[index];
+            *lenp = uvchr_to_utf8(ustrp, lc) - ustrp;
+            return lc;
+        }
  
  
-        if (uv2) {
-             /* It was "normal" (a single character mapping). */
-             len = uvchr_to_utf8(ustrp, uv2) - ustrp;
-        }
-    }
+        /* Here 'base' is negative.  That means the mapping is 1-to-many, and
+         * requires an auxiliary table look up.  abs(base) gives the index into
+         * a list of such tables which points to the proper aux table.  And a
+         * parallel list gives the length of each corresponding aux table. */
+        cp_list = aux_tables[-base];
  
  
-    if (len) {
-        if (lenp) {
-            *lenp = len;
+        /* Create the string of UTF-8 from the mapped-to code points */
+        d = ustrp;
+        for (i = 0; i < aux_table_lengths[-base]; i++) {
+            d = uvchr_to_utf8(d, cp_list[i]);
          }
          }
-        return valid_utf8_to_uvchr(ustrp, 0);
+        *d = '\0';
+        *lenp = d - ustrp;
+
+        return cp_list[0];
      }
  
      /* Here, there was no mapping defined, which means that the code point maps
       * to itself.  Return the inputs */
    cases_to_self:
      }
  
      /* Here, there was no mapping defined, which means that the code point maps
       * to itself.  Return the inputs */
    cases_to_self:
-    len = UTF8SKIP(p);
-    if (p != ustrp) {   /* Don't copy onto itself */
-        Copy(p, ustrp, len, U8);
+    if (p) {
+        len = UTF8SKIP(p);
+        if (p != ustrp) {   /* Don't copy onto itself */
+            Copy(p, ustrp, len, U8);
+        }
+        *lenp = len;
+    }
+    else {
+       *lenp = uvchr_to_utf8(ustrp, uv1) - ustrp;
      }
      }
-
-    if (lenp)
-        *lenp = len;
  
      return uv1;
  
  }
  
  
      return uv1;
  
  }
  
-STATIC UV
-S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
+Size_t
+Perl__inverse_folds(pTHX_ const UV cp, unsigned int * first_folds_to,
+                          const unsigned int ** remaining_folds_to)
  {
  {
-    /* This is called when changing the case of a UTF-8-encoded character above
-     * the Latin1 range, and the operation is in a non-UTF-8 locale.  If the
-     * result contains a character that crosses the 255/256 boundary, disallow
-     * the change, and return the original code point.  See L<perlfunc/lc> for
-     * why;
+    /* Returns the count of the number of code points that fold to the input
+     * 'cp' (besides itself).
       *
       *
-     * p       points to the original string whose case was changed; assumed
-     *          by this routine to be well-formed
-     * result  the code point of the first character in the changed-case string
-     * ustrp   points to the changed-case string (<result> represents its first char)
-     * lenp    points to the length of <ustrp> */
-
-    UV original;    /* To store the first code point of <p> */
+     * If the return is 0, there is nothing else that folds to it, and
+     * '*first_folds_to' is set to 0, and '*remaining_folds_to' is set to NULL.
+     *
+     * If the return is 1, '*first_folds_to' is set to the single code point,
+     * and '*remaining_folds_to' is set to NULL.
+     *
+     * Otherwise, '*first_folds_to' is set to a code point, and
+     * '*remaining_folds_to' is set to an array that contains the others.  The
+     * length of this array is the returned count minus 1.
+     *
+     * The reason for this convolution is to avoid having to deal with
+     * allocating and freeing memory.  The lists are already constructed, so
+     * the return can point to them, but single code points aren't, so would
+     * need to be constructed if we didn't employ something like this API */
+
+    dVAR;
+    /* 'index' is guaranteed to be non-negative, as this is an inversion map
+     * that covers all possible inputs.  See [perl #133365] */
+    SSize_t index = _invlist_search(PL_utf8_foldclosures, cp);
+    int base = _Perl_IVCF_invmap[index];
+
+    PERL_ARGS_ASSERT__INVERSE_FOLDS;
+
+    if (base == 0) {            /* No fold */
+        *first_folds_to = 0;
+        *remaining_folds_to = NULL;
+        return 0;
+    }
+
+#ifndef HAS_IVCF_AUX_TABLES     /* This Unicode version only has 1-1 folds */
+
+    assert(base > 0);
+
+#else
+
+    if (UNLIKELY(base < 0)) {   /* Folds to more than one character */
+
+        /* The data structure is set up so that the absolute value of 'base' is
+         * an index into a table of pointers to arrays, with the array
+         * corresponding to the index being the list of code points that fold
+         * to 'cp', and the parallel array containing the length of the list
+         * array */
+        *first_folds_to = IVCF_AUX_TABLE_ptrs[-base][0];
+        *remaining_folds_to = IVCF_AUX_TABLE_ptrs[-base] + 1; /* +1 excludes
+                                                                 *first_folds_to
+                                                                */
+        return IVCF_AUX_TABLE_lengths[-base];
+    }
+
+#endif
+
+    /* Only the single code point.  This works like 'fc(G) = G - A + a' */
+    *first_folds_to = base + cp - invlist_array(PL_utf8_foldclosures)[index];
+    *remaining_folds_to = NULL;
+    return 1;
+}
+
+STATIC UV
+S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result,
+                                       U8* const ustrp, STRLEN *lenp)
+{
+    /* This is called when changing the case of a UTF-8-encoded character above
+     * the Latin1 range, and the operation is in a non-UTF-8 locale.  If the
+     * result contains a character that crosses the 255/256 boundary, disallow
+     * the change, and return the original code point.  See L<perlfunc/lc> for
+     * why;
+     *
+     * p       points to the original string whose case was changed; assumed
+     *          by this routine to be well-formed
+     * result  the code point of the first character in the changed-case string
+     * ustrp   points to the changed-case string (<result> represents its
+     *          first char)
+     * lenp    points to the length of <ustrp> */
+
+    UV original;    /* To store the first code point of <p> */
  
      PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
  
      assert(UTF8_IS_ABOVE_LATIN1(*p));
  
      /* We know immediately if the first character in the string crosses the
  
      PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
  
      assert(UTF8_IS_ABOVE_LATIN1(*p));
  
      /* We know immediately if the first character in the string crosses the
-     * boundary, so can skip */
+     * boundary, so can skip testing */
      if (result > 255) {
  
         /* Look at every character in the result; if any cross the
      if (result > 255) {
  
         /* Look at every character in the result; if any cross the
@@ -3080,8 +3658,8 @@ S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* c
  
      /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
      Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
  
      /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
      Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
-                           "Can't do %s(\"\\x{%" UVXf "}\") on non-UTF-8 locale; "
-                           "resolved to \"\\x{%" UVXf "}\".",
+                           "Can't do %s(\"\\x{%" UVXf "}\") on non-UTF-8"
+                           " locale; resolved to \"\\x{%" UVXf "}\".",
                             OP_DESC(PL_op),
                             original,
                             original);
                             OP_DESC(PL_op),
                             original,
                             original);
@@ -3128,7 +3706,10 @@ S_check_and_deprecate(pTHX_ const U8 *p,
  
      if (*e == NULL) {
          utf8n_flags = _UTF8_NO_CONFIDENCE_IN_CURLEN;
  
      if (*e == NULL) {
          utf8n_flags = _UTF8_NO_CONFIDENCE_IN_CURLEN;
-        *e = p + UTF8SKIP(p);
+
+        /* strnlen() makes this function safe for the common case of
+         * NUL-terminated strings */
+        *e = p + my_strnlen((char *) p, UTF8SKIP(p));
  
          /* For mathoms.c calls, we use the function name we know is stored
           * there.  It could be part of a larger path */
  
          /* For mathoms.c calls, we use the function name we know is stored
           * there.  It could be part of a larger path */
@@ -3167,6 +3748,120 @@ S_check_and_deprecate(pTHX_ const U8 *p,
      return utf8n_flags;
  }
  
      return utf8n_flags;
  }
  
+STATIC UV
+S_turkic_fc(pTHX_ const U8 * const p, const U8 * const e,
+                        U8 * ustrp, STRLEN *lenp)
+{
+    /* Returns 0 if the foldcase of the input UTF-8 encoded sequence from
+     * p..e-1 according to Turkic rules is the same as for non-Turkic.
+     * Otherwise, it returns the first code point of the Turkic foldcased
+     * sequence, and the entire sequence will be stored in *ustrp.  ustrp will
+     * contain *lenp bytes
+     *
+     * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER
+     * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER
+     * DOTLESS I */
+
+    PERL_ARGS_ASSERT_TURKIC_FC;
+    assert(e > p);
+
+    if (UNLIKELY(*p == 'I')) {      /* 'I' folds to the dotless i */
+        *lenp = 2;                  /* which is written as two bytes */
+        ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_SMALL_LETTER_DOTLESS_I);
+        ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_SMALL_LETTER_DOTLESS_I);
+        return LATIN_SMALL_LETTER_DOTLESS_I;
+    }
+
+    if (UNLIKELY(memBEGINs(p, e - p,    /* The dotted 'I' folds to 'i' */
+                           LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)))
+    {
+        *lenp = 1;
+        *ustrp = 'i';
+        return 'i';
+    }
+
+    return 0;   /* Anything else: no Turkic-specific fold */
+}
+
+STATIC UV
+S_turkic_lc(pTHX_ const U8 * const p0, const U8 * const e,
+                        U8 * ustrp, STRLEN *lenp)
+{
+    /* Returns 0 if the lowercase of the input UTF-8 encoded sequence from
+     * p0..e-1 according to Turkic rules is the same as for non-Turkic.
+     * Otherwise, it returns the first code point of the Turkic lowercased
+     * sequence, and the entire sequence will be stored in *ustrp.  ustrp will
+     * contain *lenp bytes */
+
+    dVAR;
+    PERL_ARGS_ASSERT_TURKIC_LC;
+    assert(e > p0);
+
+    /* An 'I' requires context as to what to do */
+    if (UNLIKELY(*p0 == 'I')) {
+        const U8 * p = p0 + 1;
+
+        /* According to the Unicode SpecialCasing.txt file, a capital 'I'
+         * modified by a dot above lowercases to 'i' even in Turkic locales. */
+        while (p < e) {
+            UV cp;
+
+            if (memBEGINs(p, e - p, COMBINING_DOT_ABOVE_UTF8)) {
+                ustrp[0] = 'i';
+                *lenp = 1;
+                return 'i';
+            }
+
+            /* For the dot above to modify the 'I', it must be part of a
+             * combining sequence immediately following the 'I', and no other
+             * modifier with a ccc of 230 may intervene */
+            cp = utf8_to_uvchr_buf(p, e, NULL);
+            if (! _invlist_contains_cp(PL_CCC_non0_non230, cp)) {
+                break;
+            }
+
+            /* Here the combining sequence continues */
+            p += UTF8SKIP(p);
+        }
+    }
+
+    /* In all other cases the lc is the same as the fold */
+    return turkic_fc(p0, e, ustrp, lenp);
+}
+
+STATIC UV
+S_turkic_uc(pTHX_ const U8 * const p, const U8 * const e,
+                        U8 * ustrp, STRLEN *lenp)
+{
+    /* Returns 0 if the upper or title-case of the input UTF-8 encoded sequence
+     * from p..e-1 according to Turkic rules is the same as for non-Turkic.
+     * Otherwise, it returns the first code point of the Turkic upper or
+     * title-cased sequence, and the entire sequence will be stored in *ustrp.
+     * ustrp will contain *lenp bytes
+     *
+     * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER
+     * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER
+     * DOTLESS I */
+
+    PERL_ARGS_ASSERT_TURKIC_UC;
+    assert(e > p);
+
+    if (*p == 'i') {                /* 'i' maps to the dotted capital 'I' */
+        *lenp = 2;                  /* which is written as two bytes */
+        ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
+        ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
+        return LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
+    }
+
+    if (memBEGINs(p, e - p, LATIN_SMALL_LETTER_DOTLESS_I_UTF8)) {
+        *lenp = 1;                  /* The dotless 'i' maps to plain 'I' */
+        *ustrp = 'I';
+        return 'I';
+    }
+
+    return 0;   /* Anything else: no Turkic-specific mapping */
+}
+
  /* The process for changing the case is essentially the same for the four case
   * change types, except there are complications for folding.  Otherwise the
   * difference is only which case to change to.  To make sure that they all do
  /* The process for changing the case is essentially the same for the four case
   * change types, except there are complications for folding.  Otherwise the
   * difference is only which case to change to.  To make sure that they all do
@@ -3193,19 +3888,26 @@ S_check_and_deprecate(pTHX_ const U8 *p,
   * the input code point calculated from the UTF-8.  The fold code needs to
   * realize all this and take it from there.
   *
   * the input code point calculated from the UTF-8.  The fold code needs to
   * realize all this and take it from there.
   *
+ * To deal with Turkic locales, the function specified by the parameter
+ * 'turkic' is called when appropriate.
+ *
   * If you read the two macros as sequential, it's easier to understand what's
   * going on. */
  #define CASE_CHANGE_BODY_START(locale_flags, LC_L1_change_macro, L1_func,    \
   * If you read the two macros as sequential, it's easier to understand what's
   * going on. */
  #define CASE_CHANGE_BODY_START(locale_flags, LC_L1_change_macro, L1_func,    \
-                               L1_func_extra_param)                          \
+                               L1_func_extra_param, turkic)                  \
                                                                               \
      if (flags & (locale_flags)) {                                            \
                                                                               \
      if (flags & (locale_flags)) {                                            \
-        /* Treat a UTF-8 locale as not being in locale at all */             \
+        _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                  \
          if (IN_UTF8_CTYPE_LOCALE) {                                          \
          if (IN_UTF8_CTYPE_LOCALE) {                                          \
+            if (UNLIKELY(PL_in_utf8_turkic_locale)) {                        \
+                UV ret = turkic(p, e, ustrp, lenp);                          \
+                if (ret) return ret;                                         \
+            }                                                                \
+                                                                             \
+            /* Otherwise, treat a UTF-8 locale as not being in locale at     \
+             * all */                                                        \
              flags &= ~(locale_flags);                                        \
          }                                                                    \
              flags &= ~(locale_flags);                                        \
          }                                                                    \
-        else {                                                               \
-            _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                              \
-        }                                                                    \
      }                                                                        \
                                                                               \
      if (UTF8_IS_INVARIANT(*p)) {                                             \
      }                                                                        \
                                                                               \
      if (UTF8_IS_INVARIANT(*p)) {                                             \
@@ -3217,13 +3919,12 @@ S_check_and_deprecate(pTHX_ const U8 *p,
          }                                                                    \
      }                                                                        \
      else if UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e) {                          \
          }                                                                    \
      }                                                                        \
      else if UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e) {                          \
+        U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));                         \
          if (flags & (locale_flags)) {                                        \
          if (flags & (locale_flags)) {                                        \
-            result = LC_L1_change_macro(EIGHT_BIT_UTF8_TO_NATIVE(*p,         \
-                                                                 *(p+1)));   \
+            result = LC_L1_change_macro(c);                                  \
          }                                                                    \
          else {                                                               \
          }                                                                    \
          else {                                                               \
-            return L1_func(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),             \
-                           ustrp, lenp,  L1_func_extra_param);               \
+            return L1_func(c, ustrp, lenp,  L1_func_extra_param);            \
          }                                                                    \
      }                                                                        \
      else {  /* malformed UTF-8 or ord above 255 */                           \
          }                                                                    \
      }                                                                        \
      else {  /* malformed UTF-8 or ord above 255 */                           \
@@ -3276,6 +3977,7 @@ Perl__to_utf8_upper_flags(pTHX_ const U8 *p,
                                  const char * const file,
                                  const int line)
  {
                                  const char * const file,
                                  const int line)
  {
+    dVAR;
      UV result;
      const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_UPPER,
                                                  cBOOL(flags), file, line);
      UV result;
      const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_UPPER,
                                                  cBOOL(flags), file, line);
@@ -3284,7 +3986,8 @@ Perl__to_utf8_upper_flags(pTHX_ const U8 *p,
  
      /* ~0 makes anything non-zero in 'flags' mean we are using locale rules */
      /* 2nd char of uc(U+DF) is 'S' */
  
      /* ~0 makes anything non-zero in 'flags' mean we are using locale rules */
      /* 2nd char of uc(U+DF) is 'S' */
-    CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 'S');
+    CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 'S',
+                                                                    turkic_uc);
      CASE_CHANGE_BODY_END  (~0, CALL_UPPER_CASE);
  }
  
      CASE_CHANGE_BODY_END  (~0, CALL_UPPER_CASE);
  }
  
@@ -3310,6 +4013,7 @@ Perl__to_utf8_title_flags(pTHX_ const U8 *p,
                                  const char * const file,
                                  const int line)
  {
                                  const char * const file,
                                  const int line)
  {
+    dVAR;
      UV result;
      const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_TITLE,
                                                  cBOOL(flags), file, line);
      UV result;
      const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_TITLE,
                                                  cBOOL(flags), file, line);
@@ -3317,7 +4021,8 @@ Perl__to_utf8_title_flags(pTHX_ const U8 *p,
      PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
  
      /* 2nd char of ucfirst(U+DF) is 's' */
      PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
  
      /* 2nd char of ucfirst(U+DF) is 's' */
-    CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 's');
+    CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 's',
+                                                                    turkic_uc);
      CASE_CHANGE_BODY_END  (~0, CALL_TITLE_CASE);
  }
  
      CASE_CHANGE_BODY_END  (~0, CALL_TITLE_CASE);
  }
  
@@ -3342,13 +4047,15 @@ Perl__to_utf8_lower_flags(pTHX_ const U8 *p,
                                  const char * const file,
                                  const int line)
  {
                                  const char * const file,
                                  const int line)
  {
+    dVAR;
      UV result;
      const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_LOWER,
                                                  cBOOL(flags), file, line);
  
      PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
  
      UV result;
      const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_LOWER,
                                                  cBOOL(flags), file, line);
  
      PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
  
-    CASE_CHANGE_BODY_START(~0, toLOWER_LC, to_lower_latin1, 0 /* 0 is dummy */)
+    CASE_CHANGE_BODY_START(~0, toLOWER_LC, to_lower_latin1, 0 /* 0 is dummy */,
+                                                                    turkic_lc);
      CASE_CHANGE_BODY_END  (~0, CALL_LOWER_CASE)
  }
  
      CASE_CHANGE_BODY_END  (~0, CALL_LOWER_CASE)
  }
  
@@ -3378,6 +4085,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
                                 const char * const file,
                                 const int line)
  {
                                 const char * const file,
                                 const int line)
  {
+    dVAR;
      UV result;
      const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_FOLD,
                                                  cBOOL(flags), file, line);
      UV result;
      const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_FOLD,
                                                  cBOOL(flags), file, line);
@@ -3390,24 +4098,20 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
      assert(p != ustrp); /* Otherwise overwrites */
  
      CASE_CHANGE_BODY_START(FOLD_FLAGS_LOCALE, toFOLD_LC, _to_fold_latin1,
      assert(p != ustrp); /* Otherwise overwrites */
  
      CASE_CHANGE_BODY_START(FOLD_FLAGS_LOCALE, toFOLD_LC, _to_fold_latin1,
-                 ((flags) & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)));
+                 ((flags) & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)),
+                                                                    turkic_fc);
  
         result = CALL_FOLD_CASE(result, p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
  
         if (flags & FOLD_FLAGS_LOCALE) {
  
  #           define LONG_S_T      LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
  
         result = CALL_FOLD_CASE(result, p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
  
         if (flags & FOLD_FLAGS_LOCALE) {
  
  #           define LONG_S_T      LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
-            const unsigned int long_s_t_len    = sizeof(LONG_S_T) - 1;
-
  #         ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
  #           define CAP_SHARP_S   LATIN_CAPITAL_LETTER_SHARP_S_UTF8
  
  #         ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
  #           define CAP_SHARP_S   LATIN_CAPITAL_LETTER_SHARP_S_UTF8
  
-            const unsigned int cap_sharp_s_len = sizeof(CAP_SHARP_S) - 1;
-
              /* Special case these two characters, as what normally gets
               * returned under locale doesn't work */
              /* Special case these two characters, as what normally gets
               * returned under locale doesn't work */
-            if (UTF8SKIP(p) == cap_sharp_s_len
-                && memEQ((char *) p, CAP_SHARP_S, cap_sharp_s_len))
+            if (memBEGINs((char *) p, e - p, CAP_SHARP_S))
              {
                  /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
                  Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
              {
                  /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
                  Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
@@ -3417,8 +4121,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
              }
              else
  #endif
              }
              else
  #endif
-                 if (UTF8SKIP(p) == long_s_t_len
-                     && memEQ((char *) p, LONG_S_T, long_s_t_len))
+                 if (memBEGINs((char *) p, e - p, LONG_S_T))
              {
                  /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
                  Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
              {
                  /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
                  Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
@@ -3437,9 +4140,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
               * 255/256 boundary which is forbidden under /l, and so the code
               * wouldn't catch that they are equivalent (which they are only in
               * this release) */
               * 255/256 boundary which is forbidden under /l, and so the code
               * wouldn't catch that they are equivalent (which they are only in
               * this release) */
-            else if (UTF8SKIP(p) == sizeof(DOTTED_I) - 1
-                     && memEQ((char *) p, DOTTED_I, sizeof(DOTTED_I) - 1))
-            {
+            else if (memBEGINs((char *) p, e - p, DOTTED_I)) {
                  /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
                  Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
                                "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
                  /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
                  Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
                                "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
@@ -3521,7 +4222,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
       * works. */
  
      *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
       * works. */
  
      *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
-    Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
+    Copy(LATIN_SMALL_LETTER_LONG_S_UTF8   LATIN_SMALL_LETTER_LONG_S_UTF8,
          ustrp, *lenp, U8);
      return LATIN_SMALL_LETTER_LONG_S;
  
          ustrp, *lenp, U8);
      return LATIN_SMALL_LETTER_LONG_S;
  
@@ -3553,88 +4254,50 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
   */
  
  SV*
   */
  
  SV*
-Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
+Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv,
+                      I32 minbits, I32 none)
  {
  {
-    PERL_ARGS_ASSERT_SWASH_INIT;
-
      /* Returns a copy of a swash initiated by the called function.  This is the
       * public interface, and returning a copy prevents others from doing
      /* Returns a copy of a swash initiated by the called function.  This is the
       * public interface, and returning a copy prevents others from doing
-     * mischief on the original */
-
-    return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none, NULL, NULL));
-}
-
-SV*
-Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none, SV* invlist, U8* const flags_p)
-{
+     * mischief on the original.  The only remaining use of this is in tr/// */
  
      /*NOTE NOTE NOTE - If you want to use "return" in this routine you MUST
       * use the following define */
  
  
      /*NOTE NOTE NOTE - If you want to use "return" in this routine you MUST
       * use the following define */
  
-#define CORE_SWASH_INIT_RETURN(x)   \
+#define SWASH_INIT_RETURN(x)   \
      PL_curpm= old_PL_curpm;         \
      PL_curpm= old_PL_curpm;         \
-    return x
+    return newSVsv(x)
  
      /* Initialize and return a swash, creating it if necessary.  It does this
  
      /* Initialize and return a swash, creating it if necessary.  It does this
-     * by calling utf8_heavy.pl in the general case.  The returned value may be
-     * the swash's inversion list instead if the input parameters allow it.
-     * Which is returned should be immaterial to callers, as the only
-     * operations permitted on a swash, swash_fetch(), _get_swash_invlist(),
-     * and swash_to_invlist() handle both these transparently.
-     *
-     * This interface should only be used by functions that won't destroy or
-     * adversely change the swash, as doing so affects all other uses of the
-     * swash in the program; the general public should use 'Perl_swash_init'
-     * instead.
+     * by calling utf8_heavy.pl in the general case.
       *
       * pkg  is the name of the package that <name> should be in.
       *
       * pkg  is the name of the package that <name> should be in.
-     * name is the name of the swash to find.  Typically it is a Unicode
-     *     property name, including user-defined ones
+     * name is the name of the swash to find.
       * listsv is a string to initialize the swash with.  It must be of the form
       *     documented as the subroutine return value in
       *     L<perlunicode/User-Defined Character Properties>
       * minbits is the number of bits required to represent each data element.
       * listsv is a string to initialize the swash with.  It must be of the form
       *     documented as the subroutine return value in
       *     L<perlunicode/User-Defined Character Properties>
       * minbits is the number of bits required to represent each data element.
-     *     It is '1' for binary properties.
       * none I (khw) do not understand this one, but it is used only in tr///.
       * none I (khw) do not understand this one, but it is used only in tr///.
-     * invlist is an inversion list to initialize the swash with (or NULL)
-     * flags_p if non-NULL is the address of various input and output flag bits
-     *      to the routine, as follows:  ('I' means is input to the routine;
-     *      'O' means output from the routine.  Only flags marked O are
-     *      meaningful on return.)
-     *  _CORE_SWASH_INIT_USER_DEFINED_PROPERTY indicates if the swash
-     *      came from a user-defined property.  (I O)
-     *  _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking
-     *      when the swash cannot be located, to simply return NULL. (I)
-     *  _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a
-     *      return of an inversion list instead of a swash hash if this routine
-     *      thinks that would result in faster execution of swash_fetch() later
-     *      on. (I)
       *
       *
-     * Thus there are three possible inputs to find the swash: <name>,
-     * <listsv>, and <invlist>.  At least one must be specified.  The result
+     * Thus there are two possible inputs to find the swash: <name> and
+     * <listsv>.  At least one must be specified.  The result
       * will be the union of the specified ones, although <listsv>'s various
       * actions can intersect, etc. what <name> gives.  To avoid going out to
       * disk at all, <invlist> should specify completely what the swash should
       * have, and <listsv> should be &PL_sv_undef and <name> should be "".
       * will be the union of the specified ones, although <listsv>'s various
       * actions can intersect, etc. what <name> gives.  To avoid going out to
       * disk at all, <invlist> should specify completely what the swash should
       * have, and <listsv> should be &PL_sv_undef and <name> should be "".
-     *
-     * <invlist> is only valid for binary properties */
+     */
  
      PMOP *old_PL_curpm= PL_curpm; /* save away the old PL_curpm */
  
      SV* retval = &PL_sv_undef;
  
      PMOP *old_PL_curpm= PL_curpm; /* save away the old PL_curpm */
  
      SV* retval = &PL_sv_undef;
-    HV* swash_hv = NULL;
-    const int invlist_swash_boundary =
-        (flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST)
-        ? 512    /* Based on some benchmarking, but not extensive, see commit
-                    message */
-        : -1;   /* Never return just an inversion list */
  
  
-    assert(listsv != &PL_sv_undef || strNE(name, "") || invlist);
-    assert(! invlist || minbits == 1);
+    PERL_ARGS_ASSERT_SWASH_INIT;
+
+    assert(listsv != &PL_sv_undef || strNE(name, ""));
  
  
-    PL_curpm= NULL; /* reset PL_curpm so that we dont get confused between the regex
-                       that triggered the swash init and the swash init perl logic itself.
-                       See perl #122747 */
+    PL_curpm= NULL; /* reset PL_curpm so that we don't get confused between the
+                       regex that triggered the swash init and the swash init
+                       perl logic itself.  See perl #122747 */
  
      /* If data was passed in to go out to utf8_heavy to find the swash of, do
       * so */
  
      /* If data was passed in to go out to utf8_heavy to find the swash of, do
       * so */
@@ -3646,7 +4309,6 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m
         SV* errsv_save;
         GV *method;
  
         SV* errsv_save;
         GV *method;
  
-       PERL_ARGS_ASSERT__CORE_SWASH_INIT;
  
         PUSHSTACKi(PERLSI_MAGIC);
         ENTER;
  
         PUSHSTACKi(PERLSI_MAGIC);
         ENTER;
@@ -3673,8 +4335,7 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m
             SAVEBOOL(TAINT_get);
             TAINT_NOT;
  #endif
             SAVEBOOL(TAINT_get);
             TAINT_NOT;
  #endif
-           Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
-                            NULL);
+            require_pv("utf8_heavy.pl");
             {
                 /* Not ERRSV, as there is no need to vivify a scalar we are
                    about to discard. */
             {
                 /* Not ERRSV, as there is no need to vivify a scalar we are
                    about to discard. */
@@ -3719,118 +4380,10 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m
         if (IN_PERL_COMPILETIME) {
             CopHINTS_set(PL_curcop, PL_hints);
         }
         if (IN_PERL_COMPILETIME) {
             CopHINTS_set(PL_curcop, PL_hints);
         }
-       if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
-           if (SvPOK(retval)) {
-
-               /* If caller wants to handle missing properties, let them */
-               if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) {
-                    CORE_SWASH_INIT_RETURN(NULL);
-               }
-               Perl_croak(aTHX_
-                          "Can't find Unicode property definition \"%" SVf "\"",
-                          SVfARG(retval));
-                NOT_REACHED; /* NOTREACHED */
-            }
-       }
      } /* End of calling the module to find the swash */
  
      } /* End of calling the module to find the swash */
  
-    /* If this operation fetched a swash, and we will need it later, get it */
-    if (retval != &PL_sv_undef
-        && (minbits == 1 || (flags_p
-                            && ! (*flags_p
-                                  & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY))))
-    {
-        swash_hv = MUTABLE_HV(SvRV(retval));
-
-        /* If we don't already know that there is a user-defined component to
-         * this swash, and the user has indicated they wish to know if there is
-         * one (by passing <flags_p>), find out */
-        if (flags_p && ! (*flags_p & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)) {
-            SV** user_defined = hv_fetchs(swash_hv, "USER_DEFINED", FALSE);
-            if (user_defined && SvUV(*user_defined)) {
-                *flags_p |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
-            }
-        }
-    }
-
-    /* Make sure there is an inversion list for binary properties */
-    if (minbits == 1) {
-       SV** swash_invlistsvp = NULL;
-       SV* swash_invlist = NULL;
-       bool invlist_in_swash_is_valid = FALSE;
-       bool swash_invlist_unclaimed = FALSE; /* whether swash_invlist has
-                                           an unclaimed reference count */
-
-        /* If this operation fetched a swash, get its already existing
-         * inversion list, or create one for it */
-
-        if (swash_hv) {
-           swash_invlistsvp = hv_fetchs(swash_hv, "V", FALSE);
-           if (swash_invlistsvp) {
-               swash_invlist = *swash_invlistsvp;
-               invlist_in_swash_is_valid = TRUE;
-           }
-           else {
-               swash_invlist = _swash_to_invlist(retval);
-               swash_invlist_unclaimed = TRUE;
-           }
-       }
-
-       /* If an inversion list was passed in, have to include it */
-       if (invlist) {
-
-            /* Any fetched swash will by now have an inversion list in it;
-             * otherwise <swash_invlist>  will be NULL, indicating that we
-             * didn't fetch a swash */
-           if (swash_invlist) {
-
-               /* Add the passed-in inversion list, which invalidates the one
-                * already stored in the swash */
-               invlist_in_swash_is_valid = FALSE;
-                SvREADONLY_off(swash_invlist);  /* Turned on again below */
-               _invlist_union(invlist, swash_invlist, &swash_invlist);
-           }
-           else {
-
-                /* Here, there is no swash already.  Set up a minimal one, if
-                 * we are going to return a swash */
-                if ((int) _invlist_len(invlist) > invlist_swash_boundary) {
-                    swash_hv = newHV();
-                    retval = newRV_noinc(MUTABLE_SV(swash_hv));
-                }
-               swash_invlist = invlist;
-           }
-       }
-
-        /* Here, we have computed the union of all the passed-in data.  It may
-         * be that there was an inversion list in the swash which didn't get
-         * touched; otherwise save the computed one */
-       if (! invlist_in_swash_is_valid
-            && (int) _invlist_len(swash_invlist) > invlist_swash_boundary)
-        {
-           if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist))
-            {
-               Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
-           }
-           /* We just stole a reference count. */
-           if (swash_invlist_unclaimed) swash_invlist_unclaimed = FALSE;
-           else SvREFCNT_inc_simple_void_NN(swash_invlist);
-       }
-
-        /* The result is immutable.  Forbid attempts to change it. */
-        SvREADONLY_on(swash_invlist);
-
-        /* Use the inversion list stand-alone if small enough */
-        if ((int) _invlist_len(swash_invlist) <= invlist_swash_boundary) {
-           SvREFCNT_dec(retval);
-           if (!swash_invlist_unclaimed)
-               SvREFCNT_inc_simple_void_NN(swash_invlist);
-            retval = newRV_noinc(swash_invlist);
-        }
-    }
-
-    CORE_SWASH_INIT_RETURN(retval);
-#undef CORE_SWASH_INIT_RETURN
+    SWASH_INIT_RETURN(retval);
+#undef SWASH_INIT_RETURN
  }
  
  
  }
  
  
@@ -4153,41 +4706,32 @@ STATIC SV*
  S_swatch_get(pTHX_ SV* swash, UV start, UV span)
  {
      SV *swatch;
  S_swatch_get(pTHX_ SV* swash, UV start, UV span)
  {
      SV *swatch;
-    U8 *l, *lend, *x, *xend, *s, *send;
+    U8 *l, *lend, *x, *xend, *s;
      STRLEN lcur, xcur, scur;
      HV *const hv = MUTABLE_HV(SvRV(swash));
      STRLEN lcur, xcur, scur;
      HV *const hv = MUTABLE_HV(SvRV(swash));
-    SV** const invlistsvp = hv_fetchs(hv, "V", FALSE);
  
      SV** listsvp = NULL; /* The string containing the main body of the table */
      SV** extssvp = NULL;
  
      SV** listsvp = NULL; /* The string containing the main body of the table */
      SV** extssvp = NULL;
-    SV** invert_it_svp = NULL;
      U8* typestr = NULL;
      U8* typestr = NULL;
-    STRLEN bits;
+    STRLEN bits = 0;
      STRLEN octets; /* if bits == 1, then octets == 0 */
      UV  none;
      UV  end = start + span;
  
      STRLEN octets; /* if bits == 1, then octets == 0 */
      UV  none;
      UV  end = start + span;
  
-    if (invlistsvp == NULL) {
          SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
          SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
          SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
          extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
          listsvp = hv_fetchs(hv, "LIST", FALSE);
          SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
          SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
          SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
          extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
          listsvp = hv_fetchs(hv, "LIST", FALSE);
-        invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
  
         bits  = SvUV(*bitssvp);
         none  = SvUV(*nonesvp);
         typestr = (U8*)SvPV_nolen(*typesvp);
  
         bits  = SvUV(*bitssvp);
         none  = SvUV(*nonesvp);
         typestr = (U8*)SvPV_nolen(*typesvp);
-    }
-    else {
-       bits = 1;
-       none = 0;
-    }
      octets = bits >> 3; /* if bits == 1, then octets == 0 */
  
      PERL_ARGS_ASSERT_SWATCH_GET;
  
      octets = bits >> 3; /* if bits == 1, then octets == 0 */
  
      PERL_ARGS_ASSERT_SWATCH_GET;
  
-    if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
+    if (bits != 8 && bits != 16 && bits != 32) {
         Perl_croak(aTHX_ "panic: swatch_get doesn't expect bits %" UVuf,
                                                  (UV)bits);
      }
         Perl_croak(aTHX_ "panic: swatch_get doesn't expect bits %" UVuf,
                                                  (UV)bits);
      }
@@ -4227,16 +4771,11 @@ S_swatch_get(pTHX_ SV* swash, UV start, UV span)
      SvCUR_set(swatch, scur);
      s = (U8*)SvPVX(swatch);
  
      SvCUR_set(swatch, scur);
      s = (U8*)SvPVX(swatch);
  
-    if (invlistsvp) {  /* If has an inversion list set up use that */
-       _invlist_populate_swatch(*invlistsvp, start, end, s);
-        return swatch;
-    }
-
      /* read $swash->{LIST} */
      l = (U8*)SvPV(*listsvp, lcur);
      lend = l + lcur;
      while (l < lend) {
      /* read $swash->{LIST} */
      l = (U8*)SvPV(*listsvp, lcur);
      lend = l + lcur;
      while (l < lend) {
-       UV min, max, val, upper;
+       UV min = 0, max = 0, val = 0, upper;
         l = swash_scan_list_line(l, lend, &min, &max, &val,
                                                          cBOOL(octets), typestr);
         if (l > lend) {
         l = swash_scan_list_line(l, lend, &min, &max, &val,
                                                          cBOOL(octets), typestr);
         if (l > lend) {
@@ -4285,43 +4824,9 @@ S_swatch_get(pTHX_ SV* swash, UV start, UV span)
                     ++val;
             }
         }
                     ++val;
             }
         }
-       else { /* bits == 1, then val should be ignored */
-           UV key;
-           if (min < start)
-               min = start;
-
-           for (key = min; key <= upper; key++) {
-               const STRLEN offset = (STRLEN)(key - start);
-               s[offset >> 3] |= 1 << (offset & 7);
-           }
-       }
      } /* while */
  
      } /* while */
  
-    /* Invert if the data says it should be.  Assumes that bits == 1 */
-    if (invert_it_svp && SvUV(*invert_it_svp)) {
-
-       /* Unicode properties should come with all bits above PERL_UNICODE_MAX
-        * be 0, and their inversion should also be 0, as we don't succeed any
-        * Unicode property matches for non-Unicode code points */
-       if (start <= PERL_UNICODE_MAX) {
-
-           /* The code below assumes that we never cross the
-            * Unicode/above-Unicode boundary in a range, as otherwise we would
-            * have to figure out where to stop flipping the bits.  Since this
-            * boundary is divisible by a large power of 2, and swatches comes
-            * in small powers of 2, this should be a valid assumption */
-           assert(start + span - 1 <= PERL_UNICODE_MAX);
-
-           send = s + scur;
-           while (s < send) {
-               *s = ~(*s);
-               s++;
-           }
-       }
-    }
-
-    /* read $swash->{EXTRAS}
-     * This code also copied to swash_to_invlist() below */
+    /* read $swash->{EXTRAS} */
      x = (U8*)SvPV(*extssvp, xcur);
      xend = x + xcur;
      while (x < xend) {
      x = (U8*)SvPV(*extssvp, xcur);
      xend = x + xcur;
      while (x < xend) {
@@ -4377,34 +4882,7 @@ S_swatch_get(pTHX_ SV* swash, UV start, UV span)
             Perl_croak(aTHX_ "panic: swatch_get got improper swatch");
  
         s = (U8*)SvPV(swatch, slen);
             Perl_croak(aTHX_ "panic: swatch_get got improper swatch");
  
         s = (U8*)SvPV(swatch, slen);
-       if (bits == 1 && otherbits == 1) {
-           if (slen != olen)
-               Perl_croak(aTHX_ "panic: swatch_get found swatch length "
-                          "mismatch, slen=%" UVuf ", olen=%" UVuf,
-                          (UV)slen, (UV)olen);
-
-           switch (opc) {
-           case '+':
-               while (slen--)
-                   *s++ |= *o++;
-               break;
-           case '!':
-               while (slen--)
-                   *s++ |= ~*o++;
-               break;
-           case '-':
-               while (slen--)
-                   *s++ &= ~*o++;
-               break;
-           case '&':
-               while (slen--)
-                   *s++ &= *o++;
-               break;
-           default:
-               break;
-           }
-       }
-       else {
+        {
             STRLEN otheroctets = otherbits >> 3;
             STRLEN offset = 0;
             U8* const send = s + slen;
             STRLEN otheroctets = otherbits >> 3;
             STRLEN offset = 0;
             U8* const send = s + slen;
@@ -4450,577 +4928,21 @@ S_swatch_get(pTHX_ SV* swash, UV start, UV span)
                     *s++ = (U8)((otherval >>  8) & 0xff);
                     *s++ = (U8)( otherval        & 0xff);
                 }
                     *s++ = (U8)((otherval >>  8) & 0xff);
                     *s++ = (U8)( otherval        & 0xff);
                 }
-           }
+            }
         }
         sv_free(other); /* through with it! */
      } /* while */
      return swatch;
  }
  
         }
         sv_free(other); /* through with it! */
      } /* while */
      return swatch;
  }
  
-HV*
-Perl__swash_inversion_hash(pTHX_ SV* const swash)
-{
-
-   /* Subject to change or removal.  For use only in regcomp.c and regexec.c
-    * Can't be used on a property that is subject to user override, as it
-    * relies on the value of SPECIALS in the swash which would be set by
-    * utf8_heavy.pl to the hash in the non-overriden file, and hence is not set
-    * for overridden properties
-    *
-    * Returns a hash which is the inversion and closure of a swash mapping.
-    * For example, consider the input lines:
-    * 004B             006B
-    * 004C             006C
-    * 212A             006B
-    *
-    * The returned hash would have two keys, the UTF-8 for 006B and the UTF-8 for
-    * 006C.  The value for each key is an array.  For 006C, the array would
-    * have two elements, the UTF-8 for itself, and for 004C.  For 006B, there
-    * would be three elements in its array, the UTF-8 for 006B, 004B and 212A.
-    *
-    * Note that there are no elements in the hash for 004B, 004C, 212A.  The
-    * keys are only code points that are folded-to, so it isn't a full closure.
-    *
-    * Essentially, for any code point, it gives all the code points that map to
-    * it, or the list of 'froms' for that point.
-    *
-    * Currently it ignores any additions or deletions from other swashes,
-    * looking at just the main body of the swash, and if there are SPECIALS
-    * in the swash, at that hash
-    *
-    * The specials hash can be extra code points, and most likely consists of
-    * maps from single code points to multiple ones (each expressed as a string
-    * of UTF-8 characters).   This function currently returns only 1-1 mappings.
-    * However consider this possible input in the specials hash:
-    * "\xEF\xAC\x85" => "\x{0073}\x{0074}",         # U+FB05 => 0073 0074
-    * "\xEF\xAC\x86" => "\x{0073}\x{0074}",         # U+FB06 => 0073 0074
-    *
-    * Both FB05 and FB06 map to the same multi-char sequence, which we don't
-    * currently handle.  But it also means that FB05 and FB06 are equivalent in
-    * a 1-1 mapping which we should handle, and this relationship may not be in
-    * the main table.  Therefore this function examines all the multi-char
-    * sequences and adds the 1-1 mappings that come out of that.
-    *
-    * XXX This function was originally intended to be multipurpose, but its
-    * only use is quite likely to remain for constructing the inversion of
-    * the CaseFolding (//i) property.  If it were more general purpose for
-    * regex patterns, it would have to do the FB05/FB06 game for simple folds,
-    * because certain folds are prohibited under /iaa and /il.  As an example,
-    * in Unicode 3.0.1 both U+0130 and U+0131 fold to 'i', and hence are both
-    * equivalent under /i.  But under /iaa and /il, the folds to 'i' are
-    * prohibited, so we would not figure out that they fold to each other.
-    * Code could be written to automatically figure this out, similar to the
-    * code that does this for multi-character folds, but this is the only case
-    * where something like this is ever likely to happen, as all the single
-    * char folds to the 0-255 range are now quite settled.  Instead there is a
-    * little special code that is compiled only for this Unicode version.  This
-    * is smaller and didn't require much coding time to do.  But this makes
-    * this routine strongly tied to being used just for CaseFolding.  If ever
-    * it should be generalized, this would have to be fixed */
-
-    U8 *l, *lend;
-    STRLEN lcur;
-    HV *const hv = MUTABLE_HV(SvRV(swash));
-
-    /* The string containing the main body of the table.  This will have its
-     * assertion fail if the swash has been converted to its inversion list */
-    SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
-
-    SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
-    SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
-    SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
-    /*SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);*/
-    const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
-    const STRLEN bits  = SvUV(*bitssvp);
-    const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
-    const UV     none  = SvUV(*nonesvp);
-    SV **specials_p = hv_fetchs(hv, "SPECIALS", 0);
-
-    HV* ret = newHV();
-
-    PERL_ARGS_ASSERT__SWASH_INVERSION_HASH;
-
-    /* Must have at least 8 bits to get the mappings */
-    if (bits != 8 && bits != 16 && bits != 32) {
-       Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %" UVuf,
-                                                (UV)bits);
-    }
-
-    if (specials_p) { /* It might be "special" (sometimes, but not always, a
-                       mapping to more than one character */
-
-       /* Construct an inverse mapping hash for the specials */
-       HV * const specials_hv = MUTABLE_HV(SvRV(*specials_p));
-       HV * specials_inverse = newHV();
-       char *char_from; /* the lhs of the map */
-       I32 from_len;   /* its byte length */
-       char *char_to;  /* the rhs of the map */
-       I32 to_len;     /* its byte length */
-       SV *sv_to;      /* and in a sv */
-       AV* from_list;  /* list of things that map to each 'to' */
-
-       hv_iterinit(specials_hv);
-
-       /* The keys are the characters (in UTF-8) that map to the corresponding
-        * UTF-8 string value.  Iterate through the list creating the inverse
-        * list. */
-       while ((sv_to = hv_iternextsv(specials_hv, &char_from, &from_len))) {
-           SV** listp;
-           if (! SvPOK(sv_to)) {
-               Perl_croak(aTHX_ "panic: value returned from hv_iternextsv() "
-                          "unexpectedly is not a string, flags=%lu",
-                          (unsigned long)SvFLAGS(sv_to));
-           }
-           /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %" UVXf ", First char of to is %" UVXf "\n", valid_utf8_to_uvchr((U8*) char_from, 0), valid_utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/
-
-           /* Each key in the inverse list is a mapped-to value, and the key's
-            * hash value is a list of the strings (each in UTF-8) that map to
-            * it.  Those strings are all one character long */
-           if ((listp = hv_fetch(specials_inverse,
-                                   SvPVX(sv_to),
-                                   SvCUR(sv_to), 0)))
-           {
-               from_list = (AV*) *listp;
-           }
-           else { /* No entry yet for it: create one */
-               from_list = newAV();
-               if (! hv_store(specials_inverse,
-                               SvPVX(sv_to),
-                               SvCUR(sv_to),
-                               (SV*) from_list, 0))
-               {
-                   Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
-               }
-           }
-
-           /* Here have the list associated with this 'to' (perhaps newly
-            * created and empty).  Just add to it.  Note that we ASSUME that
-            * the input is guaranteed to not have duplications, so we don't
-            * check for that.  Duplications just slow down execution time. */
-           av_push(from_list, newSVpvn_utf8(char_from, from_len, TRUE));
-       }
-
-       /* Here, 'specials_inverse' contains the inverse mapping.  Go through
-        * it looking for cases like the FB05/FB06 examples above.  There would
-        * be an entry in the hash like
-       *       'st' => [ FB05, FB06 ]
-       * In this example we will create two lists that get stored in the
-       * returned hash, 'ret':
-       *       FB05 => [ FB05, FB06 ]
-       *       FB06 => [ FB05, FB06 ]
-       *
-       * Note that there is nothing to do if the array only has one element.
-       * (In the normal 1-1 case handled below, we don't have to worry about
-       * two lists, as everything gets tied to the single list that is
-       * generated for the single character 'to'.  But here, we are omitting
-       * that list, ('st' in the example), so must have multiple lists.) */
-       while ((from_list = (AV *) hv_iternextsv(specials_inverse,
-                                                &char_to, &to_len)))
-       {
-           if (av_tindex_skip_len_mg(from_list) > 0) {
-               SSize_t i;
-
-               /* We iterate over all combinations of i,j to place each code
-                * point on each list */
-               for (i = 0; i <= av_tindex_skip_len_mg(from_list); i++) {
-                   SSize_t j;
-                   AV* i_list = newAV();
-                   SV** entryp = av_fetch(from_list, i, FALSE);
-                   if (entryp == NULL) {
-                       Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
-                   }
-                   if (hv_fetch(ret, SvPVX(*entryp), SvCUR(*entryp), FALSE)) {
-                       Perl_croak(aTHX_ "panic: unexpected entry for %s", SvPVX(*entryp));
-                   }
-                   if (! hv_store(ret, SvPVX(*entryp), SvCUR(*entryp),
-                                  (SV*) i_list, FALSE))
-                   {
-                       Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
-                   }
-
-                   /* For DEBUG_U: UV u = valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/
-                   for (j = 0; j <= av_tindex_skip_len_mg(from_list); j++) {
-                       entryp = av_fetch(from_list, j, FALSE);
-                       if (entryp == NULL) {
-                           Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
-                       }
-
-                       /* When i==j this adds itself to the list */
-                       av_push(i_list, newSVuv(utf8_to_uvchr_buf(
-                                       (U8*) SvPVX(*entryp),
-                                       (U8*) SvPVX(*entryp) + SvCUR(*entryp),
-                                       0)));
-                       /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %" UVXf " to list for %" UVXf "\n", __FILE__, __LINE__, valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0), u));*/
-                   }
-               }
-           }
-       }
-       SvREFCNT_dec(specials_inverse); /* done with it */
-    } /* End of specials */
-
-    /* read $swash->{LIST} */
-
-#if    UNICODE_MAJOR_VERSION   == 3         \
-    && UNICODE_DOT_VERSION     == 0         \
-    && UNICODE_DOT_DOT_VERSION == 1
-
-    /* For this version only U+130 and U+131 are equivalent under qr//i.  Add a
-     * rule so that things work under /iaa and /il */
-
-    SV * mod_listsv = sv_mortalcopy(*listsvp);
-    sv_catpv(mod_listsv, "130\t130\t131\n");
-    l = (U8*)SvPV(mod_listsv, lcur);
-
-#else
-
-    l = (U8*)SvPV(*listsvp, lcur);
-
-#endif
-
-    lend = l + lcur;
-
-    /* Go through each input line */
-    while (l < lend) {
-       UV min, max, val;
-       UV inverse;
-       l = swash_scan_list_line(l, lend, &min, &max, &val,
-                                                     cBOOL(octets), typestr);
-       if (l > lend) {
-           break;
-       }
-
-       /* Each element in the range is to be inverted */
-       for (inverse = min; inverse <= max; inverse++) {
-           AV* list;
-           SV** listp;
-           IV i;
-           bool found_key = FALSE;
-           bool found_inverse = FALSE;
-
-           /* The key is the inverse mapping */
-           char key[UTF8_MAXBYTES+1];
-           char* key_end = (char *) uvchr_to_utf8((U8*) key, val);
-           STRLEN key_len = key_end - key;
-
-           /* Get the list for the map */
-           if ((listp = hv_fetch(ret, key, key_len, FALSE))) {
-               list = (AV*) *listp;
-           }
-           else { /* No entry yet for it: create one */
-               list = newAV();
-               if (! hv_store(ret, key, key_len, (SV*) list, FALSE)) {
-                   Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
-               }
-           }
-
-           /* Look through list to see if this inverse mapping already is
-            * listed, or if there is a mapping to itself already */
-           for (i = 0; i <= av_tindex_skip_len_mg(list); i++) {
-               SV** entryp = av_fetch(list, i, FALSE);
-               SV* entry;
-               UV uv;
-               if (entryp == NULL) {
-                   Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
-               }
-               entry = *entryp;
-               uv = SvUV(entry);
-               /*DEBUG_U(PerlIO_printf(Perl_debug_log, "list for %" UVXf " contains %" UVXf "\n", val, uv));*/
-               if (uv == val) {
-                   found_key = TRUE;
-               }
-               if (uv == inverse) {
-                   found_inverse = TRUE;
-               }
-
-               /* No need to continue searching if found everything we are
-                * looking for */
-               if (found_key && found_inverse) {
-                   break;
-               }
-           }
-
-           /* Make sure there is a mapping to itself on the list */
-           if (! found_key) {
-               av_push(list, newSVuv(val));
-               /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %" UVXf " to list for %" UVXf "\n", __FILE__, __LINE__, val, val));*/
-           }
-
-
-           /* Simply add the value to the list */
-           if (! found_inverse) {
-               av_push(list, newSVuv(inverse));
-               /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %" UVXf " to list for %" UVXf "\n", __FILE__, __LINE__, inverse, val));*/
-           }
-
-           /* swatch_get() increments the value of val for each element in the
-            * range.  That makes more compact tables possible.  You can
-            * express the capitalization, for example, of all consecutive
-            * letters with a single line: 0061\t007A\t0041 This maps 0061 to
-            * 0041, 0062 to 0042, etc.  I (khw) have never understood 'none',
-            * and it's not documented; it appears to be used only in
-            * implementing tr//; I copied the semantics from swatch_get(), just
-            * in case */
-           if (!none || val < none) {
-               ++val;
-           }
-       }
-    }
-
-    return ret;
-}
-
-SV*
-Perl__swash_to_invlist(pTHX_ SV* const swash)
-{
-
-   /* Subject to change or removal.  For use only in one place in regcomp.c.
-    * Ownership is given to one reference count in the returned SV* */
-
-    U8 *l, *lend;
-    char *loc;
-    STRLEN lcur;
-    HV *const hv = MUTABLE_HV(SvRV(swash));
-    UV elements = 0;    /* Number of elements in the inversion list */
-    U8 empty[] = "";
-    SV** listsvp;
-    SV** typesvp;
-    SV** bitssvp;
-    SV** extssvp;
-    SV** invert_it_svp;
-
-    U8* typestr;
-    STRLEN bits;
-    STRLEN octets; /* if bits == 1, then octets == 0 */
-    U8 *x, *xend;
-    STRLEN xcur;
-
-    SV* invlist;
-
-    PERL_ARGS_ASSERT__SWASH_TO_INVLIST;
-
-    /* If not a hash, it must be the swash's inversion list instead */
-    if (SvTYPE(hv) != SVt_PVHV) {
-        return SvREFCNT_inc_simple_NN((SV*) hv);
-    }
-
-    /* The string containing the main body of the table */
-    listsvp = hv_fetchs(hv, "LIST", FALSE);
-    typesvp = hv_fetchs(hv, "TYPE", FALSE);
-    bitssvp = hv_fetchs(hv, "BITS", FALSE);
-    extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
-    invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
-
-    typestr = (U8*)SvPV_nolen(*typesvp);
-    bits  = SvUV(*bitssvp);
-    octets = bits >> 3; /* if bits == 1, then octets == 0 */
-
-    /* read $swash->{LIST} */
-    if (SvPOK(*listsvp)) {
-       l = (U8*)SvPV(*listsvp, lcur);
-    }
-    else {
-       /* LIST legitimately doesn't contain a string during compilation phases
-        * of Perl itself, before the Unicode tables are generated.  In this
-        * case, just fake things up by creating an empty list */
-       l = empty;
-       lcur = 0;
-    }
-    loc = (char *) l;
-    lend = l + lcur;
-
-    if (*l == 'V') {    /*  Inversion list format */
-        const char *after_atou = (char *) lend;
-        UV element0;
-        UV* other_elements_ptr;
-
-        /* The first number is a count of the rest */
-        l++;
-        if (!grok_atoUV((const char *)l, &elements, &after_atou)) {
-            Perl_croak(aTHX_ "panic: Expecting a valid count of elements at start of inversion list");
-        }
-        if (elements == 0) {
-            invlist = _new_invlist(0);
-        }
-        else {
-            l = (U8 *) after_atou;
-
-            /* Get the 0th element, which is needed to setup the inversion list */
-            while (isSPACE(*l)) l++;
-            if (!grok_atoUV((const char *)l, &element0, &after_atou)) {
-                Perl_croak(aTHX_ "panic: Expecting a valid 0th element for inversion list");
-            }
-            l = (U8 *) after_atou;
-            invlist = _setup_canned_invlist(elements, element0, &other_elements_ptr);
-            elements--;
-
-            /* Then just populate the rest of the input */
-            while (elements-- > 0) {
-                if (l > lend) {
-                    Perl_croak(aTHX_ "panic: Expecting %" UVuf " more elements than available", elements);
-                }
-                while (isSPACE(*l)) l++;
-                if (!grok_atoUV((const char *)l, other_elements_ptr++, &after_atou)) {
-                    Perl_croak(aTHX_ "panic: Expecting a valid element in inversion list");
-                }
-                l = (U8 *) after_atou;
-            }
-        }
-    }
-    else {
-
-        /* Scan the input to count the number of lines to preallocate array
-         * size based on worst possible case, which is each line in the input
-         * creates 2 elements in the inversion list: 1) the beginning of a
-         * range in the list; 2) the beginning of a range not in the list.  */
-        while ((loc = (strchr(loc, '\n'))) != NULL) {
-            elements += 2;
-            loc++;
-        }
-
-        /* If the ending is somehow corrupt and isn't a new line, add another
-         * element for the final range that isn't in the inversion list */
-        if (! (*lend == '\n'
-            || (*lend == '\0' && (lcur == 0 || *(lend - 1) == '\n'))))
-        {
-            elements++;
-        }
-
-        invlist = _new_invlist(elements);
-
-        /* Now go through the input again, adding each range to the list */
-        while (l < lend) {
-            UV start, end;
-            UV val;            /* Not used by this function */
-
-            l = swash_scan_list_line(l, lend, &start, &end, &val,
-                                                        cBOOL(octets), typestr);
-
-            if (l > lend) {
-                break;
-            }
-
-            invlist = _add_range_to_invlist(invlist, start, end);
-        }
-    }
-
-    /* Invert if the data says it should be */
-    if (invert_it_svp && SvUV(*invert_it_svp)) {
-       _invlist_invert(invlist);
-    }
-
-    /* This code is copied from swatch_get()
-     * read $swash->{EXTRAS} */
-    x = (U8*)SvPV(*extssvp, xcur);
-    xend = x + xcur;
-    while (x < xend) {
-       STRLEN namelen;
-       U8 *namestr;
-       SV** othersvp;
-       HV* otherhv;
-       STRLEN otherbits;
-       SV **otherbitssvp, *other;
-       U8 *nl;
-
-       const U8 opc = *x++;
-       if (opc == '\n')
-           continue;
-
-       nl = (U8*)memchr(x, '\n', xend - x);
-
-       if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
-           if (nl) {
-               x = nl + 1; /* 1 is length of "\n" */
-               continue;
-           }
-           else {
-               x = xend; /* to EXTRAS' end at which \n is not found */
-               break;
-           }
-       }
-
-       namestr = x;
-       if (nl) {
-           namelen = nl - namestr;
-           x = nl + 1;
-       }
-       else {
-           namelen = xend - namestr;
-           x = xend;
-       }
-
-       othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
-       otherhv = MUTABLE_HV(SvRV(*othersvp));
-       otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
-       otherbits = (STRLEN)SvUV(*otherbitssvp);
-
-       if (bits != otherbits || bits != 1) {
-           Perl_croak(aTHX_ "panic: _swash_to_invlist only operates on boolean "
-                      "properties, bits=%" UVuf ", otherbits=%" UVuf,
-                      (UV)bits, (UV)otherbits);
-       }
-
-       /* The "other" swatch must be destroyed after. */
-       other = _swash_to_invlist((SV *)*othersvp);
-
-       /* End of code copied from swatch_get() */
-       switch (opc) {
-       case '+':
-           _invlist_union(invlist, other, &invlist);
-           break;
-       case '!':
-            _invlist_union_maybe_complement_2nd(invlist, other, TRUE, &invlist);
-           break;
-       case '-':
-           _invlist_subtract(invlist, other, &invlist);
-           break;
-       case '&':
-           _invlist_intersection(invlist, other, &invlist);
-           break;
-       default:
-           break;
-       }
-       sv_free(other); /* through with it! */
-    }
-
-    SvREADONLY_on(invlist);
-    return invlist;
-}
-
-SV*
-Perl__get_swash_invlist(pTHX_ SV* const swash)
-{
-    SV** ptr;
-
-    PERL_ARGS_ASSERT__GET_SWASH_INVLIST;
-
-    if (! SvROK(swash)) {
-        return NULL;
-    }
-
-    /* If it really isn't a hash, it isn't really swash; must be an inversion
-     * list */
-    if (SvTYPE(SvRV(swash)) != SVt_PVHV) {
-        return SvRV(swash);
-    }
-
-    ptr = hv_fetchs(MUTABLE_HV(SvRV(swash)), "V", FALSE);
-    if (! ptr) {
-        return NULL;
-    }
-
-    return *ptr;
-}
-
  bool
  Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
  {
      /* May change: warns if surrogates, non-character code points, or
  bool
  Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
  {
      /* May change: warns if surrogates, non-character code points, or
-     * non-Unicode code points are in s which has length len bytes.  Returns
-     * TRUE if none found; FALSE otherwise.  The only other validity check is
-     * to make sure that this won't exceed the string's length.
-     *
-     * Code points above the platform's C<IV_MAX> will raise a deprecation
-     * warning, unless those are turned off.  */
+     * non-Unicode code points are in 's' which has length 'len' bytes.
+     * Returns TRUE if none found; FALSE otherwise.  The only other validity
+     * checks are that a character doesn't extend past the end of the string
+     * and that its code point doesn't overflow */
  
      const U8* const e = s + len;
      bool ok = TRUE;
  
      const U8* const e = s + len;
      bool ok = TRUE;
@@ -5036,22 +4958,10 @@ Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
         if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) {
             if (UNLIKELY(UTF8_IS_SUPER(s, e))) {
                  if (   ckWARN_d(WARN_NON_UNICODE)
         if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) {
             if (UNLIKELY(UTF8_IS_SUPER(s, e))) {
                  if (   ckWARN_d(WARN_NON_UNICODE)
-                    || (   ckWARN_d(WARN_DEPRECATED)
-#ifndef UV_IS_QUAD
-                        && UNLIKELY(is_utf8_cp_above_31_bits(s, e))
-#else   /* Below is 64-bit words */
-                        /* 2**63 and up meet these conditions provided we have
-                         * a 64-bit word. */
-#   ifdef EBCDIC
-                        && *s == 0xFE
-                        && NATIVE_UTF8_TO_I8(s[1]) >= 0xA8
-#   else
-                        && *s == 0xFF
-                           /* s[1] being above 0x80 overflows */
-                        && s[2] >= 0x88
-#   endif
-#endif
-                )) {
+                    || UNLIKELY(0 < does_utf8_overflow(s, s + len,
+                                               0 /* Don't consider overlongs */
+                                               )))
+                {
                      /* A side effect of this function will be to warn */
                      (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_SUPER);
                      ok = FALSE;
                      /* A side effect of this function will be to warn */
                      (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_SUPER);
                      ok = FALSE;
@@ -5064,11 +4974,14 @@ Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
                       * do for the non-chars and above-unicodes */
                     UV uv = utf8_to_uvchr_buf(s, e, NULL);
                     Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
                       * do for the non-chars and above-unicodes */
                     UV uv = utf8_to_uvchr_buf(s, e, NULL);
                     Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
-                       "Unicode surrogate U+%04" UVXf " is illegal in UTF-8", uv);
+                       "Unicode surrogate U+%04" UVXf " is illegal in UTF-8",
+                                             uv);
                     ok = FALSE;
                 }
             }
                     ok = FALSE;
                 }
             }
-           else if (UNLIKELY(UTF8_IS_NONCHAR(s, e)) && (ckWARN_d(WARN_NONCHAR))) {
+           else if (   UNLIKELY(UTF8_IS_NONCHAR(s, e))
+                     && (ckWARN_d(WARN_NONCHAR)))
+            {
                  /* A side effect of this function will be to warn */
                  (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_NONCHAR);
                 ok = FALSE;
                  /* A side effect of this function will be to warn */
                  (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_NONCHAR);
                 ok = FALSE;
@@ -5100,7 +5013,8 @@ See also L</sv_uni_display>.
  
  =cut */
  char *
  
  =cut */
  char *
-Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
+Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim,
+                          UV flags)
  {
      int truncated = 0;
      const char *s, *e;
  {
      int truncated = 0;
      const char *s, *e;
@@ -5189,28 +5103,28 @@ Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
  /*
  =for apidoc foldEQ_utf8
  
  /*
  =for apidoc foldEQ_utf8
  
-Returns true if the leading portions of the strings C<s1> and C<s2> (either or both
-of which may be in UTF-8) are the same case-insensitively; false otherwise.
-How far into the strings to compare is determined by other input parameters.
+Returns true if the leading portions of the strings C<s1> and C<s2> (either or
+both of which may be in UTF-8) are the same case-insensitively; false
+otherwise.  How far into the strings to compare is determined by other input
+parameters.
  
  If C<u1> is true, the string C<s1> is assumed to be in UTF-8-encoded Unicode;
  
  If C<u1> is true, the string C<s1> is assumed to be in UTF-8-encoded Unicode;
-otherwise it is assumed to be in native 8-bit encoding.  Correspondingly for C<u2>
-with respect to C<s2>.
-
-If the byte length C<l1> is non-zero, it says how far into C<s1> to check for fold
-equality.  In other words, C<s1>+C<l1> will be used as a goal to reach.  The
-scan will not be considered to be a match unless the goal is reached, and
-scanning won't continue past that goal.  Correspondingly for C<l2> with respect to
-C<s2>.
-
-If C<pe1> is non-C<NULL> and the pointer it points to is not C<NULL>, that pointer is
-considered an end pointer to the position 1 byte past the maximum point
-in C<s1> beyond which scanning will not continue under any circumstances.
+otherwise it is assumed to be in native 8-bit encoding.  Correspondingly for
+C<u2> with respect to C<s2>.
+
+If the byte length C<l1> is non-zero, it says how far into C<s1> to check for
+fold equality.  In other words, C<s1>+C<l1> will be used as a goal to reach.
+The scan will not be considered to be a match unless the goal is reached, and
+scanning won't continue past that goal.  Correspondingly for C<l2> with respect
+to C<s2>.
+
+If C<pe1> is non-C<NULL> and the pointer it points to is not C<NULL>, that
+pointer is considered an end pointer to the position 1 byte past the maximum
+point in C<s1> beyond which scanning will not continue under any circumstances.
  (This routine assumes that UTF-8 encoded input strings are not malformed;
  (This routine assumes that UTF-8 encoded input strings are not malformed;
-malformed input can cause it to read past C<pe1>).
-This means that if both C<l1> and C<pe1> are specified, and C<pe1>
-is less than C<s1>+C<l1>, the match will never be successful because it can
-never
+malformed input can cause it to read past C<pe1>).  This means that if both
+C<l1> and C<pe1> are specified, and C<pe1> is less than C<s1>+C<l1>, the match
+will never be successful because it can never
  get as far as its goal (and in fact is asserted against).  Correspondingly for
  C<pe2> with respect to C<s2>.
  
  get as far as its goal (and in fact is asserted against).  Correspondingly for
  C<pe2> with respect to C<s2>.
  
@@ -5248,11 +5162,19 @@ L<http://www.unicode.org/unicode/reports/tr21/> (Case Mappings).
   *                          that effect.  However, if the caller knows what
   *                          it's doing, it can pass this flag to indicate that,
   *                          and the assertion is skipped.
   *                          that effect.  However, if the caller knows what
   *                          it's doing, it can pass this flag to indicate that,
   *                          and the assertion is skipped.
- *  FOLDEQ_S2_ALREADY_FOLDED  Similarly.
+ *  FOLDEQ_S2_ALREADY_FOLDED  Similar to FOLDEQ_S1_ALREADY_FOLDED, but applies
+ *                          to s2, and s2 doesn't have to be UTF-8 encoded.
+ *                          This introduces an asymmetry to save a few branches
+ *                          in a loop.  Currently, this is not a problem, as
+ *                          the two inputs are never both pre-folded.  Simply
+ *                          call this function with the pre-folded one as the
+ *                          second string.
   *  FOLDEQ_S2_FOLDS_SANE
   */
  I32
   *  FOLDEQ_S2_FOLDS_SANE
   */
  I32
-Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const char *s2, char **pe2, UV l2, bool u2, U32 flags)
+Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1,
+                             const char *s2, char **pe2, UV l2, bool u2,
+                             U32 flags)
  {
      const U8 *p1  = (const U8*)s1; /* Point to current char */
      const U8 *p2  = (const U8*)s2;
  {
      const U8 *p1  = (const U8*)s1; /* Point to current char */
      const U8 *p2  = (const U8*)s2;
@@ -5269,11 +5191,11 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c
  
      PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
  
  
      PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
  
-    assert( ! ((flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE))
-               && (((flags & FOLDEQ_S1_ALREADY_FOLDED)
-                     && !(flags & FOLDEQ_S1_FOLDS_SANE))
-                   || ((flags & FOLDEQ_S2_ALREADY_FOLDED)
-                       && !(flags & FOLDEQ_S2_FOLDS_SANE)))));
+    assert( ! (             (flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE))
+               && ((        (flags &  FOLDEQ_S1_ALREADY_FOLDED)
+                        && !(flags &  FOLDEQ_S1_FOLDS_SANE))
+                    || (    (flags &  FOLDEQ_S2_ALREADY_FOLDED)
+                        && !(flags &  FOLDEQ_S2_FOLDS_SANE)))));
      /* The algorithm is to trial the folds without regard to the flags on
       * the first line of the above assert(), and then see if the result
       * violates them.  This means that the inputs can't be pre-folded to a
      /* The algorithm is to trial the folds without regard to the flags on
       * the first line of the above assert(), and then see if the result
       * violates them.  This means that the inputs can't be pre-folded to a
@@ -5287,12 +5209,20 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c
  
      if (flags & FOLDEQ_LOCALE) {
          if (IN_UTF8_CTYPE_LOCALE) {
  
      if (flags & FOLDEQ_LOCALE) {
          if (IN_UTF8_CTYPE_LOCALE) {
-            flags &= ~FOLDEQ_LOCALE;
+            if (UNLIKELY(PL_in_utf8_turkic_locale)) {
+                flags_for_folder |= FOLD_FLAGS_LOCALE;
+            }
+            else {
+                flags &= ~FOLDEQ_LOCALE;
+            }
          }
          else {
              flags_for_folder |= FOLD_FLAGS_LOCALE;
          }
      }
          }
          else {
              flags_for_folder |= FOLD_FLAGS_LOCALE;
          }
      }
+    if (flags & FOLDEQ_UTF8_NOMIX_ASCII) {
+        flags_for_folder |= FOLD_FLAGS_NOMIX_ASCII;
+    }
  
      if (pe1) {
          e1 = *(U8**)pe1;
  
      if (pe1) {
          e1 = *(U8**)pe1;
@@ -5377,9 +5307,23 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c
  
          if (n2 == 0) {    /* Same for s2 */
             if (flags & FOLDEQ_S2_ALREADY_FOLDED) {
  
          if (n2 == 0) {    /* Same for s2 */
             if (flags & FOLDEQ_S2_ALREADY_FOLDED) {
-               f2 = (U8 *) p2;
-                assert(u2);
-               n2 = UTF8SKIP(f2);
+
+                /* Point to the already-folded character.  But for non-UTF-8
+                 * variants, convert to UTF-8 for the algorithm below */
+               if (UTF8_IS_INVARIANT(*p2)) {
+                    f2 = (U8 *) p2;
+                    n2 = 1;
+                }
+                else if (u2) {
+                    f2 = (U8 *) p2;
+                    n2 = UTF8SKIP(f2);
+                }
+                else {
+                    foldbuf2[0] = UTF8_EIGHT_BIT_HI(*p2);
+                    foldbuf2[1] = UTF8_EIGHT_BIT_LO(*p2);
+                    f2 = foldbuf2;
+                    n2 = 2;
+                }
             }
             else {
                  if (isASCII(*p2) && ! (flags & FOLDEQ_LOCALE)) {
             }
             else {
                  if (isASCII(*p2) && ! (flags & FOLDEQ_LOCALE)) {
@@ -5459,7 +5403,7 @@ Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
  {
      PERL_ARGS_ASSERT_UVUNI_TO_UTF8;
  
  {
      PERL_ARGS_ASSERT_UVUNI_TO_UTF8;
  
-    return Perl_uvoffuni_to_utf8_flags(aTHX_ d, uv, 0);
+    return uvoffuni_to_utf8_flags(d, uv, 0);
  }
  
  /*
  }
  
  /*
@@ -5511,5 +5455,57 @@ Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  }
  
  /*
  }
  
  /*
+=for apidoc utf8_to_uvchr
+
+Returns the native code point of the first character in the string C<s>
+which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
+length, in bytes, of that character.
+
+Some, but not all, UTF-8 malformations are detected, and in fact, some
+malformed input could cause reading beyond the end of the input buffer, which
+is why this function is deprecated.  Use L</utf8_to_uvchr_buf> instead.
+
+If C<s> points to one of the detected malformations, and UTF8 warnings are
+enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
+C<NULL>) to -1.  If those warnings are off, the computed value, if
+well-defined (or the Unicode REPLACEMENT CHARACTER, if not), is silently
+returned, and C<*retlen> is set (if C<retlen> isn't C<NULL>) so that
+(S<C<s> + C<*retlen>>) is the next possible position in C<s> that could begin
+a non-malformed character.  See L</utf8n_to_uvchr> for details on when the
+REPLACEMENT CHARACTER is returned.
+
+=cut
+*/
+
+UV
+Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
+{
+    PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
+
+    /* Returns the code point of the first character in 's'.  On the fast
+     * path below, '*retlen' (when 'retlen' is non-NULL) is set to 1;
+     * otherwise 'retlen' is simply passed through to utf8_to_uvchr_buf().
+     *
+     * This function is unsafe if malformed UTF-8 input is given it, which is
+     * why the function is deprecated.  If the first byte of the input
+     * indicates that there are more bytes remaining in the sequence that forms
+     * the character than there are in the input buffer, it can read past the
+     * end.  But we can make it safe if the input string happens to be
+     * NUL-terminated, as many strings in Perl are, by refusing to read past a
+     * NUL.  A NUL indicates the start of the next character anyway.  If the
+     * input isn't NUL-terminated, the function remains unsafe, as it always
+     * has been.
+     *
+     * An initial NUL has to be handled separately, but all ASCIIs can be
+     * handled the same way, speeding up this common case.
+     *
+     * For any other start byte, the end pointer handed to
+     * utf8_to_uvchr_buf() is 's' plus my_strnlen() limited to UTF8SKIP(s)
+     * bytes.  Presumably my_strnlen() behaves like strnlen() (TODO confirm),
+     * in which case the end is the length the start byte claims or the
+     * distance to the first NUL, whichever is smaller. */
+
+    if (UTF8_IS_INVARIANT(*s)) {  /* Assumes 's' contains at least 1 byte */
+        if (retlen) {
+            *retlen = 1;
+        }
+        return (UV) *s;
+    }
+
+    return utf8_to_uvchr_buf(s,
+                             s + my_strnlen((char *) s, UTF8SKIP(s)),
+                             retlen);
+}
+
+/*
   * ex: set ts=8 sts=4 sw=4 et:
   */
   * ex: set ts=8 sts=4 sw=4 et:
   */