PATCH: [perl #130010] a5540cf breaks texinfo

[perl5.git] / utf8.c
diff --git a/utf8.c b/utf8.c

index 184ed31..4dbefe5 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -31,10 +31,15 @@
  #include "EXTERN.h"
  #define PERL_IN_UTF8_C
  #include "perl.h"
  #include "EXTERN.h"
  #define PERL_IN_UTF8_C
  #include "perl.h"
-#include "inline_invlist.c"
+#include "invlist_inline.h"
  
  
+static const char malformed_text[] = "Malformed UTF-8 character";
  static const char unees[] =
  static const char unees[] =
-    "Malformed UTF-8 character (unexpected end of string)";
+                        "Malformed UTF-8 character (unexpected end of string)";
+static const char cp_above_legal_max[] =
+ "Use of code point 0x%"UVXf" is deprecated; the permissible max is 0x%"UVXf"";
+
+#define MAX_NON_DEPRECATED_CP ((UV) (IV_MAX))
  
  /*
  =head1 Unicode Support
  
  /*
  =head1 Unicode Support
@@ -48,42 +53,6 @@ within non-zero characters.
  */
  
  /*
  */
  
  /*
-=for apidoc is_invariant_string
-
-Returns true iff the first C<len> bytes of the string C<s> are the same
-regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
-EBCDIC machines).  That is, if they are UTF-8 invariant.  On ASCII-ish
-machines, all the ASCII characters and only the ASCII characters fit this
-definition.  On EBCDIC machines, the ASCII-range characters are invariant, but
-so also are the C1 controls and C<\c?> (which isn't in the ASCII range on
-EBCDIC).
-
-If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
-use this option, that C<s> can't have embedded C<NUL> characters and has to
-have a terminating C<NUL> byte).
-
-See also L</is_utf8_string>(), L</is_utf8_string_loclen>(), and L</is_utf8_string_loc>().
-
-=cut
-*/
-
-bool
-Perl_is_invariant_string(const U8 *s, STRLEN len)
-{
-    const U8* const send = s + (len ? len : strlen((const char *)s));
-    const U8* x = s;
-
-    PERL_ARGS_ASSERT_IS_INVARIANT_STRING;
-
-    for (; x < send; ++x) {
-       if (!UTF8_IS_INVARIANT(*x))
-           break;
-    }
-
-    return x == send;
-}
-
-/*
  =for apidoc uvoffuni_to_utf8_flags
  
  THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
  =for apidoc uvoffuni_to_utf8_flags
  
  THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
@@ -94,153 +63,172 @@ This function is like them, but the input is a strict Unicode
  (as opposed to native) code point.  Only in very rare circumstances should code
  not be using the native code point.
  
  (as opposed to native) code point.  Only in very rare circumstances should code
  not be using the native code point.
  
-For details, see the description for L</uvchr_to_utf8_flags>>.
+For details, see the description for L</uvchr_to_utf8_flags>.
  
  =cut
  */
  
  
  =cut
  */
  
+#define HANDLE_UNICODE_SURROGATE(uv, flags)                         \
+    STMT_START {                                                    \
+        if (flags & UNICODE_WARN_SURROGATE) {                       \
+            Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE),        \
+                                "UTF-16 surrogate U+%04"UVXf, uv);  \
+        }                                                           \
+        if (flags & UNICODE_DISALLOW_SURROGATE) {                   \
+            return NULL;                                            \
+        }                                                           \
+    } STMT_END;
+
+#define HANDLE_UNICODE_NONCHAR(uv, flags)                           \
+    STMT_START {                                                    \
+        if (flags & UNICODE_WARN_NONCHAR) {                         \
+            Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR),          \
+                "Unicode non-character U+%04"UVXf" is not "        \
+                 "recommended for open interchange", uv);           \
+        }                                                           \
+        if (flags & UNICODE_DISALLOW_NONCHAR) {                     \
+            return NULL;                                            \
+        }                                                           \
+    } STMT_END;
+
+/*  Use shorter names internally in this file */
+#define SHIFT   UTF_ACCUMULATION_SHIFT
+#undef  MARK
+#define MARK    UTF_CONTINUATION_MARK
+#define MASK    UTF_CONTINUATION_MASK
+
  U8 *
  Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  {
      PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
  
  U8 *
  Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  {
      PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
  
-    if (UNI_IS_INVARIANT(uv)) {
-       *d++ = (U8) LATIN1_TO_NATIVE(uv);
+    if (OFFUNI_IS_INVARIANT(uv)) {
+       *d++ = LATIN1_TO_NATIVE(uv);
         return d;
      }
  
         return d;
      }
  
-#ifdef EBCDIC
-    /* Not representable in UTF-EBCDIC */
-    flags |= UNICODE_DISALLOW_FE_FF;
+    if (uv <= MAX_UTF8_TWO_BYTE) {
+        *d++ = I8_TO_NATIVE_UTF8(( uv >> SHIFT) | UTF_START_MARK(2));
+        *d++ = I8_TO_NATIVE_UTF8(( uv           & MASK) |   MARK);
+        return d;
+    }
+
+    /* Not 2-byte; test for and handle 3-byte result.   In the test immediately
+     * below, the 16 is for start bytes E0-EF (which are all the possible ones
+     * for 3 byte characters).  The 2 is for 2 continuation bytes; these each
+     * contribute SHIFT bits.  This yields 0x4000 on EBCDIC platforms, 0x1_0000
+     * on ASCII; so 3 bytes covers the range 0x400-0x3FFF on EBCDIC;
+     * 0x800-0xFFFF on ASCII */
+    if (uv < (16 * (1U << (2 * SHIFT)))) {
+       *d++ = I8_TO_NATIVE_UTF8(( uv >> ((3 - 1) * SHIFT)) | UTF_START_MARK(3));
+       *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) |   MARK);
+       *d++ = I8_TO_NATIVE_UTF8(( uv  /* (1 - 1) */        & MASK) |   MARK);
+
+#ifndef EBCDIC  /* These problematic code points are 4 bytes on EBCDIC, so
+                   aren't tested here */
+        /* The most likely code points in this range are below the surrogates.
+         * Do an extra test to quickly exclude those. */
+        if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST)) {
+            if (UNLIKELY(   UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)
+                         || UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
+            {
+                HANDLE_UNICODE_NONCHAR(uv, flags);
+            }
+            else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
+                HANDLE_UNICODE_SURROGATE(uv, flags);
+            }
+        }
  #endif
  #endif
+       return d;
+    }
  
  
-    /* The first problematic code point is the first surrogate */
-    if (uv >= UNICODE_SURROGATE_FIRST
-        && ckWARN3_d(WARN_SURROGATE, WARN_NON_UNICODE, WARN_NONCHAR))
-    {
-       if (UNICODE_IS_SURROGATE(uv)) {
-           if (flags & UNICODE_WARN_SURROGATE) {
-               Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE),
-                                           "UTF-16 surrogate U+%04"UVXf, uv);
-           }
-           if (flags & UNICODE_DISALLOW_SURROGATE) {
-               return NULL;
-           }
-       }
-       else if (UNICODE_IS_SUPER(uv)) {
-           if (flags & UNICODE_WARN_SUPER
-               || (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_WARN_FE_FF)))
-           {
-               Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
-                         "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
-           }
-           if (flags & UNICODE_DISALLOW_SUPER
-               || (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_DISALLOW_FE_FF)))
-           {
-#ifdef EBCDIC
-                Perl_die(aTHX_ "Can't represent character for Ox%"UVXf" on this platform", uv);
-                NOT_REACHED; /* NOTREACHED */
+    /* Not 3-byte; that means the code point is at least 0x1_0000 on ASCII
+     * platforms, and 0x4000 on EBCDIC.  There are problematic cases that can
+     * happen starting with 4-byte characters on ASCII platforms.  We unify the
+     * code for these with EBCDIC, even though some of them require 5-bytes on
+     * those, because khw believes the code saving is worth the very slight
+     * performance hit on these high EBCDIC code points. */
+
+    if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
+        if (   UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
+            && ckWARN_d(WARN_DEPRECATED))
+        {
+            Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
+                        cp_above_legal_max, uv, MAX_NON_DEPRECATED_CP);
+        }
+        if (   (flags & UNICODE_WARN_SUPER)
+            || (   UNICODE_IS_ABOVE_31_BIT(uv)
+                && (flags & UNICODE_WARN_ABOVE_31_BIT)))
+        {
+            Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
+
+              /* Choose the more dire applicable warning */
+              (UNICODE_IS_ABOVE_31_BIT(uv))
+              ? "Code point 0x%"UVXf" is not Unicode, and not portable"
+              : "Code point 0x%"UVXf" is not Unicode, may not be portable",
+             uv);
+        }
+        if (flags & UNICODE_DISALLOW_SUPER
+            || (   UNICODE_IS_ABOVE_31_BIT(uv)
+                && (flags & UNICODE_DISALLOW_ABOVE_31_BIT)))
+        {
+            return NULL;
+        }
+    }
+    else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) {
+        HANDLE_UNICODE_NONCHAR(uv, flags);
+    }
+
+    /* Test for and handle 4-byte result.   In the test immediately below, the
+     * 8 is for start bytes F0-F7 (which are all the possible ones for 4 byte
+     * characters).  The 3 is for 3 continuation bytes; these each contribute
+     * SHIFT bits.  This yields 0x4_0000 on EBCDIC platforms, 0x20_0000 on
+     * ASCII, so 4 bytes covers the range 0x4000-0x3_FFFF on EBCDIC;
+     * 0x1_0000-0x1F_FFFF on ASCII */
+    if (uv < (8 * (1U << (3 * SHIFT)))) {
+       *d++ = I8_TO_NATIVE_UTF8(( uv >> ((4 - 1) * SHIFT)) | UTF_START_MARK(4));
+       *d++ = I8_TO_NATIVE_UTF8(((uv >> ((3 - 1) * SHIFT)) & MASK) |   MARK);
+       *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) |   MARK);
+       *d++ = I8_TO_NATIVE_UTF8(( uv  /* (1 - 1) */        & MASK) |   MARK);
+
+#ifdef EBCDIC   /* These were handled on ASCII platforms in the code for 3-byte
+                   characters.  The end-plane non-characters for EBCDIC were
+                   handled just above */
+        if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv))) {
+            HANDLE_UNICODE_NONCHAR(uv, flags);
+        }
+        else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
+            HANDLE_UNICODE_SURROGATE(uv, flags);
+        }
  #endif
  #endif
-               return NULL;
-           }
-       }
-       else if (UNICODE_IS_NONCHAR(uv)) {
-           if (flags & UNICODE_WARN_NONCHAR) {
-               Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR),
-                "Unicode non-character U+%04"UVXf" is illegal for open interchange",
-                uv);
-           }
-           if (flags & UNICODE_DISALLOW_NONCHAR) {
-               return NULL;
-           }
-       }
+
+       return d;
      }
  
      }
  
-#if defined(EBCDIC)
+    /* Not 4-byte; that means the code point is at least 0x20_0000 on ASCII
+     * platforms, and 0x4000 on EBCDIC.  At this point we switch to a loop
+     * format.  The unrolled version above turns out to not save all that much
+     * time, and at these high code points (well above the legal Unicode range
+     * on ASCII platforms, and well above anything in common use in EBCDIC),
+     * khw believes that less code outweighs slight performance gains. */
+
      {
         STRLEN len  = OFFUNISKIP(uv);
         U8 *p = d+len-1;
         while (p > d) {
      {
         STRLEN len  = OFFUNISKIP(uv);
         U8 *p = d+len-1;
         while (p > d) {
-           *p-- = (U8) I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
+           *p-- = I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
             uv >>= UTF_ACCUMULATION_SHIFT;
         }
             uv >>= UTF_ACCUMULATION_SHIFT;
         }
-       *p = (U8) I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
+       *p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
         return d+len;
      }
         return d+len;
      }
-#else /* Non loop style */
-    if (uv < 0x800) {
-       *d++ = (U8)(( uv >>  6)         | 0xc0);
-       *d++ = (U8)(( uv        & 0x3f) | 0x80);
-       return d;
-    }
-    if (uv < 0x10000) {
-       *d++ = (U8)(( uv >> 12)         | 0xe0);
-       *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
-       *d++ = (U8)(( uv        & 0x3f) | 0x80);
-       return d;
-    }
-    if (uv < 0x200000) {
-       *d++ = (U8)(( uv >> 18)         | 0xf0);
-       *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
-       *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
-       *d++ = (U8)(( uv        & 0x3f) | 0x80);
-       return d;
-    }
-    if (uv < 0x4000000) {
-       *d++ = (U8)(( uv >> 24)         | 0xf8);
-       *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
-       *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
-       *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
-       *d++ = (U8)(( uv        & 0x3f) | 0x80);
-       return d;
-    }
-    if (uv < 0x80000000) {
-       *d++ = (U8)(( uv >> 30)         | 0xfc);
-       *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
-       *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
-       *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
-       *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
-       *d++ = (U8)(( uv        & 0x3f) | 0x80);
-       return d;
-    }
-#ifdef UTF8_QUAD_MAX
-    if (uv < UTF8_QUAD_MAX)
-#endif
-    {
-       *d++ =                            0xfe; /* Can't match U+FEFF! */
-       *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
-       *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
-       *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
-       *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
-       *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
-       *d++ = (U8)(( uv        & 0x3f) | 0x80);
-       return d;
-    }
-#ifdef UTF8_QUAD_MAX
-    {
-       *d++ =                            0xff;         /* Can't match U+FFFE! */
-       *d++ =                            0x80;         /* 6 Reserved bits */
-       *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);        /* 2 Reserved bits */
-       *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
-       *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
-       *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
-       *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
-       *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
-       *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
-       *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
-       *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
-       *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
-       *d++ = (U8)(( uv        & 0x3f) | 0x80);
-       return d;
-    }
-#endif
-#endif /* Non loop style */
  }
  }
+
  /*
  =for apidoc uvchr_to_utf8
  
  Adds the UTF-8 representation of the native code point C<uv> to the end
  /*
  =for apidoc uvchr_to_utf8
  
  Adds the UTF-8 representation of the native code point C<uv> to the end
-of the string C<d>; C<d> should have at least C<UNISKIP(uv)+1> (up to
+of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
  C<UTF8_MAXBYTES+1>) free bytes available.  The return value is the pointer to
  the byte after the end of the new character.  In other words,
  
  C<UTF8_MAXBYTES+1>) free bytes available.  The return value is the pointer to
  the byte after the end of the new character.  In other words,
  
@@ -250,8 +238,12 @@ is the recommended wide native character-aware way of saying
  
      *(d++) = uv;
  
  
      *(d++) = uv;
  
-This function accepts any UV as input.  To forbid or warn on non-Unicode code
-points, or those that may be problematic, see L</uvchr_to_utf8_flags>.
+This function accepts any UV as input, but very high code points (above
+C<IV_MAX> on the platform)  will raise a deprecation warning.  This is
+typically 0x7FFF_FFFF in a 32-bit word.
+
+It is possible to forbid or warn on non-Unicode code points, or those that may
+be problematic by using L</uvchr_to_utf8_flags>.
  
  =cut
  */
  
  =cut
  */
@@ -269,7 +261,7 @@ Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
  =for apidoc uvchr_to_utf8_flags
  
  Adds the UTF-8 representation of the native code point C<uv> to the end
  =for apidoc uvchr_to_utf8_flags
  
  Adds the UTF-8 representation of the native code point C<uv> to the end
-of the string C<d>; C<d> should have at least C<UNISKIP(uv)+1> (up to
+of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
  C<UTF8_MAXBYTES+1>) free bytes available.  The return value is the pointer to
  the byte after the end of the new character.  In other words,
  
  C<UTF8_MAXBYTES+1>) free bytes available.  The return value is the pointer to
  the byte after the end of the new character.  In other words,
  
@@ -283,27 +275,61 @@ This is the Unicode-aware way of saying
  
      *(d++) = uv;
  
  
      *(d++) = uv;
  
-This function will convert to UTF-8 (and not warn) even code points that aren't
-legal Unicode or are problematic, unless C<flags> contains one or more of the
-following flags:
-
-If C<uv> is a Unicode surrogate code point and UNICODE_WARN_SURROGATE is set,
-the function will raise a warning, provided UTF8 warnings are enabled.  If instead
-UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL.
-If both flags are set, the function will both warn and return NULL.
-
-The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags
-affect how the function handles a Unicode non-character.  And likewise, the
-UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags affect the handling of
-code points that are
-above the Unicode maximum of 0x10FFFF.  Code points above 0x7FFF_FFFF (which are
-even less portable) can be warned and/or disallowed even if other above-Unicode
-code points are accepted, by the UNICODE_WARN_FE_FF and UNICODE_DISALLOW_FE_FF
-flags.
-
-And finally, the flag UNICODE_WARN_ILLEGAL_INTERCHANGE selects all four of the
-above WARN flags; and UNICODE_DISALLOW_ILLEGAL_INTERCHANGE selects all four
-DISALLOW flags.
+If C<flags> is 0, this function accepts any UV as input, but very high code
+points (above C<IV_MAX> for the platform)  will raise a deprecation warning.
+This is typically 0x7FFF_FFFF in a 32-bit word.
+
+Specifying C<flags> can further restrict what is allowed and not warned on, as
+follows:
+
+If C<uv> is a Unicode surrogate code point and C<UNICODE_WARN_SURROGATE> is set,
+the function will raise a warning, provided UTF8 warnings are enabled.  If
+instead C<UNICODE_DISALLOW_SURROGATE> is set, the function will fail and return
+NULL.  If both flags are set, the function will both warn and return NULL.
+
+Similarly, the C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags
+affect how the function handles a Unicode non-character.
+
+And likewise, the C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags
+affect the handling of code points that are above the Unicode maximum of
+0x10FFFF.  Languages other than Perl may not be able to accept files that
+contain these.
+
+The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of
+the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all
+three DISALLOW flags.  C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> restricts the
+allowed inputs to the strict UTF-8 traditionally defined by Unicode.
+Similarly, C<UNICODE_WARN_ILLEGAL_C9_INTERCHANGE> and
+C<UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE> are shortcuts to select the
+above-Unicode and surrogate flags, but not the non-character ones, as
+defined in
+L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
+See L<perlunicode/Noncharacter code points>.
+
+Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
+so using them is more problematic than other above-Unicode code points.  Perl
+invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
+likely that non-Perl languages will not be able to read files that contain
+these that written by the perl interpreter; nor would Perl understand files
+written by something that uses a different extension.  For these reasons, there
+is a separate set of flags that can warn and/or disallow these extremely high
+code points, even if other above-Unicode ones are accepted.  These are the
+C<UNICODE_WARN_ABOVE_31_BIT> and C<UNICODE_DISALLOW_ABOVE_31_BIT> flags.  These
+are entirely independent from the deprecation warning for code points above
+C<IV_MAX>.  On 32-bit machines, it will eventually be forbidden to have any
+code point that needs more than 31 bits to represent.  When that happens,
+effectively the C<UNICODE_DISALLOW_ABOVE_31_BIT> flag will always be set on
+32-bit machines.  (Of course C<UNICODE_DISALLOW_SUPER> will treat all
+above-Unicode code points, including these, as malformations; and
+C<UNICODE_WARN_SUPER> warns on these.)
+
+On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
+extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
+than on ASCII.  Prior to that, code points 2**31 and higher were simply
+unrepresentable, and a different, incompatible method was used to represent
+code points between 2**30 and 2**31 - 1.  The flags C<UNICODE_WARN_ABOVE_31_BIT>
+and C<UNICODE_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
+platforms, warning and disallowing 2**31 and higher.
  
  =cut
  */
  
  =cut
  */
@@ -317,87 +343,454 @@ Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
      return uvchr_to_utf8_flags(d, uv, flags);
  }
  
      return uvchr_to_utf8_flags(d, uv, flags);
  }
  
-/*
-=for apidoc is_utf8_string
+PERL_STATIC_INLINE bool
+S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e)
+{
+    /* Returns TRUE if the first code point represented by the Perl-extended-
+     * UTF-8-encoded string starting at 's', and looking no further than 'e -
+     * 1' doesn't fit into 31 bytes.  That is, that if it is >= 2**31.
+     *
+     * The function handles the case where the input bytes do not include all
+     * the ones necessary to represent a full character.  That is, they may be
+     * the intial bytes of the representation of a code point, but possibly
+     * the final ones necessary for the complete representation may be beyond
+     * 'e - 1'.
+     *
+     * The function assumes that the sequence is well-formed UTF-8 as far as it
+     * goes, and is for a UTF-8 variant code point.  If the sequence is
+     * incomplete, the function returns FALSE if there is any well-formed
+     * UTF-8 byte sequence that can complete it in such a way that a code point
+     * < 2**31 is produced; otherwise it returns TRUE.
+     *
+     * Getting this exactly right is slightly tricky, and has to be done in
+     * several places in this file, so is centralized here.  It is based on the
+     * following table:
+     *
+     * U+7FFFFFFF (2 ** 31 - 1)
+     *      ASCII: \xFD\xBF\xBF\xBF\xBF\xBF
+     *   IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73
+     *    IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72
+     *   POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75
+     *         I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF
+     * U+80000000 (2 ** 31):
+     *      ASCII: \xFE\x82\x80\x80\x80\x80\x80
+     *              [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10  11  12  13
+     *   IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
+     *    IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
+     *   POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
+     *         I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0
+     */
  
  
-Returns true if the first C<len> bytes of string C<s> form a valid
-UTF-8 string, false otherwise.  If C<len> is 0, it will be calculated
-using C<strlen(s)> (which means if you use this option, that C<s> can't have
-embedded C<NUL> characters and has to have a terminating C<NUL> byte).  Note
-that all characters being ASCII constitute 'a valid UTF-8 string'.
+#ifdef EBCDIC
  
  
-See also L</is_invariant_string>(), L</is_utf8_string_loclen>(), and L</is_utf8_string_loc>().
+        /* [0] is start byte           [1] [2] [3] [4] [5] [6] [7] */
+    const U8 * const prefix = (U8 *) "\x41\x41\x41\x41\x41\x41\x42";
+    const STRLEN prefix_len = sizeof(prefix) - 1;
+    const STRLEN len = e - s;
+    const STRLEN cmp_len = MIN(prefix_len, len - 1);
  
  
-=cut
-*/
+#else
  
  
-bool
-Perl_is_utf8_string(const U8 *s, STRLEN len)
+    PERL_UNUSED_ARG(e);
+
+#endif
+
+    PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS;
+
+    assert(! UTF8_IS_INVARIANT(*s));
+
+#ifndef EBCDIC
+
+    /* Technically, a start byte of FE can be for a code point that fits into
+     * 31 bytes, but not for well-formed UTF-8: doing that requires an overlong
+     * malformation. */
+    return (*s >= 0xFE);
+
+#else
+
+    /* On the EBCDIC code pages we handle, only 0xFE can mean a 32-bit or
+     * larger code point (0xFF is an invariant).  For 0xFE, we need at least 2
+     * bytes, and maybe up through 8 bytes, to be sure if the value is above 31
+     * bits. */
+    if (*s != 0xFE || len == 1) {
+        return FALSE;
+    }
+
+    /* Note that in UTF-EBCDIC, the two lowest possible continuation bytes are
+     * \x41 and \x42. */
+    return cBOOL(memGT(s + 1, prefix, cmp_len));
+
+#endif
+
+}
+
+PERL_STATIC_INLINE bool
+S_does_utf8_overflow(const U8 * const s, const U8 * e)
  {
  {
-    const U8* const send = s + (len ? len : strlen((const char *)s));
-    const U8* x = s;
+    const U8 *x;
+    const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
+
+    /* Returns a boolean as to if this UTF-8 string would overflow a UV on this
+     * platform, that is if it represents a code point larger than the highest
+     * representable code point.  (For ASCII platforms, we could use memcmp()
+     * because we don't have to convert each byte to I8, but it's very rare
+     * input indeed that would approach overflow, so the loop below will likely
+     * only get executed once.
+     *
+     * 'e' must not be beyond a full character.  If it is less than a full
+     * character, the function returns FALSE if there is any input beyond 'e'
+     * that could result in a non-overflowing code point */
+
+    PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
+    assert(s <= e && s + UTF8SKIP(s) >= e);
  
  
-    PERL_ARGS_ASSERT_IS_UTF8_STRING;
+#if ! defined(UV_IS_QUAD) && ! defined(EBCDIC)
  
  
-    while (x < send) {
-        STRLEN len = isUTF8_CHAR(x, send);
-        if (UNLIKELY(! len)) {
+    /* On 32 bit ASCII machines, many overlongs that start with FF don't
+     * overflow */
+
+    if (isFF_OVERLONG(s, e - s)) {
+        const U8 max_32_bit_overlong[] = "\xFF\x80\x80\x80\x80\x80\x80\x84";
+        return memGE(s, max_32_bit_overlong,
+                                MIN(e - s, sizeof(max_32_bit_overlong) - 1));
+    }
+
+#endif
+
+    for (x = s; x < e; x++, y++) {
+
+        /* If this byte is larger than the corresponding highest UTF-8 byte, it
+         * overflows */
+        if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) {
+            return TRUE;
+        }
+
+        /* If not the same as this byte, it must be smaller, doesn't overflow */
+        if (LIKELY(NATIVE_UTF8_TO_I8(*x) != *y)) {
              return FALSE;
          }
              return FALSE;
          }
-        x += len;
      }
  
      }
  
-    return TRUE;
+    /* Got to the end and all bytes are the same.  If the input is a whole
+     * character, it doesn't overflow.  And if it is a partial character,
+     * there's not enough information to tell, so assume doesn't overflow */
+    return FALSE;
  }
  
  }
  
-/*
-Implemented as a macro in utf8.h
+PERL_STATIC_INLINE bool
+S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
+{
+    /* Overlongs can occur whenever the number of continuation bytes
+     * changes.  That means whenever the number of leading 1 bits in a start
+     * byte increases from the next lower start byte.  That happens for start
+     * bytes C0, E0, F0, F8, FC, FE, and FF.  On modern perls, the following
+     * illegal start bytes have already been excluded, so don't need to be
+     * tested here;
+     * ASCII platforms: C0, C1
+     * EBCDIC platforms C0, C1, C2, C3, C4, E0
+     *
+     * At least a second byte is required to determine if other sequences will
+     * be an overlong. */
+
+    const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
+    const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
+
+    PERL_ARGS_ASSERT_IS_UTF8_OVERLONG_GIVEN_START_BYTE_OK;
+    assert(len > 1 && UTF8_IS_START(*s));
+
+    /* Each platform has overlongs after the start bytes given above (expressed
+     * in I8 for EBCDIC).  What constitutes an overlong varies by platform, but
+     * the logic is the same, except the E0 overlong has already been excluded
+     * on EBCDIC platforms.   The  values below were found by manually
+     * inspecting the UTF-8 patterns.  See the tables in utf8.h and
+     * utfebcdic.h. */
+
+#       ifdef EBCDIC
+#           define F0_ABOVE_OVERLONG 0xB0
+#           define F8_ABOVE_OVERLONG 0xA8
+#           define FC_ABOVE_OVERLONG 0xA4
+#           define FE_ABOVE_OVERLONG 0xA2
+#           define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
+                                    /* I8(0xfe) is FF */
+#       else
+
+    if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
+        return TRUE;
+    }
+
+#           define F0_ABOVE_OVERLONG 0x90
+#           define F8_ABOVE_OVERLONG 0x88
+#           define FC_ABOVE_OVERLONG 0x84
+#           define FE_ABOVE_OVERLONG 0x82
+#           define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
+#       endif
+
+
+    if (   (s0 == 0xF0 && UNLIKELY(s1 < F0_ABOVE_OVERLONG))
+        || (s0 == 0xF8 && UNLIKELY(s1 < F8_ABOVE_OVERLONG))
+        || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
+        || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
+    {
+        return TRUE;
+    }
+
+    /* Check for the FF overlong */
+    return isFF_OVERLONG(s, len);
+}
+
+PERL_STATIC_INLINE bool
+S_isFF_OVERLONG(const U8 * const s, const STRLEN len)
+{
+    PERL_ARGS_ASSERT_ISFF_OVERLONG;
+
+    /* Check for the FF overlong.  This happens only if all these bytes match;
+     * what comes after them doesn't matter.  See tables in utf8.h,
+     * utfebcdic.h. */
  
  
-=for apidoc is_utf8_string_loc
+    return    len >= sizeof(FF_OVERLONG_PREFIX) - 1
+           && UNLIKELY(memEQ(s, FF_OVERLONG_PREFIX,
+                                            sizeof(FF_OVERLONG_PREFIX) - 1));
+}
  
  
-Like L</is_utf8_string> but stores the location of the failure (in the
-case of "utf8ness failure") or the location C<s>+C<len> (in the case of
-"utf8ness success") in the C<ep>.
+#undef F0_ABOVE_OVERLONG
+#undef F8_ABOVE_OVERLONG
+#undef FC_ABOVE_OVERLONG
+#undef FE_ABOVE_OVERLONG
+#undef FF_OVERLONG_PREFIX
  
  
-See also L</is_utf8_string_loclen>() and L</is_utf8_string>().
+STRLEN
+Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
+{
+    STRLEN len;
+    const U8 *x;
+
+    /* A helper function that should not be called directly.
+     *
+     * This function returns non-zero if the string beginning at 's' and
+     * looking no further than 'e - 1' is well-formed Perl-extended-UTF-8 for a
+     * code point; otherwise it returns 0.  The examination stops after the
+     * first code point in 's' is validated, not looking at the rest of the
+     * input.  If 'e' is such that there are not enough bytes to represent a
+     * complete code point, this function will return non-zero anyway, if the
+     * bytes it does have are well-formed UTF-8 as far as they go, and aren't
+     * excluded by 'flags'.
+     *
+     * A non-zero return gives the number of bytes required to represent the
+     * code point.  Be aware that if the input is for a partial character, the
+     * return will be larger than 'e - s'.
+     *
+     * This function assumes that the code point represented is UTF-8 variant.
+     * The caller should have excluded this possibility before calling this
+     * function.
+     *
+     * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
+     * accepted by L</utf8n_to_uvchr>.  If non-zero, this function will return
+     * 0 if the code point represented is well-formed Perl-extended-UTF-8, but
+     * disallowed by the flags.  If the input is only for a partial character,
+     * the function will return non-zero if there is any sequence of
+     * well-formed UTF-8 that, when appended to the input sequence, could
+     * result in an allowed code point; otherwise it returns 0.  Non characters
+     * cannot be determined based on partial character input.  But many  of the
+     * other excluded types can be determined with just the first one or two
+     * bytes.
+     *
+     */
  
  
-=for apidoc is_utf8_string_loclen
+    PERL_ARGS_ASSERT__IS_UTF8_CHAR_HELPER;
  
  
-Like L</is_utf8_string>() but stores the location of the failure (in the
-case of "utf8ness failure") or the location C<s>+C<len> (in the case of
-"utf8ness success") in the C<ep>, and the number of UTF-8
-encoded characters in the C<el>.
+    assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+                          |UTF8_DISALLOW_ABOVE_31_BIT)));
+    assert(! UTF8_IS_INVARIANT(*s));
  
  
-See also L</is_utf8_string_loc>() and L</is_utf8_string>().
+    /* A variant char must begin with a start byte */
+    if (UNLIKELY(! UTF8_IS_START(*s))) {
+        return 0;
+    }
  
  
-=cut
-*/
+    /* Examine a maximum of a single whole code point */
+    if (e - s > UTF8SKIP(s)) {
+        e = s + UTF8SKIP(s);
+    }
  
  
-bool
-Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
+    len = e - s;
+
+    if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) {
+        const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
+
+        /* The code below is derived from this table.  Keep in mind that legal
+         * continuation bytes range between \x80..\xBF for UTF-8, and
+         * \xA0..\xBF for I8.  Anything above those aren't continuation bytes.
+         * Hence, we don't have to test the upper edge because if any of those
+         * are encountered, the sequence is malformed, and will fail elsewhere
+         * in this function.
+         *              UTF-8            UTF-EBCDIC I8
+         *   U+D800: \xED\xA0\x80      \xF1\xB6\xA0\xA0      First surrogate
+         *   U+DFFF: \xED\xBF\xBF      \xF1\xB7\xBF\xBF      Final surrogate
+         * U+110000: \xF4\x90\x80\x80  \xF9\xA2\xA0\xA0\xA0  First above Unicode
+         *
+         */
+
+#ifdef EBCDIC   /* On EBCDIC, these are actually I8 bytes */
+#  define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER  0xFA
+#  define IS_UTF8_2_BYTE_SUPER(s0, s1)           ((s0) == 0xF9 && (s1) >= 0xA2)
+
+#  define IS_UTF8_2_BYTE_SURROGATE(s0, s1)       ((s0) == 0xF1              \
+                                                       /* B6 and B7 */      \
+                                              && ((s1) & 0xFE ) == 0xB6)
+#else
+#  define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER  0xF5
+#  define IS_UTF8_2_BYTE_SUPER(s0, s1)           ((s0) == 0xF4 && (s1) >= 0x90)
+#  define IS_UTF8_2_BYTE_SURROGATE(s0, s1)       ((s0) == 0xED && (s1) >= 0xA0)
+#endif
+
+        if (  (flags & UTF8_DISALLOW_SUPER)
+            && UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER)) {
+            return 0;           /* Above Unicode */
+        }
+
+        if (   (flags & UTF8_DISALLOW_ABOVE_31_BIT)
+            &&  UNLIKELY(is_utf8_cp_above_31_bits(s, e)))
+        {
+            return 0;           /* Above 31 bits */
+        }
+
+        if (len > 1) {
+            const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
+
+            if (   (flags & UTF8_DISALLOW_SUPER)
+                &&  UNLIKELY(IS_UTF8_2_BYTE_SUPER(s0, s1)))
+            {
+                return 0;       /* Above Unicode */
+            }
+
+            if (   (flags & UTF8_DISALLOW_SURROGATE)
+                &&  UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(s0, s1)))
+            {
+                return 0;       /* Surrogate */
+            }
+
+            if (  (flags & UTF8_DISALLOW_NONCHAR)
+                && UNLIKELY(UTF8_IS_NONCHAR(s, e)))
+            {
+                return 0;       /* Noncharacter code point */
+            }
+        }
+    }
+
+    /* Make sure that all that follows are continuation bytes */
+    for (x = s + 1; x < e; x++) {
+        if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) {
+            return 0;
+        }
+    }
+
+    /* Here is syntactically valid.  Next, make sure this isn't the start of an
+     * overlong. */
+    if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len)) {
+        return 0;
+    }
+
+    /* And finally, that the code point represented fits in a word on this
+     * platform */
+    if (does_utf8_overflow(s, e)) {
+        return 0;
+    }
+
+    return UTF8SKIP(s);
+}
+
+STATIC char *
+S__byte_dump_string(pTHX_ const U8 * s, const STRLEN len)
  {
  {
-    const U8* const send = s + (len ? len : strlen((const char *)s));
-    const U8* x = s;
-    STRLEN outlen = 0;
+    /* Returns a mortalized C string that is a displayable copy of the 'len'
+     * bytes starting at 's', each in a \xXY format. */
+
+    const STRLEN output_len = 4 * len + 1;  /* 4 bytes per each input, plus a
+                                               trailing NUL */
+    const U8 * const e = s + len;
+    char * output;
+    char * d;
  
  
-    PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
+    PERL_ARGS_ASSERT__BYTE_DUMP_STRING;
  
  
-    while (x < send) {
-        STRLEN len = isUTF8_CHAR(x, send);
-        if (UNLIKELY(! len)) {
-            goto out;
+    Newx(output, output_len, char);
+    SAVEFREEPV(output);
+
+    d = output;
+    for (; s < e; s++) {
+        const unsigned high_nibble = (*s & 0xF0) >> 4;
+        const unsigned low_nibble =  (*s & 0x0F);
+
+        *d++ = '\\';
+        *d++ = 'x';
+
+        if (high_nibble < 10) {
+            *d++ = high_nibble + '0';
+        }
+        else {
+            *d++ = high_nibble - 10 + 'a';
+        }
+
+        if (low_nibble < 10) {
+            *d++ = low_nibble + '0';
+        }
+        else {
+            *d++ = low_nibble - 10 + 'a';
          }
          }
-        x += len;
-        outlen++;
      }
  
      }
  
- out:
-    if (el)
-        *el = outlen;
+    *d = '\0';
+    return output;
+}
+
+PERL_STATIC_INLINE char *
+S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
+
+                                         /* How many bytes to print */
+                                         STRLEN print_len,
+
+                                         /* Which one is the non-continuation */
+                                         const STRLEN non_cont_byte_pos,
+
+                                         /* How many bytes should there be? */
+                                         const STRLEN expect_len)
+{
+    /* Return the malformation warning text for an unexpected continuation
+     * byte. */
+
+    const char * const where = (non_cont_byte_pos == 1)
+                               ? "immediately"
+                               : Perl_form(aTHX_ "%d bytes",
+                                                 (int) non_cont_byte_pos);
+    unsigned int i;
+
+    PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
+
+    /* We don't need to pass this parameter, but since it has already been
+     * calculated, it's likely faster to pass it; verify under DEBUGGING */
+    assert(expect_len == UTF8SKIP(s));
+
+    /* It is possible that utf8n_to_uvchr() was called incorrectly, with a
+     * length that is larger than is actually available in the buffer.  If we
+     * print all the bytes based on that length, we will read past the buffer
+     * end.  Often, the strings are NUL terminated, so to lower the chances of
+     * this happening, print the malformed bytes only up through any NUL. */
+    for (i = 1; i < print_len; i++) {
+        if (*(s + i) == '\0') {
+            print_len = i + 1;  /* +1 gets the NUL printed */
+            break;
+        }
+    }
  
  
-    if (ep)
-        *ep = x;
-    return (x == send);
+    return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
+                           " %s after start byte 0x%02x; need %d bytes, got %d)",
+                           malformed_text,
+                           _byte_dump_string(s, print_len),
+                           *(s + non_cont_byte_pos),
+                           where,
+                           *s,
+                           (int) expect_len,
+                           (int) non_cont_byte_pos);
  }
  
  /*
  }
  
  /*
@@ -414,10 +807,13 @@ C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
  the length, in bytes, of that character.
  
  The value of C<flags> determines the behavior when C<s> does not point to a
  the length, in bytes, of that character.
  
  The value of C<flags> determines the behavior when C<s> does not point to a
-well-formed UTF-8 character.  If C<flags> is 0, when a malformation is found,
-zero is returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>) is the
-next possible position in C<s> that could begin a non-malformed character.
-Also, if UTF-8 warnings haven't been lexically disabled, a warning is raised.
+well-formed UTF-8 character.  If C<flags> is 0, encountering a malformation
+causes zero to be returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>)
+is the next possible position in C<s> that could begin a non-malformed
+character.  Also, if UTF-8 warnings haven't been lexically disabled, a warning
+is raised.  Some UTF-8 input sequences may contain multiple malformations.
+This function tries to find every possible one in each call, so multiple
+warnings can be raised for each sequence.
  
  Various ALLOW flags can be set in C<flags> to allow (and not warn on)
  individual types of malformations, such as the sequence being overlong (that
  
  Various ALLOW flags can be set in C<flags> to allow (and not warn on)
  individual types of malformations, such as the sequence being overlong (that
@@ -430,76 +826,214 @@ overlong sequences, the computed code point is returned; for all other allowed
  malformations, the Unicode REPLACEMENT CHARACTER is returned, as these have no
  determinable reasonable value.
  
  malformations, the Unicode REPLACEMENT CHARACTER is returned, as these have no
  determinable reasonable value.
  
-The UTF8_CHECK_ONLY flag overrides the behavior when a non-allowed (by other
+The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other
  flags) malformation is found.  If this flag is set, the routine assumes that
  the caller will raise a warning, and this function will silently just set
  C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
  
  Note that this API requires disambiguation between successful decoding a C<NUL>
  flags) malformation is found.  If this flag is set, the routine assumes that
  the caller will raise a warning, and this function will silently just set
  C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
  
  Note that this API requires disambiguation between successful decoding a C<NUL>
-character, and an error return (unless the UTF8_CHECK_ONLY flag is set), as
-in both cases, 0 is returned.  To disambiguate, upon a zero return, see if the
-first byte of C<s> is 0 as well.  If so, the input was a C<NUL>; if not, the
-input had an error.
+character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as
+in both cases, 0 is returned, and, depending on the malformation, C<retlen> may
+be set to 1.  To disambiguate, upon a zero return, see if the first byte of
+C<s> is 0 as well.  If so, the input was a C<NUL>; if not, the input had an
+error.  Or you can use C<L</utf8n_to_uvchr_error>>.
  
  Certain code points are considered problematic.  These are Unicode surrogates,
  Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
  By default these are considered regular code points, but certain situations
  
  Certain code points are considered problematic.  These are Unicode surrogates,
  Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
  By default these are considered regular code points, but certain situations
-warrant special handling for them.  If C<flags> contains
-UTF8_DISALLOW_ILLEGAL_INTERCHANGE, all three classes are treated as
-malformations and handled as such.  The flags UTF8_DISALLOW_SURROGATE,
-UTF8_DISALLOW_NONCHAR, and UTF8_DISALLOW_SUPER (meaning above the legal Unicode
-maximum) can be set to disallow these categories individually.
-
-The flags UTF8_WARN_ILLEGAL_INTERCHANGE, UTF8_WARN_SURROGATE,
-UTF8_WARN_NONCHAR, and UTF8_WARN_SUPER will cause warning messages to be raised
-for their respective categories, but otherwise the code points are considered
-valid (not malformations).  To get a category to both be treated as a
-malformation and raise a warning, specify both the WARN and DISALLOW flags.
+warrant special handling for them, which can be specified using the C<flags>
+parameter.  If C<flags> contains C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all
+three classes are treated as malformations and handled as such.  The flags
+C<UTF8_DISALLOW_SURROGATE>, C<UTF8_DISALLOW_NONCHAR>, and
+C<UTF8_DISALLOW_SUPER> (meaning above the legal Unicode maximum) can be set to
+disallow these categories individually.  C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>
+restricts the allowed inputs to the strict UTF-8 traditionally defined by
+Unicode.  Use C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE> to use the strictness
+definition given by
+L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
+The difference between traditional strictness and C9 strictness is that the
+latter does not forbid non-character code points.  (They are still discouraged,
+however.)  For more discussion see L<perlunicode/Noncharacter code points>.
+
+The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>,
+C<UTF8_WARN_ILLEGAL_C9_INTERCHANGE>, C<UTF8_WARN_SURROGATE>,
+C<UTF8_WARN_NONCHAR>, and C<UTF8_WARN_SUPER> will cause warning messages to be
+raised for their respective categories, but otherwise the code points are
+considered valid (not malformations).  To get a category to both be treated as
+a malformation and raise a warning, specify both the WARN and DISALLOW flags.
  (But note that warnings are not raised if lexically disabled nor if
  (But note that warnings are not raised if lexically disabled nor if
-UTF8_CHECK_ONLY is also specified.)
-
-Very large code points (above 0x7FFF_FFFF) are considered more problematic than
-the others that are above the Unicode legal maximum.  There are several
-reasons: they requre at least 32 bits to represent them on ASCII platforms, are
-not representable at all on EBCDIC platforms, and the original UTF-8
-specification never went above this number (the current 0x10FFFF limit was
-imposed later).  (The smaller ones, those that fit into 32 bits, are
-representable by a UV on ASCII platforms, but not by an IV, which means that
-the number of operations that can be performed on them is quite restricted.)
-The UTF-8 encoding on ASCII platforms for these large code points begins with a
-byte containing 0xFE or 0xFF.  The UTF8_DISALLOW_FE_FF flag will cause them to
-be treated as malformations, while allowing smaller above-Unicode code points.
-(Of course UTF8_DISALLOW_SUPER will treat all above-Unicode code points,
-including these, as malformations.)
-Similarly, UTF8_WARN_FE_FF acts just like
-the other WARN flags, but applies just to these code points.
+C<UTF8_CHECK_ONLY> is also specified.)
+
+It is now deprecated to have very high code points (above C<IV_MAX> on the
+platforms) and this function will raise a deprecation warning for these (unless
+such warnings are turned off).  This value is typically 0x7FFF_FFFF (2**31 -1)
+in a 32-bit word.
+
+Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
+so using them is more problematic than other above-Unicode code points.  Perl
+invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
+likely that non-Perl languages will not be able to read files that contain
+these; nor would Perl understand files
+written by something that uses a different extension.  For these reasons, there
+is a separate set of flags that can warn and/or disallow these extremely high
+code points, even if other above-Unicode ones are accepted.  These are the
+C<UTF8_WARN_ABOVE_31_BIT> and C<UTF8_DISALLOW_ABOVE_31_BIT> flags.  These
+are entirely independent from the deprecation warning for code points above
+C<IV_MAX>.  On 32-bit machines, it will eventually be forbidden to have any
+code point that needs more than 31 bits to represent.  When that happens,
+effectively the C<UTF8_DISALLOW_ABOVE_31_BIT> flag will always be set on
+32-bit machines.  (Of course C<UTF8_DISALLOW_SUPER> will treat all
+above-Unicode code points, including these, as malformations; and
+C<UTF8_WARN_SUPER> warns on these.)
+
+On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
+extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
+than on ASCII.  Prior to that, code points 2**31 and higher were simply
+unrepresentable, and a different, incompatible method was used to represent
+code points between 2**30 and 2**31 - 1.  The flags C<UTF8_WARN_ABOVE_31_BIT>
+and C<UTF8_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
+platforms, warning and disallowing 2**31 and higher.
  
  All other code points corresponding to Unicode characters, including private
  use and those yet to be assigned, are never considered malformed and never
  warn.
  
  =cut
  
  All other code points corresponding to Unicode characters, including private
  use and those yet to be assigned, are never considered malformed and never
  warn.
  
  =cut
+
+Also implemented as a macro in utf8.h
  */
  
  UV
  */
  
  UV
-Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
+Perl_utf8n_to_uvchr(pTHX_ const U8 *s,
+                          STRLEN curlen,
+                          STRLEN *retlen,
+                          const U32 flags)
+{
+    PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
+
+    return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL);
+}
+
+/*
+
+=for apidoc utf8n_to_uvchr_error
+
+THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
+Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
+
+This function is for code that needs to know what the precise malformation(s)
+are when an error is found.
+
+It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after
+all the others, C<errors>.  If this parameter is 0, this function behaves
+identically to C<L</utf8n_to_uvchr>>.  Otherwise, C<errors> should be a pointer
+to a C<U32> variable, which this function sets to indicate any errors found.
+Upon return, if C<*errors> is 0, there were no errors found.  Otherwise,
+C<*errors> is the bit-wise C<OR> of the bits described in the list below.  Some
+of these bits will be set if a malformation is found, even if the input
+C<flags> parameter indicates that the given malformation is allowed; the
+exceptions are noted:
+
+=over 4
+
+=item C<UTF8_GOT_ABOVE_31_BIT>
+
+The code point represented by the input UTF-8 sequence occupies more than 31
+bits.
+This bit is set only if the input C<flags> parameter contains either the
+C<UTF8_DISALLOW_ABOVE_31_BIT> or the C<UTF8_WARN_ABOVE_31_BIT> flags.
+
+=item C<UTF8_GOT_CONTINUATION>
+
+The input sequence was malformed in that the first byte was a a UTF-8
+continuation byte.
+
+=item C<UTF8_GOT_EMPTY>
+
+The input C<curlen> parameter was 0.
+
+=item C<UTF8_GOT_LONG>
+
+The input sequence was malformed in that there is some other sequence that
+evaluates to the same code point, but that sequence is shorter than this one.
+
+=item C<UTF8_GOT_NONCHAR>
+
+The code point represented by the input UTF-8 sequence is for a Unicode
+non-character code point.
+This bit is set only if the input C<flags> parameter contains either the
+C<UTF8_DISALLOW_NONCHAR> or the C<UTF8_WARN_NONCHAR> flags.
+
+=item C<UTF8_GOT_NON_CONTINUATION>
+
+The input sequence was malformed in that a non-continuation type byte was found
+in a position where only a continuation type one should be.
+
+=item C<UTF8_GOT_OVERFLOW>
+
+The input sequence was malformed in that it is for a code point that is not
+representable in the number of bits available in a UV on the current platform.
+
+=item C<UTF8_GOT_SHORT>
+
+The input sequence was malformed in that C<curlen> is smaller than required for
+a complete sequence.  In other words, the input is for a partial character
+sequence.
+
+=item C<UTF8_GOT_SUPER>
+
+The input sequence was malformed in that it is for a non-Unicode code point;
+that is, one above the legal Unicode maximum.
+This bit is set only if the input C<flags> parameter contains either the
+C<UTF8_DISALLOW_SUPER> or the C<UTF8_WARN_SUPER> flags.
+
+=item C<UTF8_GOT_SURROGATE>
+
+The input sequence was malformed in that it is for a -Unicode UTF-16 surrogate
+code point.
+This bit is set only if the input C<flags> parameter contains either the
+C<UTF8_DISALLOW_SURROGATE> or the C<UTF8_WARN_SURROGATE> flags.
+
+=back
+
+=cut
+*/
+
+UV
+Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
+                                STRLEN curlen,
+                                STRLEN *retlen,
+                                const U32 flags,
+                                U32 * errors)
  {
      const U8 * const s0 = s;
  {
      const U8 * const s0 = s;
-    U8 overflow_byte = '\0';   /* Save byte in case of overflow */
-    U8 * send;
+    U8 * send = NULL;           /* (initialized to silence compilers' wrong
+                                   warning) */
+    U32 possible_problems = 0;  /* A bit is set here for each potential problem
+                                   found as we go along */
      UV uv = *s;
      UV uv = *s;
-    STRLEN expectlen;
-    SV* sv = NULL;
-    UV outlier_ret = 0;        /* return value when input is in error or problematic
-                        */
-    UV pack_warn = 0;  /* Save result of packWARN() for later */
-    bool unexpected_non_continuation = FALSE;
-    bool overflowed = FALSE;
-    bool do_overlong_test = TRUE;   /* May have to skip this test */
+    STRLEN expectlen   = 0;     /* How long should this sequence be?
+                                   (initialized to silence compilers' wrong
+                                   warning) */
+    U32 discard_errors = 0;     /* Used to save branches when 'errors' is NULL;
+                                   this gets set and discarded */
  
  
-    const char* const malformed_text = "Malformed UTF-8 character";
+    /* The below are used only if there is both an overlong malformation and a
+     * too short one.  Otherwise the first two are set to 's0' and 'send', and
+     * the third not used at all */
+    U8 * adjusted_s0 = (U8 *) s0;
+    U8 * adjusted_send = NULL;  /* (Initialized to silence compilers' wrong
+                                   warning) */
+    UV uv_so_far = 0;   /* (Initialized to silence compilers' wrong warning) */
  
  
-    PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
+    PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
+
+    if (errors) {
+        *errors = 0;
+    }
+    else {
+        errors = &discard_errors;
+    }
  
      /* The order of malformation tests here is important.  We should consume as
       * few bytes as possible in order to not skip any valid character.  This is
  
      /* The order of malformation tests here is important.  We should consume as
       * few bytes as possible in order to not skip any valid character.  This is
@@ -519,21 +1053,21 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
       * returning to the caller C<*retlen> pointing to the very next byte (one
       * which is actually part of of the overflowing sequence), that could look
       * legitimate to the caller, which could discard the initial partial
       * returning to the caller C<*retlen> pointing to the very next byte (one
       * which is actually part of of the overflowing sequence), that could look
       * legitimate to the caller, which could discard the initial partial
-     * sequence and process the rest, inappropriately */
+     * sequence and process the rest, inappropriately.
+     *
+     * Some possible input sequences are malformed in more than one way.  This
+     * function goes to lengths to try to find all of them.  This is necessary
+     * for correctness, as the inputs may allow one malformation but not
+     * another, and if we abandon searching for others after finding the
+     * allowed one, we could allow in something that shouldn't have been.
+     */
  
  
-    /* Zero length strings, if allowed, of necessity are zero */
      if (UNLIKELY(curlen == 0)) {
      if (UNLIKELY(curlen == 0)) {
-       if (retlen) {
-           *retlen = 0;
-       }
-
-       if (flags & UTF8_ALLOW_EMPTY) {
-           return 0;
-       }
-       if (! (flags & UTF8_CHECK_ONLY)) {
-           sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (empty string)", malformed_text));
-       }
-       goto malformed;
+        possible_problems |= UTF8_GOT_EMPTY;
+        curlen = 0;
+        uv = 0; /* XXX It could be argued that this should be
+                   UNICODE_REPLACEMENT? */
+       goto ready_to_handle_errors;
      }
  
      expectlen = UTF8SKIP(s);
      }
  
      expectlen = UTF8SKIP(s);
@@ -553,273 +1087,536 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
  
      /* A continuation character can't start a valid sequence */
      if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
  
      /* A continuation character can't start a valid sequence */
      if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
-       if (flags & UTF8_ALLOW_CONTINUATION) {
-           if (retlen) {
-               *retlen = 1;
-           }
-           return UNICODE_REPLACEMENT;
-       }
+       possible_problems |= UTF8_GOT_CONTINUATION;
+        curlen = 1;
+        uv = UNICODE_REPLACEMENT;
+       goto ready_to_handle_errors;
+    }
+
+    /* Here is not a continuation byte, nor an invariant.  The only thing left
+     * is a start byte (possibly for an overlong) */
+
+    /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
+     * that indicate the number of bytes in the character's whole UTF-8
+     * sequence, leaving just the bits that are part of the value.  */
+    uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
+
+    /* Now, loop through the remaining bytes in the character's sequence,
+     * accumulating each into the working value as we go.  Be sure to not look
+     * past the end of the input string */
+    send = adjusted_send = (U8*) s0 + ((expectlen <= curlen)
+                                       ? expectlen
+                                       : curlen);
+    for (s = s0 + 1; s < send; s++) {
+       if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
+           uv = UTF8_ACCUMULATE(uv, *s);
+            continue;
+        }
+
+        /* Here, found a non-continuation before processing all expected bytes.
+         * This byte indicates the beginning of a new character, so quit, even
+         * if allowing this malformation. */
+        curlen = s - s0;    /* Save how many bytes we actually got */
+        possible_problems |= UTF8_GOT_NON_CONTINUATION;
+        goto finish_short;
+    } /* End of loop through the character's bytes */
+
+    /* Save how many bytes were actually in the character */
+    curlen = s - s0;
+
+    /* Did we get all the continuation bytes that were expected?  Note that we
+     * know this result even without executing the loop above.  But we had to
+     * do the loop to see if there are unexpected non-continuations. */
+    if (UNLIKELY(curlen < expectlen)) {
+       possible_problems |= UTF8_GOT_SHORT;
+
+      finish_short:
+        uv_so_far = uv;
+        uv = UNICODE_REPLACEMENT;
+    }
+
+    /* Note that there are two types of too-short malformation.  One is when
+     * there is actual wrong data before the normal termination of the
+     * sequence.  The other is that the sequence wasn't complete before the end
+     * of the data we are allowed to look at, based on the input 'curlen'.
+     * This means that we were passed data for a partial character, but it is
+     * valid as far as we saw.  The other is definitely invalid.  This
+     * distinction could be important to a caller, so the two types are kept
+     * separate. */
+
+    /* Check for overflow */
+    if (UNLIKELY(does_utf8_overflow(s0, send))) {
+        possible_problems |= UTF8_GOT_OVERFLOW;
+        uv = UNICODE_REPLACEMENT;
+    }
+
+    /* Check for overlong.  If no problems so far, 'uv' is the correct code
+     * point value.  Simply see if it is expressible in fewer bytes.  Otherwise
+     * we must look at the UTF-8 byte sequence itself to see if it is for an
+     * overlong */
+    if (     (   LIKELY(! possible_problems)
+              && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
+        || (   UNLIKELY(  possible_problems)
+            && (   UNLIKELY(! UTF8_IS_START(*s0))
+                || (   curlen > 1
+                    && UNLIKELY(is_utf8_overlong_given_start_byte_ok(s0,
+                                                                send - s0))))))
+    {
+        possible_problems |= UTF8_GOT_LONG;
+
+        /* A convenience macro that matches either of the too-short conditions.
+         * */
+#       define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
+
+        if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
+            UV min_uv = uv_so_far;
+            STRLEN i;
+
+            /* Here, the input is both overlong and is missing some trailing
+             * bytes.  There is no single code point it could be for, but there
+             * may be enough information present to determine if what we have
+             * so far is for an unallowed code point, such as for a surrogate.
+             * The code below has the intelligence to determine this, but just
+             * for non-overlong UTF-8 sequences.  What we do here is calculate
+             * the smallest code point the input could represent if there were
+             * no too short malformation.  Then we compute and save the UTF-8
+             * for that, which is what the code below looks at instead of the
+             * raw input.  It turns out that the smallest such code point is
+             * all we need. */
+            for (i = curlen; i < expectlen; i++) {
+                min_uv = UTF8_ACCUMULATE(min_uv,
+                                     I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK));
+            }
+
+            Newx(adjusted_s0, OFFUNISKIP(min_uv) + 1, U8);
+            SAVEFREEPV((U8 *) adjusted_s0);    /* Needed because we may not get
+                                                  to free it ourselves if
+                                                  warnings are made fatal */
+            adjusted_send = uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
+        }
+    }
+
+    /* Now check that the input isn't for a problematic code point not allowed
+     * by the input parameters. */
+                                              /* isn't problematic if < this */
+    if (   (   (   LIKELY(! possible_problems) && uv >= UNICODE_SURROGATE_FIRST)
+            || (   UNLIKELY(possible_problems)
+                && isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)))
+       && ((flags & ( UTF8_DISALLOW_NONCHAR
+                      |UTF8_DISALLOW_SURROGATE
+                      |UTF8_DISALLOW_SUPER
+                      |UTF8_DISALLOW_ABOVE_31_BIT
+                     |UTF8_WARN_NONCHAR
+                      |UTF8_WARN_SURROGATE
+                      |UTF8_WARN_SUPER
+                      |UTF8_WARN_ABOVE_31_BIT))
+                   /* In case of a malformation, 'uv' is not valid, and has
+                    * been changed to something in the Unicode range.
+                    * Currently we don't output a deprecation message if there
+                    * is already a malformation, so we don't have to special
+                    * case the test immediately below */
+            || (   UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
+                && ckWARN_d(WARN_DEPRECATED))))
+    {
+        /* If there were no malformations, or the only malformation is an
+         * overlong, 'uv' is valid */
+        if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
+            if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
+                possible_problems |= UTF8_GOT_SURROGATE;
+            }
+            else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
+                possible_problems |= UTF8_GOT_SUPER;
+            }
+            else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
+                possible_problems |= UTF8_GOT_NONCHAR;
+            }
+        }
+        else {  /* Otherwise, need to look at the source UTF-8, possibly
+                   adjusted to be non-overlong */
+
+            if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
+                                >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
+            {
+                possible_problems |= UTF8_GOT_SUPER;
+            }
+            else if (curlen > 1) {
+                if (UNLIKELY(IS_UTF8_2_BYTE_SUPER(
+                                      NATIVE_UTF8_TO_I8(*adjusted_s0),
+                                      NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
+                {
+                    possible_problems |= UTF8_GOT_SUPER;
+                }
+                else if (UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(
+                                      NATIVE_UTF8_TO_I8(*adjusted_s0),
+                                      NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
+                {
+                    possible_problems |= UTF8_GOT_SURROGATE;
+                }
+            }
  
  
-       if (! (flags & UTF8_CHECK_ONLY)) {
-           sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected continuation byte 0x%02x, with no preceding start byte)", malformed_text, *s0));
-       }
-       curlen = 1;
-       goto malformed;
+            /* We need a complete well-formed UTF-8 character to discern
+             * non-characters, so can't look for them here */
+        }
      }
  
      }
  
-    /* Here is not a continuation byte, nor an invariant.  The only thing left
-     * is a start byte (possibly for an overlong) */
-
-#ifdef EBCDIC
-    uv = NATIVE_UTF8_TO_I8(uv);
-#endif
+  ready_to_handle_errors:
+
+    /* At this point:
+     * curlen               contains the number of bytes in the sequence that
+     *                      this call should advance the input by.
+     * possible_problems'   is 0 if there weren't any problems; otherwise a bit
+     *                      is set in it for each potential problem found.
+     * uv                   contains the code point the input sequence
+     *                      represents; or if there is a problem that prevents
+     *                      a well-defined value from being computed, it is
+     *                      some subsitute value, typically the REPLACEMENT
+     *                      CHARACTER.
+     * s0                   points to the first byte of the character
+     * send                 points to just after where that (potentially
+     *                      partial) character ends
+     * adjusted_s0          normally is the same as s0, but in case of an
+     *                      overlong for which the UTF-8 matters below, it is
+     *                      the first byte of the shortest form representation
+     *                      of the input.
+     * adjusted_send        normally is the same as 'send', but if adjusted_s0
+     *                      is set to something other than s0, this points one
+     *                      beyond its end
+     */
  
  
-    /* Remove the leading bits that indicate the number of bytes in the
-     * character's whole UTF-8 sequence, leaving just the bits that are part of
-     * the value */
-    uv &= UTF_START_MASK(expectlen);
+    if (UNLIKELY(possible_problems)) {
+        bool disallowed = FALSE;
+        const U32 orig_problems = possible_problems;
  
  
-    /* Now, loop through the remaining bytes in the character's sequence,
-     * accumulating each into the working value as we go.  Be sure to not look
-     * past the end of the input string */
-    send =  (U8*) s0 + ((expectlen <= curlen) ? expectlen : curlen);
+        while (possible_problems) { /* Handle each possible problem */
+            UV pack_warn = 0;
+            char * message = NULL;
  
  
-    for (s = s0 + 1; s < send; s++) {
-       if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
-#ifndef EBCDIC /* Can't overflow in EBCDIC */
-           if (uv & UTF_ACCUMULATION_OVERFLOW_MASK) {
-
-               /* The original implementors viewed this malformation as more
-                * serious than the others (though I, khw, don't understand
-                * why, since other malformations also give very very wrong
-                * results), so there is no way to turn off checking for it.
-                * Set a flag, but keep going in the loop, so that we absorb
-                * the rest of the bytes that comprise the character. */
-               overflowed = TRUE;
-               overflow_byte = *s; /* Save for warning message's use */
-           }
-#endif
-           uv = UTF8_ACCUMULATE(uv, *s);
-       }
-       else {
-           /* Here, found a non-continuation before processing all expected
-            * bytes.  This byte begins a new character, so quit, even if
-            * allowing this malformation. */
-           unexpected_non_continuation = TRUE;
-           break;
-       }
-    } /* End of loop through the character's bytes */
+            /* Each 'if' clause handles one problem.  They are ordered so that
+             * the first ones' messages will be displayed before the later
+             * ones; this is kinda in decreasing severity order */
+            if (possible_problems & UTF8_GOT_OVERFLOW) {
  
  
-    /* Save how many bytes were actually in the character */
-    curlen = s - s0;
+                /* Overflow means also got a super and above 31 bits, but we
+                 * handle all three cases here */
+                possible_problems
+                  &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_ABOVE_31_BIT);
+                *errors |= UTF8_GOT_OVERFLOW;
  
  
-    /* The loop above finds two types of malformations: non-continuation and/or
-     * overflow.  The non-continuation malformation is really a too-short
-     * malformation, as it means that the current character ended before it was
-     * expected to (being terminated prematurely by the beginning of the next
-     * character, whereas in the too-short malformation there just are too few
-     * bytes available to hold the character.  In both cases, the check below
-     * that we have found the expected number of bytes would fail if executed.)
-     * Thus the non-continuation malformation is really unnecessary, being a
-     * subset of the too-short malformation.  But there may be existing
-     * applications that are expecting the non-continuation type, so we retain
-     * it, and return it in preference to the too-short malformation.  (If this
-     * code were being written from scratch, the two types might be collapsed
-     * into one.)  I, khw, am also giving priority to returning the
-     * non-continuation and too-short malformations over overflow when multiple
-     * ones are present.  I don't know of any real reason to prefer one over
-     * the other, except that it seems to me that multiple-byte errors trumps
-     * errors from a single byte */
-    if (UNLIKELY(unexpected_non_continuation)) {
-       if (!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
-           if (! (flags & UTF8_CHECK_ONLY)) {
-               if (curlen == 1) {
-                   sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, immediately after start byte 0x%02x)", malformed_text, *s, *s0));
-               }
-               else {
-                   sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, %d bytes after start byte 0x%02x, expected %d bytes)", malformed_text, *s, (int) curlen, *s0, (int)expectlen));
-               }
-           }
-           goto malformed;
-       }
-       uv = UNICODE_REPLACEMENT;
+                /* But the API says we flag all errors found */
+                if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) {
+                    *errors |= UTF8_GOT_SUPER;
+                }
+                if (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_DISALLOW_ABOVE_31_BIT)) {
+                    *errors |= UTF8_GOT_ABOVE_31_BIT;
+                }
  
  
-       /* Skip testing for overlongs, as the REPLACEMENT may not be the same
-        * as what the original expectations were. */
-       do_overlong_test = FALSE;
-       if (retlen) {
-           *retlen = curlen;
-       }
-    }
-    else if (UNLIKELY(curlen < expectlen)) {
-       if (! (flags & UTF8_ALLOW_SHORT)) {
-           if (! (flags & UTF8_CHECK_ONLY)) {
-               sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, *s0));
-           }
-           goto malformed;
-       }
-       uv = UNICODE_REPLACEMENT;
-       do_overlong_test = FALSE;
-       if (retlen) {
-           *retlen = curlen;
-       }
-    }
+                disallowed = TRUE;
  
  
-#ifndef EBCDIC /* EBCDIC can't overflow */
-    if (UNLIKELY(overflowed)) {
-       sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (overflow at byte 0x%02x, after start byte 0x%02x)", malformed_text, overflow_byte, *s0));
-       goto malformed;
-    }
-#endif
+                /* The warnings code explicitly says it doesn't handle the case
+                 * of packWARN2 and two categories which have parent-child
+                 * relationship.  Even if it works now to raise the warning if
+                 * either is enabled, it wouldn't necessarily do so in the
+                 * future.  We output (only) the most dire warning*/
+                if (! (flags & UTF8_CHECK_ONLY)) {
+                    if (ckWARN_d(WARN_UTF8)) {
+                        pack_warn = packWARN(WARN_UTF8);
+                    }
+                    else if (ckWARN_d(WARN_NON_UNICODE)) {
+                        pack_warn = packWARN(WARN_NON_UNICODE);
+                    }
+                    if (pack_warn) {
+                        message = Perl_form(aTHX_ "%s: %s (overflows)",
+                                        malformed_text,
+                                        _byte_dump_string(s0, send - s0));
+                    }
+                }
+            }
+            else if (possible_problems & UTF8_GOT_EMPTY) {
+                possible_problems &= ~UTF8_GOT_EMPTY;
+                *errors |= UTF8_GOT_EMPTY;
+
+                if (! (flags & UTF8_ALLOW_EMPTY)) {
+                    disallowed = TRUE;
+                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+                        pack_warn = packWARN(WARN_UTF8);
+                        message = Perl_form(aTHX_ "%s (empty string)",
+                                                   malformed_text);
+                    }
+                }
+            }
+            else if (possible_problems & UTF8_GOT_CONTINUATION) {
+                possible_problems &= ~UTF8_GOT_CONTINUATION;
+                *errors |= UTF8_GOT_CONTINUATION;
+
+                if (! (flags & UTF8_ALLOW_CONTINUATION)) {
+                    disallowed = TRUE;
+                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+                        pack_warn = packWARN(WARN_UTF8);
+                        message = Perl_form(aTHX_
+                                "%s: %s (unexpected continuation byte 0x%02x,"
+                                " with no preceding start byte)",
+                                malformed_text,
+                                _byte_dump_string(s0, 1), *s0);
+                    }
+                }
+            }
+            else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
+                possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
+                *errors |= UTF8_GOT_NON_CONTINUATION;
+
+                if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
+                    disallowed = TRUE;
+                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+                        pack_warn = packWARN(WARN_UTF8);
+                        message = Perl_form(aTHX_ "%s",
+                            unexpected_non_continuation_text(s0,
+                                                            send - s0,
+                                                            s - s0,
+                                                            (int) expectlen));
+                    }
+                }
+            }
+            else if (possible_problems & UTF8_GOT_SHORT) {
+                possible_problems &= ~UTF8_GOT_SHORT;
+                *errors |= UTF8_GOT_SHORT;
+
+                if (! (flags & UTF8_ALLOW_SHORT)) {
+                    disallowed = TRUE;
+                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+                        pack_warn = packWARN(WARN_UTF8);
+                        message = Perl_form(aTHX_
+                                "%s: %s (too short; got %d byte%s, need %d)",
+                                malformed_text,
+                                _byte_dump_string(s0, send - s0),
+                                (int)curlen,
+                                curlen == 1 ? "" : "s",
+                                (int)expectlen);
+                    }
+                }
  
  
-    if (do_overlong_test
-       && expectlen > (STRLEN) OFFUNISKIP(uv)
-       && ! (flags & UTF8_ALLOW_LONG))
-    {
-       /* The overlong malformation has lower precedence than the others.
-        * Note that if this malformation is allowed, we return the actual
-        * value, instead of the replacement character.  This is because this
-        * value is actually well-defined. */
-       if (! (flags & UTF8_CHECK_ONLY)) {
-           sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)expectlen, expectlen == 1 ? "": "s", OFFUNISKIP(uv), *s0));
-       }
-       goto malformed;
-    }
+            }
+            else if (possible_problems & UTF8_GOT_LONG) {
+                possible_problems &= ~UTF8_GOT_LONG;
+                *errors |= UTF8_GOT_LONG;
+
+                if (! (flags & UTF8_ALLOW_LONG)) {
+                    disallowed = TRUE;
+
+                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+                        pack_warn = packWARN(WARN_UTF8);
+
+                        /* These error types cause 'uv' to be something that
+                         * isn't what was intended, so can't use it in the
+                         * message.  The other error types either can't
+                         * generate an overlong, or else the 'uv' is valid */
+                        if (orig_problems &
+                                        (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
+                        {
+                            message = Perl_form(aTHX_
+                                    "%s: %s (any UTF-8 sequence that starts"
+                                    " with \"%s\" is overlong which can and"
+                                    " should be represented with a"
+                                    " different, shorter sequence)",
+                                    malformed_text,
+                                    _byte_dump_string(s0, send - s0),
+                                    _byte_dump_string(s0, curlen));
+                        }
+                        else {
+                            U8 tmpbuf[UTF8_MAXBYTES+1];
+                            const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
+                                                                        uv, 0);
+                            message = Perl_form(aTHX_
+                                "%s: %s (overlong; instead use %s to represent"
+                                " U+%0*"UVXf")",
+                                malformed_text,
+                                _byte_dump_string(s0, send - s0),
+                                _byte_dump_string(tmpbuf, e - tmpbuf),
+                                ((uv < 256) ? 2 : 4), /* Field width of 2 for
+                                                         small code points */
+                                uv);
+                        }
+                    }
+                }
+            }
+            else if (possible_problems & UTF8_GOT_SURROGATE) {
+                possible_problems &= ~UTF8_GOT_SURROGATE;
  
  
-    /* Here, the input is considered to be well-formed, but it still could be a
-     * problematic code point that is not allowed by the input parameters. */
-    if (uv >= UNICODE_SURROGATE_FIRST /* isn't problematic if < this */
-       && (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE
-                    |UTF8_WARN_ILLEGAL_INTERCHANGE)))
-    {
-       if (UNICODE_IS_SURROGATE(uv)) {
+                if (flags & UTF8_WARN_SURROGATE) {
+                    *errors |= UTF8_GOT_SURROGATE;
  
  
-            /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
-             * generation of the sv, since no warnings are raised under CHECK */
-           if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE
-               && ckWARN_d(WARN_SURROGATE))
-           {
-               sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv));
-               pack_warn = packWARN(WARN_SURROGATE);
-           }
-           if (flags & UTF8_DISALLOW_SURROGATE) {
-               goto disallowed;
-           }
-       }
-       else if ((uv > PERL_UNICODE_MAX)) {
-           if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER
-                && ckWARN_d(WARN_NON_UNICODE))
-           {
-               sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv));
-               pack_warn = packWARN(WARN_NON_UNICODE);
-           }
-#ifndef EBCDIC /* EBCDIC always allows FE, FF */
-
-            /* The first byte being 0xFE or 0xFF is a subset of the SUPER code
-             * points.  We test for these after the regular SUPER ones, and
-             * before possibly bailing out, so that the more dire warning
-             * overrides the regular one, if applicable */
-            if ((*s0 & 0xFE) == 0xFE   /* matches both FE, FF */
-                && (flags & (UTF8_WARN_FE_FF|UTF8_DISALLOW_FE_FF)))
-            {
-                if ((flags & (UTF8_WARN_FE_FF|UTF8_CHECK_ONLY))
-                                                            == UTF8_WARN_FE_FF
-                    && ckWARN_d(WARN_UTF8))
-                {
-                    sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%"UVXf" is not Unicode, and not portable", uv));
-                    pack_warn = packWARN(WARN_UTF8);
+                    if (   ! (flags & UTF8_CHECK_ONLY)
+                        && ckWARN_d(WARN_SURROGATE))
+                    {
+                        pack_warn = packWARN(WARN_SURROGATE);
+
+                        /* These are the only errors that can occur with a
+                        * surrogate when the 'uv' isn't valid */
+                        if (orig_problems & UTF8_GOT_TOO_SHORT) {
+                            message = Perl_form(aTHX_
+                                    "UTF-16 surrogate (any UTF-8 sequence that"
+                                    " starts with \"%s\" is for a surrogate)",
+                                    _byte_dump_string(s0, curlen));
+                        }
+                        else {
+                            message = Perl_form(aTHX_
+                                            "UTF-16 surrogate U+%04"UVXf"", uv);
+                        }
+                    }
                  }
                  }
-                if (flags & UTF8_DISALLOW_FE_FF) {
-                    goto disallowed;
+
+                if (flags & UTF8_DISALLOW_SURROGATE) {
+                    disallowed = TRUE;
+                    *errors |= UTF8_GOT_SURROGATE;
                  }
              }
                  }
              }
-#endif
-           if (flags & UTF8_DISALLOW_SUPER) {
-               goto disallowed;
-           }
-       }
-       else if (UNICODE_IS_NONCHAR(uv)) {
-           if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR
-               && ckWARN_d(WARN_NONCHAR))
-           {
-               sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv));
-               pack_warn = packWARN(WARN_NONCHAR);
-           }
-           if (flags & UTF8_DISALLOW_NONCHAR) {
-               goto disallowed;
-           }
-       }
+            else if (possible_problems & UTF8_GOT_SUPER) {
+                possible_problems &= ~UTF8_GOT_SUPER;
  
  
-       if (sv) {
-            outlier_ret = uv;   /* Note we don't bother to convert to native,
-                                   as all the outlier code points are the same
-                                   in both ASCII and EBCDIC */
-           goto do_warn;
-       }
+                if (flags & UTF8_WARN_SUPER) {
+                    *errors |= UTF8_GOT_SUPER;
  
  
-       /* Here, this is not considered a malformed character, so drop through
-        * to return it */
-    }
+                    if (   ! (flags & UTF8_CHECK_ONLY)
+                        && ckWARN_d(WARN_NON_UNICODE))
+                    {
+                        pack_warn = packWARN(WARN_NON_UNICODE);
+
+                        if (orig_problems & UTF8_GOT_TOO_SHORT) {
+                            message = Perl_form(aTHX_
+                                    "Any UTF-8 sequence that starts with"
+                                    " \"%s\" is for a non-Unicode code point,"
+                                    " may not be portable",
+                                    _byte_dump_string(s0, curlen));
+                        }
+                        else {
+                            message = Perl_form(aTHX_
+                                                "Code point 0x%04"UVXf" is not"
+                                                " Unicode, may not be portable",
+                                                uv);
+                        }
+                    }
+                }
  
  
-    return UNI_TO_NATIVE(uv);
+                /* The maximum code point ever specified by a standard was
+                 * 2**31 - 1.  Anything larger than that is a Perl extension
+                 * that very well may not be understood by other applications
+                 * (including earlier perl versions on EBCDIC platforms).  We
+                 * test for these after the regular SUPER ones, and before
+                 * possibly bailing out, so that the slightly more dire warning
+                 * will override the regular one. */
+                if (   (flags & (UTF8_WARN_ABOVE_31_BIT
+                                |UTF8_WARN_SUPER
+                                |UTF8_DISALLOW_ABOVE_31_BIT))
+                    && (   (   UNLIKELY(orig_problems & UTF8_GOT_TOO_SHORT)
+                            && UNLIKELY(is_utf8_cp_above_31_bits(
+                                                                adjusted_s0,
+                                                                adjusted_send)))
+                        || (   LIKELY(! (orig_problems & UTF8_GOT_TOO_SHORT))
+                            && UNLIKELY(UNICODE_IS_ABOVE_31_BIT(uv)))))
+                {
+                    if (  ! (flags & UTF8_CHECK_ONLY)
+                        &&  (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_WARN_SUPER))
+                        &&  ckWARN_d(WARN_UTF8))
+                    {
+                        pack_warn = packWARN(WARN_UTF8);
+
+                        if (orig_problems & UTF8_GOT_TOO_SHORT) {
+                            message = Perl_form(aTHX_
+                                        "Any UTF-8 sequence that starts with"
+                                        " \"%s\" is for a non-Unicode code"
+                                        " point, and is not portable",
+                                        _byte_dump_string(s0, curlen));
+                        }
+                        else {
+                            message = Perl_form(aTHX_
+                                        "Code point 0x%"UVXf" is not Unicode,"
+                                        " and not portable",
+                                         uv);
+                        }
+                    }
  
  
-    /* There are three cases which get to beyond this point.  In all 3 cases:
-     * <sv>        if not null points to a string to print as a warning.
-     * <curlen>            is what <*retlen> should be set to if UTF8_CHECK_ONLY isn't
-     *             set.
-     * <outlier_ret> is what return value to use if UTF8_CHECK_ONLY isn't set.
-     *             This is done by initializing it to 0, and changing it only
-     *             for case 1).
-     * The 3 cases are:
-     * 1)   The input is valid but problematic, and to be warned about.  The
-     *     return value is the resultant code point; <*retlen> is set to
-     *     <curlen>, the number of bytes that comprise the code point.
-     *     <pack_warn> contains the result of packWARN() for the warning
-     *     types.  The entry point for this case is the label <do_warn>;
-     * 2)   The input is a valid code point but disallowed by the parameters to
-     *     this function.  The return value is 0.  If UTF8_CHECK_ONLY is set,
-     *     <*relen> is -1; otherwise it is <curlen>, the number of bytes that
-     *     comprise the code point.  <pack_warn> contains the result of
-     *     packWARN() for the warning types.  The entry point for this case is
-     *     the label <disallowed>.
-     * 3)   The input is malformed.  The return value is 0.  If UTF8_CHECK_ONLY
-     *     is set, <*relen> is -1; otherwise it is <curlen>, the number of
-     *     bytes that comprise the malformation.  All such malformations are
-     *     assumed to be warning type <utf8>.  The entry point for this case
-     *     is the label <malformed>.
-     */
+                    if (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_DISALLOW_ABOVE_31_BIT)) {
+                        *errors |= UTF8_GOT_ABOVE_31_BIT;
  
  
-  malformed:
+                        if (flags & UTF8_DISALLOW_ABOVE_31_BIT) {
+                            disallowed = TRUE;
+                        }
+                    }
+                }
  
  
-    if (sv && ckWARN_d(WARN_UTF8)) {
-       pack_warn = packWARN(WARN_UTF8);
-    }
+                if (flags & UTF8_DISALLOW_SUPER) {
+                    *errors |= UTF8_GOT_SUPER;
+                    disallowed = TRUE;
+                }
  
  
-  disallowed:
+                /* The deprecated warning overrides any non-deprecated one.  If
+                 * there are other problems, a deprecation message is not
+                 * really helpful, so don't bother to raise it in that case.
+                 * This also keeps the code from having to handle the case
+                 * where 'uv' is not valid. */
+                if (   ! (orig_problems
+                                    & (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
+                    && UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
+                    && ckWARN_d(WARN_DEPRECATED))
+                {
+                    message = Perl_form(aTHX_ cp_above_legal_max,
+                                              uv, MAX_NON_DEPRECATED_CP);
+                    pack_warn = packWARN(WARN_DEPRECATED);
+                }
+            }
+            else if (possible_problems & UTF8_GOT_NONCHAR) {
+                possible_problems &= ~UTF8_GOT_NONCHAR;
  
  
-    if (flags & UTF8_CHECK_ONLY) {
-       if (retlen)
-           *retlen = ((STRLEN) -1);
-       return 0;
-    }
+                if (flags & UTF8_WARN_NONCHAR) {
+                    *errors |= UTF8_GOT_NONCHAR;
  
  
-  do_warn:
+                    if (  ! (flags & UTF8_CHECK_ONLY)
+                        && ckWARN_d(WARN_NONCHAR))
+                    {
+                        /* The code above should have guaranteed that we don't
+                         * get here with errors other than overlong */
+                        assert (! (orig_problems
+                                        & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR)));
+
+                        pack_warn = packWARN(WARN_NONCHAR);
+                        message = Perl_form(aTHX_ "Unicode non-character"
+                                                " U+%04"UVXf" is not recommended"
+                                                " for open interchange", uv);
+                    }
+                }
  
  
-    if (pack_warn) {   /* <pack_warn> was initialized to 0, and changed only
-                          if warnings are to be raised. */
-       const char * const string = SvPVX_const(sv);
+                if (flags & UTF8_DISALLOW_NONCHAR) {
+                    disallowed = TRUE;
+                    *errors |= UTF8_GOT_NONCHAR;
+                }
+            } /* End of looking through the possible flags */
+
+            /* Display the message (if any) for the problem being handled in
+             * this iteration of the loop */
+            if (message) {
+                if (PL_op)
+                    Perl_warner(aTHX_ pack_warn, "%s in %s", message,
+                                                 OP_DESC(PL_op));
+                else
+                    Perl_warner(aTHX_ pack_warn, "%s", message);
+            }
+        }   /* End of 'while (possible_problems) {' */
  
  
-       if (PL_op)
-           Perl_warner(aTHX_ pack_warn, "%s in %s", string,  OP_DESC(PL_op));
-       else
-           Perl_warner(aTHX_ pack_warn, "%s", string);
-    }
+        /* Since there was a possible problem, the returned length may need to
+         * be changed from the one stored at the beginning of this function.
+         * Instead of trying to figure out if that's needed, just do it. */
+        if (retlen) {
+            *retlen = curlen;
+        }
  
  
-    if (retlen) {
-       *retlen = curlen;
+        if (disallowed) {
+            if (flags & UTF8_CHECK_ONLY && retlen) {
+                *retlen = ((STRLEN) -1);
+            }
+            return 0;
+        }
      }
  
      }
  
-    return outlier_ret;
+    return UNI_TO_NATIVE(uv);
  }
  
  /*
  }
  
  /*
@@ -831,14 +1628,20 @@ C<*retlen> will be set to the length, in bytes, of that character.
  
  If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
  enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
  
  If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
  enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
-NULL) to -1.  If those warnings are off, the computed value, if well-defined
+C<NULL>) to -1.  If those warnings are off, the computed value, if well-defined
  (or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
  (or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
-C<*retlen> is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is
+C<*retlen> is set (if C<retlen> isn't C<NULL>) so that (S<C<s> + C<*retlen>>) is
  the next possible position in C<s> that could begin a non-malformed character.
  See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
  returned.
  
  the next possible position in C<s> that could begin a non-malformed character.
  See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
  returned.
  
+Code points above the platform's C<IV_MAX> will raise a deprecation warning,
+unless those are turned off.
+
  =cut
  =cut
+
+Also implemented as a macro in utf8.h
+
  */
  
  
  */
  
  
@@ -851,49 +1654,8 @@ Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
                           ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
  }
  
                           ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
  }
  
-/* Like L</utf8_to_uvchr_buf>(), but should only be called when it is known that
- * there are no malformations in the input UTF-8 string C<s>.  surrogates,
- * non-character code points, and non-Unicode code points are allowed. */
-
-UV
-Perl_valid_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
-{
-    UV expectlen = UTF8SKIP(s);
-    const U8* send = s + expectlen;
-    UV uv = *s;
-
-    PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
-    PERL_UNUSED_CONTEXT;
-
-    if (retlen) {
-        *retlen = expectlen;
-    }
-
-    /* An invariant is trivially returned */
-    if (expectlen == 1) {
-       return uv;
-    }
-
-#ifdef EBCDIC
-    uv = NATIVE_UTF8_TO_I8(uv);
-#endif
-
-    /* Remove the leading bits that indicate the number of bytes, leaving just
-     * the bits that are part of the value */
-    uv &= UTF_START_MASK(expectlen);
-
-    /* Now, loop through the remaining bytes, accumulating each into the
-     * working total as we go.  (I khw tried unrolling the loop for up to 4
-     * bytes, but there was no performance improvement) */
-    for (++s; s < send; s++) {
-        uv = UTF8_ACCUMULATE(uv, *s);
-    }
-
-    return UNI_TO_NATIVE(uv);
-
-}
-
-/*
+/* This is marked as deprecated
+ *
  =for apidoc utf8_to_uvuni_buf
  
  Only in very rare circumstances should code need to be dealing in Unicode
  =for apidoc utf8_to_uvuni_buf
  
  Only in very rare circumstances should code need to be dealing in Unicode
@@ -913,6 +1675,9 @@ is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
  next possible position in C<s> that could begin a non-malformed character.
  See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
  
  next possible position in C<s> that could begin a non-malformed character.
  See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
  
+Code points above the platform's C<IV_MAX> will raise a deprecation warning,
+unless those are turned off.
+
  =cut
  */
  
  =cut
  */
  
@@ -923,9 +1688,8 @@ Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
  
      assert(send > s);
  
  
      assert(send > s);
  
-    /* Call the low level routine asking for checks */
-    return NATIVE_TO_UNI(Perl_utf8n_to_uvchr(aTHX_ s, send -s, retlen,
-                              ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY));
+    /* Call the low level routine, asking for checks */
+    return NATIVE_TO_UNI(utf8_to_uvchr_buf(s, send, retlen));
  }
  
  /*
  }
  
  /*
@@ -970,62 +1734,6 @@ Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
  }
  
  /*
  }
  
  /*
-=for apidoc utf8_distance
-
-Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
-and C<b>.
-
-WARNING: use only if you *know* that the pointers point inside the
-same UTF-8 buffer.
-
-=cut
-*/
-
-IV
-Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
-{
-    PERL_ARGS_ASSERT_UTF8_DISTANCE;
-
-    return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
-}
-
-/*
-=for apidoc utf8_hop
-
-Return the UTF-8 pointer C<s> displaced by C<off> characters, either
-forward or backward.
-
-WARNING: do not use the following unless you *know* C<off> is within
-the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
-on the first byte of character or just after the last byte of a character.
-
-=cut
-*/
-
-U8 *
-Perl_utf8_hop(const U8 *s, I32 off)
-{
-    PERL_ARGS_ASSERT_UTF8_HOP;
-
-    /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
-     * the bitops (especially ~) can create illegal UTF-8.
-     * In other words: in Perl UTF-8 is not just for Unicode. */
-
-    if (off >= 0) {
-       while (off--)
-           s += UTF8SKIP(s);
-    }
-    else {
-       while (off++) {
-           s--;
-           while (UTF8_IS_CONTINUATION(*s))
-               s--;
-       }
-    }
-    return (U8 *)s;
-}
-
-/*
  =for apidoc bytes_cmp_utf8
  
  Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
  =for apidoc bytes_cmp_utf8
  
  Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
@@ -1057,16 +1765,14 @@ Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
                 if (u < uend) {
                     U8 c1 = *u++;
                     if (UTF8_IS_CONTINUATION(c1)) {
                 if (u < uend) {
                     U8 c1 = *u++;
                     if (UTF8_IS_CONTINUATION(c1)) {
-                       c = TWO_BYTE_UTF8_TO_NATIVE(c, c1);
+                       c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
                     } else {
                     } else {
+                        /* diag_listed_as: Malformed UTF-8 character%s */
                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
-                                        "Malformed UTF-8 character "
-                                        "(unexpected non-continuation byte 0x%02x"
-                                        ", immediately after start byte 0x%02x)"
-                                        /* Dear diag.t, it's in the pod.  */
-                                        "%s%s", c1, c,
-                                        PL_op ? " in " : "",
-                                        PL_op ? OP_DESC(PL_op) : "");
+                                    "%s %s%s",
+                                    unexpected_non_continuation_text(u - 1, 2, 1, 2),
+                                    PL_op ? " in " : "",
+                                    PL_op ? OP_DESC(PL_op) : "");
                         return -2;
                     }
                 } else {
                         return -2;
                     }
                 } else {
@@ -1133,7 +1839,7 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
         U8 c = *s++;
         if (! UTF8_IS_INVARIANT(c)) {
             /* Then it is two-byte encoded */
         U8 c = *s++;
         if (! UTF8_IS_INVARIANT(c)) {
             /* Then it is two-byte encoded */
-           c = TWO_BYTE_UTF8_TO_NATIVE(c, *s);
+           c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
              s++;
         }
         *d++ = c;
              s++;
         }
         *d++ = c;
@@ -1152,7 +1858,7 @@ the newly-created string, and updates C<len> to contain the new
  length.  Returns the original string if no conversion occurs, C<len>
  is unchanged.  Do nothing if C<is_utf8> points to 0.  Sets C<is_utf8> to
  0 if C<s> is converted or consisted entirely of characters that are invariant
  length.  Returns the original string if no conversion occurs, C<len>
  is unchanged.  Do nothing if C<is_utf8> points to 0.  Sets C<is_utf8> to
  0 if C<s> is converted or consisted entirely of characters that are invariant
-in utf8 (i.e., US-ASCII on non-EBCDIC machines).
+in UTF-8 (i.e., US-ASCII on non-EBCDIC machines).
  
  =cut
  */
  
  =cut
  */
@@ -1190,7 +1896,7 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
         U8 c = *s++;
         if (! UTF8_IS_INVARIANT(c)) {
             /* Then it is two-byte encoded */
         U8 c = *s++;
         if (! UTF8_IS_INVARIANT(c)) {
             /* Then it is two-byte encoded */
-           c = TWO_BYTE_UTF8_TO_NATIVE(c, *s);
+           c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
              s++;
         }
         *d++ = c;
              s++;
         }
         *d++ = c;
@@ -1264,7 +1970,7 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
      while (p < pend) {
         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
         p += 2;
      while (p < pend) {
         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
         p += 2;
-       if (UNI_IS_INVARIANT(uv)) {
+       if (OFFUNI_IS_INVARIANT(uv)) {
             *d++ = LATIN1_TO_NATIVE((U8) uv);
             continue;
         }
             *d++ = LATIN1_TO_NATIVE((U8) uv);
             continue;
         }
@@ -1384,7 +2090,7 @@ UV
  Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s)
  {
      /* We have the latin1-range values compiled into the core, so just use
  Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s)
  {
      /* We have the latin1-range values compiled into the core, so just use
-     * those, converting the result to utf8.  The only difference between upper
+     * those, converting the result to UTF-8.  The only difference between upper
       * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
       * either "SS" or "Ss".  Which one to use is passed into the routine in
       * 'S_or_s' to avoid a test */
       * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
       * either "SS" or "Ss".  Which one to use is passed into the routine in
       * 'S_or_s' to avoid a test */
@@ -1413,11 +2119,15 @@ Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_
             case MICRO_SIGN:
                 converted = GREEK_CAPITAL_LETTER_MU;
                 break;
             case MICRO_SIGN:
                 converted = GREEK_CAPITAL_LETTER_MU;
                 break;
+#if    UNICODE_MAJOR_VERSION > 2                                        \
+   || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1           \
+                                  && UNICODE_DOT_DOT_VERSION >= 8)
             case LATIN_SMALL_LETTER_SHARP_S:
                 *(p)++ = 'S';
                 *p = S_or_s;
                 *lenp = 2;
                 return 'S';
             case LATIN_SMALL_LETTER_SHARP_S:
                 *(p)++ = 'S';
                 *p = S_or_s;
                 *lenp = 2;
                 return 'S';
+#endif
             default:
                 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
                 NOT_REACHED; /* NOTREACHED */
             default:
                 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
                 NOT_REACHED; /* NOTREACHED */
@@ -1439,14 +2149,14 @@ Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_
   * LENP will be set to the length in bytes of the string of changed characters
   *
   * The functions return the ordinal of the first character in the string of OUTP */
   * LENP will be set to the length in bytes of the string of changed characters
   *
   * The functions return the ordinal of the first character in the string of OUTP */
-#define CALL_UPPER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_toupper, "ToUc", "")
-#define CALL_TITLE_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_totitle, "ToTc", "")
-#define CALL_LOWER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tolower, "ToLc", "")
+#define CALL_UPPER_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_toupper, "ToUc", "")
+#define CALL_TITLE_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_totitle, "ToTc", "")
+#define CALL_LOWER_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_tolower, "ToLc", "")
  
  
-/* This additionally has the input parameter SPECIALS, which if non-zero will
- * cause this to use the SPECIALS hash for folding (meaning get full case
+/* This additionally has the input parameter 'specials', which if non-zero will
+ * cause this to use the specials hash for folding (meaning get full case
   * folding); otherwise, when zero, this implies a simple case fold */
   * folding); otherwise, when zero, this implies a simple case fold */
-#define CALL_FOLD_CASE(INP, OUTP, LENP, SPECIALS) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tofold, "ToCf", (SPECIALS) ? "" : NULL)
+#define CALL_FOLD_CASE(uv, s, d, lenp, specials) _to_utf8_case(uv, s, d, lenp, &PL_utf8_tofold, "ToCf", (specials) ? "" : NULL)
  
  UV
  Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
  
  UV
  Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
@@ -1466,7 +2176,7 @@ Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
      }
  
      uvchr_to_utf8(p, c);
      }
  
      uvchr_to_utf8(p, c);
-    return CALL_UPPER_CASE(p, p, lenp);
+    return CALL_UPPER_CASE(c, p, p, lenp);
  }
  
  UV
  }
  
  UV
@@ -1479,14 +2189,14 @@ Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
      }
  
      uvchr_to_utf8(p, c);
      }
  
      uvchr_to_utf8(p, c);
-    return CALL_TITLE_CASE(p, p, lenp);
+    return CALL_TITLE_CASE(c, p, p, lenp);
  }
  
  STATIC U8
  S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp)
  {
      /* We have the latin1-range values compiled into the core, so just use
  }
  
  STATIC U8
  S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp)
  {
      /* We have the latin1-range values compiled into the core, so just use
-     * those, converting the result to utf8.  Since the result is always just
+     * those, converting the result to UTF-8.  Since the result is always just
       * one character, we allow <p> to be NULL */
  
      U8 converted = toLOWER_LATIN1(c);
       * one character, we allow <p> to be NULL */
  
      U8 converted = toLOWER_LATIN1(c);
@@ -1517,7 +2227,7 @@ Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
      }
  
      uvchr_to_utf8(p, c);
      }
  
      uvchr_to_utf8(p, c);
-    return CALL_LOWER_CASE(p, p, lenp);
+    return CALL_LOWER_CASE(c, p, p, lenp);
  }
  
  UV
  }
  
  UV
@@ -1537,11 +2247,15 @@ Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int f
  
      assert (! (flags & FOLD_FLAGS_LOCALE));
  
  
      assert (! (flags & FOLD_FLAGS_LOCALE));
  
-    if (c == MICRO_SIGN) {
+    if (UNLIKELY(c == MICRO_SIGN)) {
         converted = GREEK_SMALL_LETTER_MU;
      }
         converted = GREEK_SMALL_LETTER_MU;
      }
-    else if ((flags & FOLD_FLAGS_FULL) && c == LATIN_SMALL_LETTER_SHARP_S) {
-
+#if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
+   || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
+                                      || UNICODE_DOT_DOT_VERSION > 0)
+    else if (   (flags & FOLD_FLAGS_FULL)
+             && UNLIKELY(c == LATIN_SMALL_LETTER_SHARP_S))
+    {
          /* If can't cross 127/128 boundary, can't return "ss"; instead return
           * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
           * under those circumstances. */
          /* If can't cross 127/128 boundary, can't return "ss"; instead return
           * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
           * under those circumstances. */
@@ -1558,6 +2272,7 @@ Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int f
              return 's';
          }
      }
              return 's';
          }
      }
+#endif
      else { /* In this range the fold of all other characters is their lower
                case */
         converted = toLOWER_LATIN1(c);
      else { /* In this range the fold of all other characters is their lower
                case */
         converted = toLOWER_LATIN1(c);
@@ -1609,7 +2324,7 @@ Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
      /* Here, above 255.  If no special needs, just use the macro */
      if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
         uvchr_to_utf8(p, c);
      /* Here, above 255.  If no special needs, just use the macro */
      if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
         uvchr_to_utf8(p, c);
-       return CALL_FOLD_CASE(p, p, lenp, flags & FOLD_FLAGS_FULL);
+       return CALL_FOLD_CASE(c, p, p, lenp, flags & FOLD_FLAGS_FULL);
      }
      else {  /* Otherwise, _to_utf8_fold_flags has the intelligence to deal with
                the special flags. */
      }
      else {  /* Otherwise, _to_utf8_fold_flags has the intelligence to deal with
                the special flags. */
@@ -1746,6 +2461,11 @@ Perl__is_utf8_mark(pTHX_ const U8 *p)
  /*
  =for apidoc to_utf8_case
  
  /*
  =for apidoc to_utf8_case
  
+Instead use the appropriate one of L</toUPPER_utf8>,
+L</toTITLE_utf8>,
+L</toLOWER_utf8>,
+or L</toFOLD_utf8>.
+
  C<p> contains the pointer to the UTF-8 string encoding
  the character that is being converted.  This routine assumes that the character
  at C<p> is well-formed.
  C<p> contains the pointer to the UTF-8 string encoding
  the character that is being converted.  This routine assumes that the character
  at C<p> is well-formed.
@@ -1757,7 +2477,7 @@ of the result.
  C<swashp> is a pointer to the swash to use.
  
  Both the special and normal mappings are stored in F<lib/unicore/To/Foo.pl>,
  C<swashp> is a pointer to the swash to use.
  
  Both the special and normal mappings are stored in F<lib/unicore/To/Foo.pl>,
-and loaded by SWASHNEW, using F<lib/utf8_heavy.pl>.  C<special> (usually,
+and loaded by C<SWASHNEW>, using F<lib/utf8_heavy.pl>.  C<special> (usually,
  but not always, a multicharacter mapping), is tried first.
  
  C<special> is a string, normally C<NULL> or C<"">.  C<NULL> means to not use
  but not always, a multicharacter mapping), is tried first.
  
  C<special> is a string, normally C<NULL> or C<"">.  C<NULL> means to not use
@@ -1765,8 +2485,11 @@ any special mappings; C<""> means to use the special mappings.  Values other
  than these two are treated as the name of the hash containing the special
  mappings, like C<"utf8::ToSpecLower">.
  
  than these two are treated as the name of the hash containing the special
  mappings, like C<"utf8::ToSpecLower">.
  
-C<normal> is a string like "ToLower" which means the swash
-%utf8::ToLower.
+C<normal> is a string like C<"ToLower"> which means the swash
+C<%utf8::ToLower>.
+
+Code points above the platform's C<IV_MAX> will raise a deprecation warning,
+unless those are turned off.
  
  =cut */
  
  
  =cut */
  
@@ -1774,31 +2497,114 @@ UV
  Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
                         SV **swashp, const char *normal, const char *special)
  {
  Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
                         SV **swashp, const char *normal, const char *special)
  {
+    PERL_ARGS_ASSERT_TO_UTF8_CASE;
+
+    return _to_utf8_case(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp, swashp, normal, special);
+}
+
+    /* change namve uv1 to 'from' */
+STATIC UV
+S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp,
+               SV **swashp, const char *normal, const char *special)
+{
      STRLEN len = 0;
      STRLEN len = 0;
-    const UV uv1 = valid_utf8_to_uvchr(p, NULL);
  
  
-    PERL_ARGS_ASSERT_TO_UTF8_CASE;
+    PERL_ARGS_ASSERT__TO_UTF8_CASE;
+
+    /* For code points that don't change case, we already know that the output
+     * of this function is the unchanged input, so we can skip doing look-ups
+     * for them.  Unfortunately the case-changing code points are scattered
+     * around.  But there are some long consecutive ranges where there are no
+     * case changing code points.  By adding tests, we can eliminate the lookup
+     * for all the ones in such ranges.  This is currently done here only for
+     * just a few cases where the scripts are in common use in modern commerce
+     * (and scripts adjacent to those which can be included without additional
+     * tests). */
+
+    if (uv1 >= 0x0590) {
+        /* This keeps from needing further processing the code points most
+         * likely to be used in the following non-cased scripts: Hebrew,
+         * Arabic, Syriac, Thaana, NKo, Samaritan, Mandaic, Devanagari,
+         * Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada,
+         * Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar */
+        if (uv1 < 0x10A0) {
+            goto cases_to_self;
+        }
  
  
-    /* Note that swash_fetch() doesn't output warnings for these because it
-     * assumes we will */
-    if (uv1 >= UNICODE_SURROGATE_FIRST) {
-       if (uv1 <= UNICODE_SURROGATE_LAST) {
-           if (ckWARN_d(WARN_SURROGATE)) {
-               const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
-               Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
-                   "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1);
-           }
-       }
-       else if (UNICODE_IS_SUPER(uv1)) {
-           if (ckWARN_d(WARN_NON_UNICODE)) {
-               const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
-               Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
-                   "Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1);
-           }
-       }
+        /* The following largish code point ranges also don't have case
+         * changes, but khw didn't think they warranted extra tests to speed
+         * them up (which would slightly slow down everything else above them):
+         * 1100..139F   Hangul Jamo, Ethiopic
+         * 1400..1CFF   Unified Canadian Aboriginal Syllabics, Ogham, Runic,
+         *              Tagalog, Hanunoo, Buhid, Tagbanwa, Khmer, Mongolian,
+         *              Limbu, Tai Le, New Tai Lue, Buginese, Tai Tham,
+         *              Combining Diacritical Marks Extended, Balinese,
+         *              Sundanese, Batak, Lepcha, Ol Chiki
+         * 2000..206F   General Punctuation
+         */
+
+        if (uv1 >= 0x2D30) {
+
+            /* This keeps the from needing further processing the code points
+             * most likely to be used in the following non-cased major scripts:
+             * CJK, Katakana, Hiragana, plus some less-likely scripts.
+             *
+             * (0x2D30 above might have to be changed to 2F00 in the unlikely
+             * event that Unicode eventually allocates the unused block as of
+             * v8.0 2FE0..2FEF to code points that are cased.  khw has verified
+             * that the test suite will start having failures to alert you
+             * should that happen) */
+            if (uv1 < 0xA640) {
+                goto cases_to_self;
+            }
+
+            if (uv1 >= 0xAC00) {
+                if (UNLIKELY(UNICODE_IS_SURROGATE(uv1))) {
+                    if (ckWARN_d(WARN_SURROGATE)) {
+                        const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
+                        Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
+                            "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1);
+                    }
+                    goto cases_to_self;
+                }
+
+                /* AC00..FAFF Catches Hangul syllables and private use, plus
+                 * some others */
+                if (uv1 < 0xFB00) {
+                    goto cases_to_self;
+
+                }
+
+                if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
+                    if (   UNLIKELY(uv1 > MAX_NON_DEPRECATED_CP)
+                        && ckWARN_d(WARN_DEPRECATED))
+                    {
+                        Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
+                                cp_above_legal_max, uv1, MAX_NON_DEPRECATED_CP);
+                    }
+                    if (ckWARN_d(WARN_NON_UNICODE)) {
+                        const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
+                        Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
+                            "Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1);
+                    }
+                    goto cases_to_self;
+                }
+#ifdef HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C
+                if (UNLIKELY(uv1
+                    > HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C))
+                {
+
+                    /* As of this writing, this means we avoid swash creation
+                     * for anything beyond low Plane 1 */
+                    goto cases_to_self;
+                }
+#endif
+            }
+        }
  
         /* Note that non-characters are perfectly legal, so no warning should
  
         /* Note that non-characters are perfectly legal, so no warning should
-        * be given */
+         * be given.  There are so few of them, that it isn't worth the extra
+         * tests to avoid swash creation */
      }
  
      if (!*swashp) /* load on-demand */
      }
  
      if (!*swashp) /* load on-demand */
@@ -1823,7 +2629,7 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
          }
  
          if (hv
          }
  
          if (hv
-             && (svp = hv_fetch(hv, (const char*)p, UNISKIP(uv1), FALSE))
+             && (svp = hv_fetch(hv, (const char*)p, UVCHR_SKIP(uv1), FALSE))
               && (*svp))
           {
              const char *s;
               && (*svp))
           {
              const char *s;
@@ -1839,7 +2645,7 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
      }
  
      if (!len && *swashp) {
      }
  
      if (!len && *swashp) {
-       const UV uv2 = swash_fetch(*swashp, p, TRUE /* => is utf8 */);
+       const UV uv2 = swash_fetch(*swashp, p, TRUE /* => is UTF-8 */);
  
          if (uv2) {
               /* It was "normal" (a single character mapping). */
  
          if (uv2) {
               /* It was "normal" (a single character mapping). */
@@ -1856,6 +2662,7 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
  
      /* Here, there was no mapping defined, which means that the code point maps
       * to itself.  Return the inputs */
  
      /* Here, there was no mapping defined, which means that the code point maps
       * to itself.  Return the inputs */
+  cases_to_self:
      len = UTF8SKIP(p);
      if (p != ustrp) {   /* Don't copy onto itself */
          Copy(p, ustrp, len, U8);
      len = UTF8SKIP(p);
      if (p != ustrp) {   /* Don't copy onto itself */
          Copy(p, ustrp, len, U8);
@@ -1871,7 +2678,7 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
  STATIC UV
  S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
  {
  STATIC UV
  S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
  {
-    /* This is called when changing the case of a utf8-encoded character above
+    /* This is called when changing the case of a UTF-8-encoded character above
       * the Latin1 range, and the operation is in a non-UTF-8 locale.  If the
       * result contains a character that crosses the 255/256 boundary, disallow
       * the change, and return the original code point.  See L<perlfunc/lc> for
       * the Latin1 range, and the operation is in a non-UTF-8 locale.  If the
       * result contains a character that crosses the 255/256 boundary, disallow
       * the change, and return the original code point.  See L<perlfunc/lc> for
@@ -1963,16 +2770,16 @@ Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags
      }
      else if UTF8_IS_DOWNGRADEABLE_START(*p) {
         if (flags) {
      }
      else if UTF8_IS_DOWNGRADEABLE_START(*p) {
         if (flags) {
-            U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
+            U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
             result = toUPPER_LC(c);
         }
         else {
             result = toUPPER_LC(c);
         }
         else {
-           return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
+           return _to_upper_title_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
                                           ustrp, lenp, 'S');
         }
      }
                                           ustrp, lenp, 'S');
         }
      }
-    else {  /* utf8, ord above 255 */
-       result = CALL_UPPER_CASE(p, ustrp, lenp);
+    else {  /* UTF-8, ord above 255 */
+       result = CALL_UPPER_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp);
  
         if (flags) {
             result = check_locale_boundary_crossing(p, result, ustrp, lenp);
  
         if (flags) {
             result = check_locale_boundary_crossing(p, result, ustrp, lenp);
@@ -1980,7 +2787,7 @@ Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags
         return result;
      }
  
         return result;
      }
  
-    /* Here, used locale rules.  Convert back to utf8 */
+    /* Here, used locale rules.  Convert back to UTF-8 */
      if (UTF8_IS_INVARIANT(result)) {
         *ustrp = (U8) result;
         *lenp = 1;
      if (UTF8_IS_INVARIANT(result)) {
         *ustrp = (U8) result;
         *lenp = 1;
@@ -2034,16 +2841,16 @@ Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags
      }
      else if UTF8_IS_DOWNGRADEABLE_START(*p) {
         if (flags) {
      }
      else if UTF8_IS_DOWNGRADEABLE_START(*p) {
         if (flags) {
-            U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
+            U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
             result = toUPPER_LC(c);
         }
         else {
             result = toUPPER_LC(c);
         }
         else {
-           return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
+           return _to_upper_title_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
                                           ustrp, lenp, 's');
         }
      }
                                           ustrp, lenp, 's');
         }
      }
-    else {  /* utf8, ord above 255 */
-       result = CALL_TITLE_CASE(p, ustrp, lenp);
+    else {  /* UTF-8, ord above 255 */
+       result = CALL_TITLE_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp);
  
         if (flags) {
             result = check_locale_boundary_crossing(p, result, ustrp, lenp);
  
         if (flags) {
             result = check_locale_boundary_crossing(p, result, ustrp, lenp);
@@ -2051,7 +2858,7 @@ Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags
         return result;
      }
  
         return result;
      }
  
-    /* Here, used locale rules.  Convert back to utf8 */
+    /* Here, used locale rules.  Convert back to UTF-8 */
      if (UTF8_IS_INVARIANT(result)) {
         *ustrp = (U8) result;
         *lenp = 1;
      if (UTF8_IS_INVARIANT(result)) {
         *ustrp = (U8) result;
         *lenp = 1;
@@ -2104,16 +2911,16 @@ Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags
      }
      else if UTF8_IS_DOWNGRADEABLE_START(*p) {
         if (flags) {
      }
      else if UTF8_IS_DOWNGRADEABLE_START(*p) {
         if (flags) {
-            U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
+            U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
             result = toLOWER_LC(c);
         }
         else {
             result = toLOWER_LC(c);
         }
         else {
-           return to_lower_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
+           return to_lower_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
                                    ustrp, lenp);
         }
      }
                                    ustrp, lenp);
         }
      }
-    else {  /* utf8, ord above 255 */
-       result = CALL_LOWER_CASE(p, ustrp, lenp);
+    else {  /* UTF-8, ord above 255 */
+       result = CALL_LOWER_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp);
  
         if (flags) {
             result = check_locale_boundary_crossing(p, result, ustrp, lenp);
  
         if (flags) {
             result = check_locale_boundary_crossing(p, result, ustrp, lenp);
@@ -2122,7 +2929,7 @@ Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags
         return result;
      }
  
         return result;
      }
  
-    /* Here, used locale rules.  Convert back to utf8 */
+    /* Here, used locale rules.  Convert back to UTF-8 */
      if (UTF8_IS_INVARIANT(result)) {
         *ustrp = (U8) result;
         *lenp = 1;
      if (UTF8_IS_INVARIANT(result)) {
         *ustrp = (U8) result;
         *lenp = 1;
@@ -2186,25 +2993,27 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
      }
      else if UTF8_IS_DOWNGRADEABLE_START(*p) {
         if (flags & FOLD_FLAGS_LOCALE) {
      }
      else if UTF8_IS_DOWNGRADEABLE_START(*p) {
         if (flags & FOLD_FLAGS_LOCALE) {
-            U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
+            U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
             result = toFOLD_LC(c);
         }
         else {
             result = toFOLD_LC(c);
         }
         else {
-           return _to_fold_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
+           return _to_fold_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
                              ustrp, lenp,
                              flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
         }
      }
                              ustrp, lenp,
                              flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
         }
      }
-    else {  /* utf8, ord above 255 */
-       result = CALL_FOLD_CASE(p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
+    else {  /* UTF-8, ord above 255 */
+       result = CALL_FOLD_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
  
         if (flags & FOLD_FLAGS_LOCALE) {
  
  
         if (flags & FOLD_FLAGS_LOCALE) {
  
-#           define CAP_SHARP_S   LATIN_CAPITAL_LETTER_SHARP_S_UTF8
  #           define LONG_S_T      LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
  #           define LONG_S_T      LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
+            const unsigned int long_s_t_len    = sizeof(LONG_S_T) - 1;
+
+#         ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
+#           define CAP_SHARP_S   LATIN_CAPITAL_LETTER_SHARP_S_UTF8
  
              const unsigned int cap_sharp_s_len = sizeof(CAP_SHARP_S) - 1;
  
              const unsigned int cap_sharp_s_len = sizeof(CAP_SHARP_S) - 1;
-            const unsigned int long_s_t_len    = sizeof(LONG_S_T) - 1;
  
              /* Special case these two characters, as what normally gets
               * returned under locale doesn't work */
  
              /* Special case these two characters, as what normally gets
               * returned under locale doesn't work */
@@ -2217,7 +3026,9 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
                                "resolved to \"\\x{17F}\\x{17F}\".");
                  goto return_long_s;
              }
                                "resolved to \"\\x{17F}\\x{17F}\".");
                  goto return_long_s;
              }
-            else if (UTF8SKIP(p) == long_s_t_len
+            else
+#endif
+                 if (UTF8SKIP(p) == long_s_t_len
                       && memEQ((char *) p, LONG_S_T, long_s_t_len))
              {
                  /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
                       && memEQ((char *) p, LONG_S_T, long_s_t_len))
              {
                  /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
@@ -2226,13 +3037,35 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
                                "resolved to \"\\x{FB06}\".");
                  goto return_ligature_st;
              }
                                "resolved to \"\\x{FB06}\".");
                  goto return_ligature_st;
              }
+
+#if    UNICODE_MAJOR_VERSION   == 3         \
+    && UNICODE_DOT_VERSION     == 0         \
+    && UNICODE_DOT_DOT_VERSION == 1
+#           define DOTTED_I   LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8
+
+            /* And special case this on this Unicode version only, for the same
+             * reaons the other two are special cased.  They would cross the
+             * 255/256 boundary which is forbidden under /l, and so the code
+             * wouldn't catch that they are equivalent (which they are only in
+             * this release) */
+            else if (UTF8SKIP(p) == sizeof(DOTTED_I) - 1
+                     && memEQ((char *) p, DOTTED_I, sizeof(DOTTED_I) - 1))
+            {
+                /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
+                Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
+                              "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
+                              "resolved to \"\\x{0131}\".");
+                goto return_dotless_i;
+            }
+#endif
+
             return check_locale_boundary_crossing(p, result, ustrp, lenp);
         }
         else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
             return result;
         }
         else {
             return check_locale_boundary_crossing(p, result, ustrp, lenp);
         }
         else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
             return result;
         }
         else {
-           /* This is called when changing the case of a utf8-encoded
+           /* This is called when changing the case of a UTF-8-encoded
               * character above the ASCII range, and the result should not
               * contain an ASCII character. */
  
               * character above the ASCII range, and the result should not
               * contain an ASCII character. */
  
@@ -2249,14 +3082,24 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
  
                      /* But in these instances, there is an alternative we can
                       * return that is valid */
  
                      /* But in these instances, there is an alternative we can
                       * return that is valid */
-                    if (original == LATIN_CAPITAL_LETTER_SHARP_S
-                        || original == LATIN_SMALL_LETTER_SHARP_S)
-                    {
+                    if (original == LATIN_SMALL_LETTER_SHARP_S
+#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
+                        || original == LATIN_CAPITAL_LETTER_SHARP_S
+#endif
+                    ) {
                          goto return_long_s;
                      }
                      else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
                          goto return_ligature_st;
                      }
                          goto return_long_s;
                      }
                      else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
                          goto return_ligature_st;
                      }
+#if    UNICODE_MAJOR_VERSION   == 3         \
+    && UNICODE_DOT_VERSION     == 0         \
+    && UNICODE_DOT_DOT_VERSION == 1
+
+                    else if (original == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
+                        goto return_dotless_i;
+                    }
+#endif
                     Copy(p, ustrp, *lenp, char);
                     return original;
                 }
                     Copy(p, ustrp, *lenp, char);
                     return original;
                 }
@@ -2268,7 +3111,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
         }
      }
  
         }
      }
  
-    /* Here, used locale rules.  Convert back to utf8 */
+    /* Here, used locale rules.  Convert back to UTF-8 */
      if (UTF8_IS_INVARIANT(result)) {
         *ustrp = (U8) result;
         *lenp = 1;
      if (UTF8_IS_INVARIANT(result)) {
         *ustrp = (U8) result;
         *lenp = 1;
@@ -2300,6 +3143,18 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
      *lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1;
      Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
      return LATIN_SMALL_LIGATURE_ST;
      *lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1;
      Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
      return LATIN_SMALL_LIGATURE_ST;
+
+#if    UNICODE_MAJOR_VERSION   == 3         \
+    && UNICODE_DOT_VERSION     == 0         \
+    && UNICODE_DOT_DOT_VERSION == 1
+
+  return_dotless_i:
+    *lenp = sizeof(LATIN_SMALL_LETTER_DOTLESS_I_UTF8) - 1;
+    Copy(LATIN_SMALL_LETTER_DOTLESS_I_UTF8, ustrp, *lenp, U8);
+    return LATIN_SMALL_LETTER_DOTLESS_I;
+
+#endif
+
  }
  
  /* Note:
  }
  
  /* Note:
@@ -2407,6 +3262,7 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m
         PUSHSTACKi(PERLSI_MAGIC);
         ENTER;
         SAVEHINTS();
         PUSHSTACKi(PERLSI_MAGIC);
         ENTER;
         SAVEHINTS();
+       save_re_context();
         /* We might get here via a subroutine signature which uses a utf8
          * parameter name, at which point PL_subname will have been set
          * but not yet used. */
         /* We might get here via a subroutine signature which uses a utf8
          * parameter name, at which point PL_subname will have been set
          * but not yet used. */
@@ -2414,13 +3270,17 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m
         if (PL_parser && PL_parser->error_count)
             SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
         method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
         if (PL_parser && PL_parser->error_count)
             SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
         method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
-       if (!method) {  /* demand load utf8 */
+       if (!method) {  /* demand load UTF-8 */
             ENTER;
             if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
             GvSV(PL_errgv) = NULL;
  #ifndef NO_TAINT_SUPPORT
             /* It is assumed that callers of this routine are not passing in
              * any user derived data.  */
             ENTER;
             if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
             GvSV(PL_errgv) = NULL;
  #ifndef NO_TAINT_SUPPORT
             /* It is assumed that callers of this routine are not passing in
              * any user derived data.  */
+           /* Need to do this after save_re_context() as it will set
+            * PL_tainted to 1 while saving $1 etc (see the code after getrx:
+            * in Perl_magic_get).  Even line to create errsv_save can turn on
+            * PL_tainted.  */
             SAVEBOOL(TAINT_get);
             TAINT_NOT;
  #endif
             SAVEBOOL(TAINT_get);
             TAINT_NOT;
  #endif
@@ -2471,7 +3331,7 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m
             CopHINTS_set(PL_curcop, PL_hints);
         }
         if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
             CopHINTS_set(PL_curcop, PL_hints);
         }
         if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
-           if (SvPOK(retval))
+           if (SvPOK(retval)) {
  
                 /* If caller wants to handle missing properties, let them */
                 if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) {
  
                 /* If caller wants to handle missing properties, let them */
                 if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) {
@@ -2481,6 +3341,7 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m
                            "Can't find Unicode property definition \"%"SVf"\"",
                            SVfARG(retval));
                  NOT_REACHED; /* NOTREACHED */
                            "Can't find Unicode property definition \"%"SVf"\"",
                            SVfARG(retval));
                  NOT_REACHED; /* NOTREACHED */
+            }
         }
      } /* End of calling the module to find the swash */
  
         }
      } /* End of calling the module to find the swash */
  
@@ -2537,6 +3398,7 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m
                 /* Add the passed-in inversion list, which invalidates the one
                  * already stored in the swash */
                 invlist_in_swash_is_valid = FALSE;
                 /* Add the passed-in inversion list, which invalidates the one
                  * already stored in the swash */
                 invlist_in_swash_is_valid = FALSE;
+                SvREADONLY_off(swash_invlist);  /* Turned on again below */
                 _invlist_union(invlist, swash_invlist, &swash_invlist);
             }
             else {
                 _invlist_union(invlist, swash_invlist, &swash_invlist);
             }
             else {
@@ -2566,6 +3428,7 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m
             else SvREFCNT_inc_simple_void_NN(swash_invlist);
         }
  
             else SvREFCNT_inc_simple_void_NN(swash_invlist);
         }
  
+        /* The result is immutable.  Forbid attempts to change it. */
          SvREADONLY_on(swash_invlist);
  
          /* Use the inversion list stand-alone if small enough */
          SvREADONLY_on(swash_invlist);
  
          /* Use the inversion list stand-alone if small enough */
@@ -2587,13 +3450,13 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m
   * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
   * the lower-level routine, and it is similarly broken for returning
   * multiple values.  --jhi
   * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
   * the lower-level routine, and it is similarly broken for returning
   * multiple values.  --jhi
- * For those, you should use to_utf8_case() instead */
+ * For those, you should use S__to_utf8_case() instead */
  /* Now SWASHGET is recasted into S_swatch_get in this file. */
  
  /* Note:
   * Returns the value of property/mapping C<swash> for the first character
   * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
  /* Now SWASHGET is recasted into S_swatch_get in this file. */
  
  /* Note:
   * Returns the value of property/mapping C<swash> for the first character
   * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
- * assumed to be in well-formed utf8. If C<do_utf8> is false, the string C<ptr>
+ * assumed to be in well-formed UTF-8. If C<do_utf8> is false, the string C<ptr>
   * is assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
   *
   * A "swash" is a hash which contains initially the keys/values set up by
   * is assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
   *
   * A "swash" is a hash which contains initially the keys/values set up by
@@ -2618,7 +3481,7 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m
   *
   * Non-binary properties are stored in as many bits as necessary to represent
   * their values (32 currently, though the code is more general than that), not
   *
   * Non-binary properties are stored in as many bits as necessary to represent
   * their values (32 currently, though the code is more general than that), not
- * as single bits, but the principal is the same: the value for each key is a
+ * as single bits, but the principle is the same: the value for each key is a
   * vector that encompasses the property values for all code points whose UTF-8
   * representations are represented by the key.  That is, for all code points
   * whose UTF-8 representations are length N bytes, and the key is the first N-1
   * vector that encompasses the property values for all code points whose UTF-8
   * representations are represented by the key.  That is, for all code points
   * whose UTF-8 representations are length N bytes, and the key is the first N-1
@@ -2662,7 +3525,7 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
      else if (UTF8_IS_DOWNGRADEABLE_START(c)) {
          klen = 0;
         needents = 256;
      else if (UTF8_IS_DOWNGRADEABLE_START(c)) {
          klen = 0;
         needents = 256;
-        off = TWO_BYTE_UTF8_TO_NATIVE(c, *(ptr + 1));
+        off = EIGHT_BIT_UTF8_TO_NATIVE(c, *(ptr + 1));
      }
      else {
          klen = UTF8SKIP(ptr) - 1;
      }
      else {
          klen = UTF8SKIP(ptr) - 1;
@@ -2704,7 +3567,7 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
      }
  
      /*
      }
  
      /*
-     * This single-entry cache saves about 1/3 of the utf8 overhead in test
+     * This single-entry cache saves about 1/3 of the UTF-8 overhead in test
       * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
       * it's nothing to sniff at.)  Pity we usually come through at least
       * two function calls to get here...
       * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
       * it's nothing to sniff at.)  Pity we usually come through at least
       * two function calls to get here...
@@ -3221,10 +4084,10 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash)
      * 004C             006C
      * 212A             006B
      *
      * 004C             006C
      * 212A             006B
      *
-    * The returned hash would have two keys, the utf8 for 006B and the utf8 for
+    * The returned hash would have two keys, the UTF-8 for 006B and the UTF-8 for
      * 006C.  The value for each key is an array.  For 006C, the array would
      * 006C.  The value for each key is an array.  For 006C, the array would
-    * have two elements, the utf8 for itself, and for 004C.  For 006B, there
-    * would be three elements in its array, the utf8 for 006B, 004B and 212A.
+    * have two elements, the UTF-8 for itself, and for 004C.  For 006B, there
+    * would be three elements in its array, the UTF-8 for 006B, 004B and 212A.
      *
      * Note that there are no elements in the hash for 004B, 004C, 212A.  The
      * keys are only code points that are folded-to, so it isn't a full closure.
      *
      * Note that there are no elements in the hash for 004B, 004C, 212A.  The
      * keys are only code points that are folded-to, so it isn't a full closure.
@@ -3238,7 +4101,7 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash)
      *
      * The specials hash can be extra code points, and most likely consists of
      * maps from single code points to multiple ones (each expressed as a string
      *
      * The specials hash can be extra code points, and most likely consists of
      * maps from single code points to multiple ones (each expressed as a string
-    * of utf8 characters).   This function currently returns only 1-1 mappings.
+    * of UTF-8 characters).   This function currently returns only 1-1 mappings.
      * However consider this possible input in the specials hash:
      * "\xEF\xAC\x85" => "\x{0073}\x{0074}",         # U+FB05 => 0073 0074
      * "\xEF\xAC\x86" => "\x{0073}\x{0074}",         # U+FB06 => 0073 0074
      * However consider this possible input in the specials hash:
      * "\xEF\xAC\x85" => "\x{0073}\x{0074}",         # U+FB05 => 0073 0074
      * "\xEF\xAC\x86" => "\x{0073}\x{0074}",         # U+FB06 => 0073 0074
@@ -3247,7 +4110,24 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash)
      * currently handle.  But it also means that FB05 and FB06 are equivalent in
      * a 1-1 mapping which we should handle, and this relationship may not be in
      * the main table.  Therefore this function examines all the multi-char
      * currently handle.  But it also means that FB05 and FB06 are equivalent in
      * a 1-1 mapping which we should handle, and this relationship may not be in
      * the main table.  Therefore this function examines all the multi-char
-    * sequences and adds the 1-1 mappings that come out of that.  */
+    * sequences and adds the 1-1 mappings that come out of that.
+    *
+    * XXX This function was originally intended to be multipurpose, but its
+    * only use is quite likely to remain for constructing the inversion of
+    * the CaseFolding (//i) property.  If it were more general purpose for
+    * regex patterns, it would have to do the FB05/FB06 game for simple folds,
+    * because certain folds are prohibited under /iaa and /il.  As an example,
+    * in Unicode 3.0.1 both U+0130 and U+0131 fold to 'i', and hence are both
+    * equivalent under /i.  But under /iaa and /il, the folds to 'i' are
+    * prohibited, so we would not figure out that they fold to each other.
+    * Code could be written to automatically figure this out, similar to the
+    * code that does this for multi-character folds, but this is the only case
+    * where something like this is ever likely to happen, as all the single
+    * char folds to the 0-255 range are now quite settled.  Instead there is a
+    * little special code that is compiled only for this Unicode version.  This
+    * is smaller and didn't require much coding time to do.  But this makes
+    * this routine strongly tied to being used just for CaseFolding.  If ever
+    * it should be generalized, this would have to be fixed */
  
      U8 *l, *lend;
      STRLEN lcur;
  
      U8 *l, *lend;
      STRLEN lcur;
@@ -3292,8 +4172,8 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash)
  
         hv_iterinit(specials_hv);
  
  
         hv_iterinit(specials_hv);
  
-       /* The keys are the characters (in utf8) that map to the corresponding
-        * utf8 string value.  Iterate through the list creating the inverse
+       /* The keys are the characters (in UTF-8) that map to the corresponding
+        * UTF-8 string value.  Iterate through the list creating the inverse
          * list. */
         while ((sv_to = hv_iternextsv(specials_hv, &char_from, &from_len))) {
             SV** listp;
          * list. */
         while ((sv_to = hv_iternextsv(specials_hv, &char_from, &from_len))) {
             SV** listp;
@@ -3305,7 +4185,7 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash)
             /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %"UVXf", First char of to is %"UVXf"\n", valid_utf8_to_uvchr((U8*) char_from, 0), valid_utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/
  
             /* Each key in the inverse list is a mapped-to value, and the key's
             /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %"UVXf", First char of to is %"UVXf"\n", valid_utf8_to_uvchr((U8*) char_from, 0), valid_utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/
  
             /* Each key in the inverse list is a mapped-to value, and the key's
-            * hash value is a list of the strings (each in utf8) that map to
+            * hash value is a list of the strings (each in UTF-8) that map to
              * it.  Those strings are all one character long */
             if ((listp = hv_fetch(specials_inverse,
                                     SvPVX(sv_to),
              * it.  Those strings are all one character long */
             if ((listp = hv_fetch(specials_inverse,
                                     SvPVX(sv_to),
@@ -3348,12 +4228,12 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash)
         while ((from_list = (AV *) hv_iternextsv(specials_inverse,
                                                  &char_to, &to_len)))
         {
         while ((from_list = (AV *) hv_iternextsv(specials_inverse,
                                                  &char_to, &to_len)))
         {
-           if (av_tindex(from_list) > 0) {
+           if (av_tindex_nomg(from_list) > 0) {
                 SSize_t i;
  
                 /* We iterate over all combinations of i,j to place each code
                  * point on each list */
                 SSize_t i;
  
                 /* We iterate over all combinations of i,j to place each code
                  * point on each list */
-               for (i = 0; i <= av_tindex(from_list); i++) {
+               for (i = 0; i <= av_tindex_nomg(from_list); i++) {
                     SSize_t j;
                     AV* i_list = newAV();
                     SV** entryp = av_fetch(from_list, i, FALSE);
                     SSize_t j;
                     AV* i_list = newAV();
                     SV** entryp = av_fetch(from_list, i, FALSE);
@@ -3370,7 +4250,7 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash)
                     }
  
                     /* For DEBUG_U: UV u = valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/
                     }
  
                     /* For DEBUG_U: UV u = valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/
-                   for (j = 0; j <= av_tindex(from_list); j++) {
+                   for (j = 0; j <= av_tindex_nomg(from_list); j++) {
                         entryp = av_fetch(from_list, j, FALSE);
                         if (entryp == NULL) {
                             Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
                         entryp = av_fetch(from_list, j, FALSE);
                         if (entryp == NULL) {
                             Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
@@ -3390,7 +4270,24 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash)
      } /* End of specials */
  
      /* read $swash->{LIST} */
      } /* End of specials */
  
      /* read $swash->{LIST} */
+
+#if    UNICODE_MAJOR_VERSION   == 3         \
+    && UNICODE_DOT_VERSION     == 0         \
+    && UNICODE_DOT_DOT_VERSION == 1
+
+    /* For this version only U+130 and U+131 are equivalent under qr//i.  Add a
+     * rule so that things work under /iaa and /il */
+
+    SV * mod_listsv = sv_mortalcopy(*listsvp);
+    sv_catpv(mod_listsv, "130\t130\t131\n");
+    l = (U8*)SvPV(mod_listsv, lcur);
+
+#else
+
      l = (U8*)SvPV(*listsvp, lcur);
      l = (U8*)SvPV(*listsvp, lcur);
+
+#endif
+
      lend = l + lcur;
  
      /* Go through each input line */
      lend = l + lcur;
  
      /* Go through each input line */
@@ -3429,7 +4326,7 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash)
  
             /* Look through list to see if this inverse mapping already is
              * listed, or if there is a mapping to itself already */
  
             /* Look through list to see if this inverse mapping already is
              * listed, or if there is a mapping to itself already */
-           for (i = 0; i <= av_tindex(list); i++) {
+           for (i = 0; i <= av_tindex_nomg(list); i++) {
                 SV** entryp = av_fetch(list, i, FALSE);
                 SV* entry;
                 UV uv;
                 SV** entryp = av_fetch(list, i, FALSE);
                 SV* entry;
                 UV uv;
@@ -3732,7 +4629,10 @@ Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
      /* May change: warns if surrogates, non-character code points, or
       * non-Unicode code points are in s which has length len bytes.  Returns
       * TRUE if none found; FALSE otherwise.  The only other validity check is
      /* May change: warns if surrogates, non-character code points, or
       * non-Unicode code points are in s which has length len bytes.  Returns
       * TRUE if none found; FALSE otherwise.  The only other validity check is
-     * to make sure that this won't exceed the string's length */
+     * to make sure that this won't exceed the string's length.
+     *
+     * Code points above the platform's C<IV_MAX> will raise a deprecation
+     * warning, unless those are turned off.  */
  
      const U8* const e = s + len;
      bool ok = TRUE;
  
      const U8* const e = s + len;
      bool ok = TRUE;
@@ -3745,31 +4645,45 @@ Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
                            "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
             return FALSE;
         }
                            "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
             return FALSE;
         }
-       if (UNLIKELY(*s >= UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE)) {
+       if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) {
             STRLEN char_len;
             STRLEN char_len;
-           if (UTF8_IS_SUPER(s)) {
-               if (ckWARN_d(WARN_NON_UNICODE)) {
-                   UV uv = utf8_to_uvchr_buf(s, e, &char_len);
-                   Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
-                       "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
-                   ok = FALSE;
-               }
+           if (UNLIKELY(UTF8_IS_SUPER(s, e))) {
+                if (   ckWARN_d(WARN_NON_UNICODE)
+                    || (   ckWARN_d(WARN_DEPRECATED)
+#ifndef UV_IS_QUAD
+                        && UNLIKELY(is_utf8_cp_above_31_bits(s, e))
+#else   /* Below is 64-bit words */
+                        /* 2**63 and up meet these conditions provided we have
+                         * a 64-bit word. */
+#   ifdef EBCDIC
+                        && *s == 0xFE
+                        && NATIVE_UTF8_TO_I8(s[1]) >= 0xA8
+#   else
+                        && *s == 0xFF
+                           /* s[1] being above 0x80 overflows */
+                        && s[2] >= 0x88
+#   endif
+#endif
+                )) {
+                    /* A side effect of this function will be to warn */
+                    (void) utf8n_to_uvchr(s, e - s, &char_len, UTF8_WARN_SUPER);
+                    ok = FALSE;
+                }
             }
             }
-           else if (UTF8_IS_SURROGATE(s)) {
+           else if (UNLIKELY(UTF8_IS_SURROGATE(s, e))) {
                 if (ckWARN_d(WARN_SURROGATE)) {
                 if (ckWARN_d(WARN_SURROGATE)) {
+                    /* This has a different warning than the one the called
+                     * function would output, so can't just call it, unlike we
+                     * do for the non-chars and above-unicodes */
                     UV uv = utf8_to_uvchr_buf(s, e, &char_len);
                     Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
                         "Unicode surrogate U+%04"UVXf" is illegal in UTF-8", uv);
                     ok = FALSE;
                 }
             }
                     UV uv = utf8_to_uvchr_buf(s, e, &char_len);
                     Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
                         "Unicode surrogate U+%04"UVXf" is illegal in UTF-8", uv);
                     ok = FALSE;
                 }
             }
-           else if
-               ((UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s))
-                && (ckWARN_d(WARN_NONCHAR)))
-           {
-               UV uv = utf8_to_uvchr_buf(s, e, &char_len);
-               Perl_warner(aTHX_ packWARN(WARN_NONCHAR),
-                   "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv);
+           else if (UNLIKELY(UTF8_IS_NONCHAR(s, e)) && (ckWARN_d(WARN_NONCHAR))) {
+                /* A side effect of this function will be to warn */
+                (void) utf8n_to_uvchr(s, e - s, &char_len, UTF8_WARN_NONCHAR);
                 ok = FALSE;
             }
         }
                 ok = FALSE;
             }
         }
@@ -3784,17 +4698,19 @@ Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
  
  Build to the scalar C<dsv> a displayable version of the string C<spv>,
  length C<len>, the displayable version being at most C<pvlim> bytes long
  
  Build to the scalar C<dsv> a displayable version of the string C<spv>,
  length C<len>, the displayable version being at most C<pvlim> bytes long
-(if longer, the rest is truncated and "..." will be appended).
+(if longer, the rest is truncated and C<"..."> will be appended).
  
  
-The C<flags> argument can have UNI_DISPLAY_ISPRINT set to display
-isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
-to display the \\[nrfta\\] as the backslashed versions (like '\n')
-(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
-UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
-UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
+The C<flags> argument can have C<UNI_DISPLAY_ISPRINT> set to display
+C<isPRINT()>able characters as themselves, C<UNI_DISPLAY_BACKSLASH>
+to display the C<\\[nrfta\\]> as the backslashed versions (like C<"\n">)
+(C<UNI_DISPLAY_BACKSLASH> is preferred over C<UNI_DISPLAY_ISPRINT> for C<"\\">).
+C<UNI_DISPLAY_QQ> (and its alias C<UNI_DISPLAY_REGEX>) have both
+C<UNI_DISPLAY_BACKSLASH> and C<UNI_DISPLAY_ISPRINT> turned on.
  
  The pointer to the PV of the C<dsv> is returned.
  
  
  The pointer to the PV of the C<dsv> is returned.
  
+See also L</sv_uni_display>.
+
  =cut */
  char *
  Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
  =cut */
  char *
  Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
@@ -3804,7 +4720,7 @@ Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV f
  
      PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
  
  
      PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
  
-    sv_setpvs(dsv, "");
+    SvPVCLEAR(dsv);
      SvUTF8_off(dsv);
      for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
          UV u;
      SvUTF8_off(dsv);
      for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
          UV u;
@@ -3900,7 +4816,7 @@ scan will not be considered to be a match unless the goal is reached, and
  scanning won't continue past that goal.  Correspondingly for C<l2> with respect to
  C<s2>.
  
  scanning won't continue past that goal.  Correspondingly for C<l2> with respect to
  C<s2>.
  
-If C<pe1> is non-NULL and the pointer it points to is not NULL, that pointer is
+If C<pe1> is non-C<NULL> and the pointer it points to is not C<NULL>, that pointer is
  considered an end pointer to the position 1 byte past the maximum point
  in C<s1> beyond which scanning will not continue under any circumstances.
  (This routine assumes that UTF-8 encoded input strings are not malformed;
  considered an end pointer to the position 1 byte past the maximum point
  in C<s1> beyond which scanning will not continue under any circumstances.
  (This routine assumes that UTF-8 encoded input strings are not malformed;
@@ -3917,7 +4833,7 @@ reached for a successful match.   Also, if the fold of a character is multiple
  characters, all of them must be matched (see tr21 reference below for
  'folding').
  
  characters, all of them must be matched (see tr21 reference below for
  'folding').
  
-Upon a successful match, if C<pe1> is non-NULL,
+Upon a successful match, if C<pe1> is non-C<NULL>,
  it will be set to point to the beginning of the I<next> character of C<s1>
  beyond what was matched.  Correspondingly for C<pe2> and C<s2>.
  
  it will be set to point to the beginning of the I<next> character of C<s1>
  beyond what was matched.  Correspondingly for C<pe2> and C<s2>.
  
@@ -4065,7 +4981,7 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c
                  else if (u1) {
                      _to_utf8_fold_flags(p1, foldbuf1, &n1, flags_for_folder);
                  }
                  else if (u1) {
                      _to_utf8_fold_flags(p1, foldbuf1, &n1, flags_for_folder);
                  }
-                else {  /* Not utf8, get utf8 fold */
+                else {  /* Not UTF-8, get UTF-8 fold */
                      _to_uni_fold_flags(*p1, foldbuf1, &n1, flags_for_folder);
                  }
                  f1 = foldbuf1;
                      _to_uni_fold_flags(*p1, foldbuf1, &n1, flags_for_folder);
                  }
                  f1 = foldbuf1;
@@ -4098,7 +5014,7 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c
  
         /* Here f1 and f2 point to the beginning of the strings to compare.
          * These strings are the folds of the next character from each input
  
         /* Here f1 and f2 point to the beginning of the strings to compare.
          * These strings are the folds of the next character from each input
-        * string, stored in utf8. */
+        * string, stored in UTF-8. */
  
          /* While there is more to look for in both folds, see if they
          * continue to match */
  
          /* While there is more to look for in both folds, see if they
          * continue to match */
@@ -4187,7 +5103,7 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
  =for apidoc uvuni_to_utf8_flags
  
  Instead you almost certainly want to use L</uvchr_to_utf8> or
  =for apidoc uvuni_to_utf8_flags
  
  Instead you almost certainly want to use L</uvchr_to_utf8> or
-L</uvchr_to_utf8_flags>>.
+L</uvchr_to_utf8_flags>.
  
  This function is a deprecated synonym for L</uvoffuni_to_utf8_flags>,
  which itself, while not deprecated, should be used only in isolated
  
  This function is a deprecated synonym for L</uvoffuni_to_utf8_flags>,
  which itself, while not deprecated, should be used only in isolated
@@ -4208,11 +5124,5 @@ Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  }
  
  /*
  }
  
  /*
- * Local variables:
- * c-indentation-style: bsd
- * c-basic-offset: 4
- * indent-tabs-mode: nil
- * End:
- *
   * ex: set ts=8 sts=4 sw=4 et:
   */
   * ex: set ts=8 sts=4 sw=4 et:
   */