Make bytes_from_utf8_loc() internal

[perl5.git] / utf8.c
diff --git a/utf8.c b/utf8.c

index bdbe23f..718d9e8 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -32,19 +32,11 @@
  #define PERL_IN_UTF8_C
  #include "perl.h"
  #include "invlist_inline.h"
-#include "uni_keywords.h"
  
  static const char malformed_text[] = "Malformed UTF-8 character";
  static const char unees[] =
                          "Malformed UTF-8 character (unexpected end of string)";
  
-/* Be sure to synchronize this message with the similar one in regcomp.c */
-static const char cp_above_legal_max[] =
-                        "Use of code point 0x%" UVXf " is not allowed; the"
-                        " permissible max is 0x%" UVXf;
-
-#define MAX_EXTERNALLY_LEGAL_CP ((UV) (IV_MAX))
-
  /*
  =head1 Unicode Support
  These are various utility functions for manipulating UTF8-encoded
@@ -56,6 +48,17 @@ within non-zero characters.
  =cut
  */
  
+/* helper for Perl__force_out_malformed_utf8_message(). Like
+ * SAVECOMPILEWARNINGS(), but works with PL_curcop rather than
+ * PL_compiling */
+
+static void
+S_restore_cop_warnings(pTHX_ void *p)
+{
+    free_and_set_cop_warnings(PL_curcop, (STRLEN*) p);
+}
+
+
  void
  Perl__force_out_malformed_utf8_message(pTHX_
              const U8 *const p,      /* First byte in UTF-8 sequence */
@@ -87,6 +90,10 @@ Perl__force_out_malformed_utf8_message(pTHX_
  
      PL_dowarn = G_WARN_ALL_ON|G_WARN_ON;
      if (PL_curcop) {
+        /* this is like SAVECOMPILEWARNINGS() except with PL_curcop rather
+         * than PL_compiling */
+        SAVEDESTRUCTOR_X(S_restore_cop_warnings,
+                (void*)PL_curcop->cop_warnings);
          PL_curcop->cop_warnings = pWARN_ALL;
      }
  
@@ -131,14 +138,14 @@ S_new_msg_hv(pTHX_ const char * const message, /* The message text */
  =for apidoc uvoffuni_to_utf8_flags
  
  THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
-Instead, B<Almost all code should use L</uvchr_to_utf8> or
-L</uvchr_to_utf8_flags>>.
+Instead, B<Almost all code should use L<perlapi/uvchr_to_utf8> or
+L<perlapi/uvchr_to_utf8_flags>>.
  
  This function is like them, but the input is a strict Unicode
  (as opposed to native) code point.  Only in very rare circumstances should code
  not be using the native code point.
  
-For details, see the description for L</uvchr_to_utf8_flags>.
+For details, see the description for L<perlapi/uvchr_to_utf8_flags>.
  
  =cut
  */
@@ -157,9 +164,6 @@ const char nonchar_cp_format[]   = "Unicode non-character U+%04" UVXf
                                     " is not recommended for open interchange";
  const char super_cp_format[]     = "Code point 0x%" UVXf " is not Unicode,"
                                     " may not be portable";
-const char perl_extended_cp_format[] = "Code point 0x%" UVXf " is not"        \
-                                       " Unicode, requires a Perl extension," \
-                                       " and so is not portable";
  
  #define HANDLE_UNICODE_SURROGATE(uv, flags, msgs)                   \
      STMT_START {                                                    \
@@ -310,8 +314,10 @@ Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV uv, const UV flags, HV** msgs)
       * performance hit on these high EBCDIC code points. */
  
      if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
-        if (UNLIKELY(uv > MAX_EXTERNALLY_LEGAL_CP)) {
-            Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_EXTERNALLY_LEGAL_CP);
+        if (UNLIKELY(      uv > MAX_LEGAL_CP
+                     && ! (flags & UNICODE_ALLOW_ABOVE_IV_MAX)))
+        {
+            Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, uv));
          }
          if (       (flags & UNICODE_WARN_SUPER)
              || (   (flags & UNICODE_WARN_PERL_EXTENDED)
@@ -323,7 +329,8 @@ Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV uv, const UV flags, HV** msgs)
  
              /* Choose the more dire applicable warning */
              if (UNICODE_IS_PERL_EXTENDED(uv)) {
-                format = perl_extended_cp_format;
+                format = PL_extended_cp_format;
+                category = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE);
                  if (flags & (UNICODE_WARN_PERL_EXTENDED
                              |UNICODE_DISALLOW_PERL_EXTENDED))
                  {
@@ -335,8 +342,11 @@ Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV uv, const UV flags, HV** msgs)
                  *msgs = new_msg_hv(Perl_form(aTHX_ format, uv),
                                     category, flag);
              }
-            else {
-                Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE), format, uv);
+            else if (    ckWARN_d(WARN_NON_UNICODE)
+                     || (   (flag & UNICODE_GOT_PERL_EXTENDED)
+                         && ckWARN(WARN_PORTABLE)))
+            {
+                Perl_warner(aTHX_ category, format, uv);
              }
          }
          if (       (flags & UNICODE_DISALLOW_SUPER)
@@ -472,7 +482,7 @@ Similarly, C<UNICODE_WARN_ILLEGAL_C9_INTERCHANGE> and
  C<UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE> are shortcuts to select the
  above-Unicode and surrogate flags, but not the non-character ones, as
  defined in
-L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
+L<Unicode Corrigendum #9|https://www.unicode.org/versions/corrigendum9.html>.
  See L<perlunicode/Noncharacter code points>.
  
  Extremely high code points were never specified in any standard, and require an
@@ -483,7 +493,7 @@ different extension.  For these reasons, there is a separate set of flags that
  can warn and/or disallow these extremely high code points, even if other
  above-Unicode ones are accepted.  They are the C<UNICODE_WARN_PERL_EXTENDED>
  and C<UNICODE_DISALLOW_PERL_EXTENDED> flags.  For more information see
-L</C<UTF8_GOT_PERL_EXTENDED>>.  Of course C<UNICODE_DISALLOW_SUPER> will
+C<L</UTF8_GOT_PERL_EXTENDED>>.  Of course C<UNICODE_DISALLOW_SUPER> will
  treat all above-Unicode code points, including these, as malformations.  (Note
  that the Unicode standard considers anything above 0x10FFFF to be illegal, but
  there are standards predating it that allow up to 0x7FFF_FFFF (2**31 -1))
@@ -907,7 +917,7 @@ S_does_utf8_overflow(const U8 * const s,
  #undef FF_OVERLONG_PREFIX
  
  STRLEN
-Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
+Perl_is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
  {
      STRLEN len;
      const U8 *x;
@@ -944,7 +954,7 @@ Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
       *
       */
  
-    PERL_ARGS_ASSERT__IS_UTF8_CHAR_HELPER;
+    PERL_ARGS_ASSERT_IS_UTF8_CHAR_HELPER;
  
      assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
                            |UTF8_DISALLOW_PERL_EXTENDED)));
@@ -1175,7 +1185,8 @@ S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
  =for apidoc utf8n_to_uvchr
  
  THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
-Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
+Most code should use L</utf8_to_uvchr_buf>() rather than call this
+directly.
  
  Bottom level UTF-8 decode routine.
  Returns the native code point value of the first character in the string C<s>,
@@ -1227,7 +1238,7 @@ disallow these categories individually.  C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>
  restricts the allowed inputs to the strict UTF-8 traditionally defined by
  Unicode.  Use C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE> to use the strictness
  definition given by
-L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
+L<Unicode Corrigendum #9|https://www.unicode.org/versions/corrigendum9.html>.
  The difference between traditional strictness and C9 strictness is that the
  latter does not forbid non-character code points.  (They are still discouraged,
  however.)  For more discussion see L<perlunicode/Noncharacter code points>.
@@ -1249,7 +1260,7 @@ different extension.  For these reasons, there is a separate set of flags that
  can warn and/or disallow these extremely high code points, even if other
  above-Unicode ones are accepted.  They are the C<UTF8_WARN_PERL_EXTENDED> and
  C<UTF8_DISALLOW_PERL_EXTENDED> flags.  For more information see
-L</C<UTF8_GOT_PERL_EXTENDED>>.  Of course C<UTF8_DISALLOW_SUPER> will treat all
+C<L</UTF8_GOT_PERL_EXTENDED>>.  Of course C<UTF8_DISALLOW_SUPER> will treat all
  above-Unicode code points, including these, as malformations.
  (Note that the Unicode standard considers anything above 0x10FFFF to be
  illegal, but there are standards predating it that allow up to 0x7FFF_FFFF
@@ -1269,6 +1280,20 @@ All other code points corresponding to Unicode characters, including private
  use and those yet to be assigned, are never considered malformed and never
  warn.
  
+=for apidoc Amnh||UTF8_CHECK_ONLY
+=for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+=for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE
+=for apidoc Amnh||UTF8_DISALLOW_SURROGATE
+=for apidoc Amnh||UTF8_DISALLOW_NONCHAR
+=for apidoc Amnh||UTF8_DISALLOW_SUPER
+=for apidoc Amnh||UTF8_WARN_ILLEGAL_INTERCHANGE
+=for apidoc Amnh||UTF8_WARN_ILLEGAL_C9_INTERCHANGE
+=for apidoc Amnh||UTF8_WARN_SURROGATE
+=for apidoc Amnh||UTF8_WARN_NONCHAR
+=for apidoc Amnh||UTF8_WARN_SUPER
+=for apidoc Amnh||UTF8_WARN_PERL_EXTENDED
+=for apidoc Amnh||UTF8_DISALLOW_PERL_EXTENDED
+
  =cut
  
  Also implemented as a macro in utf8.h
@@ -1290,7 +1315,8 @@ Perl_utf8n_to_uvchr(const U8 *s,
  =for apidoc utf8n_to_uvchr_error
  
  THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
-Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
+Most code should use L</utf8_to_uvchr_buf>() rather than call this
+directly.
  
  This function is for code that needs to know what the precise malformation(s)
  are when an error is found.  If you also need to know the generated warning
@@ -1341,7 +1367,7 @@ describes the situation in all cases.
  
  =item C<UTF8_GOT_CONTINUATION>
  
-The input sequence was malformed in that the first byte was a a UTF-8
+The input sequence was malformed in that the first byte was a UTF-8
  continuation byte.
  
  =item C<UTF8_GOT_EMPTY>
@@ -1366,7 +1392,8 @@ C<UTF8_DISALLOW_NONCHAR> or the C<UTF8_WARN_NONCHAR> flags.
  =item C<UTF8_GOT_NON_CONTINUATION>
  
  The input sequence was malformed in that a non-continuation type byte was found
-in a position where only a continuation type one should be.
+in a position where only a continuation type one should be.  See also
+C<L</UTF8_GOT_SHORT>>.
  
  =item C<UTF8_GOT_OVERFLOW>
  
@@ -1379,6 +1406,34 @@ The input sequence was malformed in that C<curlen> is smaller than required for
  a complete sequence.  In other words, the input is for a partial character
  sequence.
  
+
+C<UTF8_GOT_SHORT> and C<UTF8_GOT_NON_CONTINUATION> both indicate a too short
+sequence.  The difference is that C<UTF8_GOT_NON_CONTINUATION> indicates always
+that there is an error, while C<UTF8_GOT_SHORT> means that an incomplete
+sequence was looked at.   If no other flags are present, it means that the
+sequence was valid as far as it went.  Depending on the application, this could
+mean one of three things:
+
+=over
+
+=item *
+
+The C<curlen> length parameter passed in was too small, and the function was
+prevented from examining all the necessary bytes.
+
+=item *
+
+The buffer being looked at is based on reading data, and the data received so
+far stopped in the middle of a character, so that the next read will
+read the remainder of this character.  (It is up to the caller to deal with the
+split bytes somehow.)
+
+=item *
+
+This is a real error, and the partial sequence is all we're going to get.
+
+=back
+
  =item C<UTF8_GOT_SUPER>
  
  The input sequence was malformed in that it is for a non-Unicode code point;
@@ -1420,7 +1475,8 @@ Perl_utf8n_to_uvchr_error(const U8 *s,
  =for apidoc utf8n_to_uvchr_msgs
  
  THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
-Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
+Most code should use L</utf8_to_uvchr_buf>() rather than call this
+directly.
  
  This function is for code that needs to know what the precise malformation(s)
  are when an error is found, and wants the corresponding warning and/or error
@@ -1552,7 +1608,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
      /* The order of malformation tests here is important.  We should consume as
       * few bytes as possible in order to not skip any valid character.  This is
       * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
-     * http://unicode.org/reports/tr36 for more discussion as to why.  For
+     * https://unicode.org/reports/tr36 for more discussion as to why.  For
       * example, once we've done a UTF8SKIP, we can tell the expected number of
       * bytes, and could fail right off the bat if the input parameters indicate
       * that there are too few available.  But it could be that just that first
@@ -1565,7 +1621,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
       * things.  For example, an input could be deliberately designed to
       * overflow, and if this code bailed out immediately upon discovering that,
       * returning to the caller C<*retlen> pointing to the very next byte (one
-     * which is actually part of of the overflowing sequence), that could look
+     * which is actually part of the overflowing sequence), that could look
       * legitimate to the caller, which could discard the initial partial
       * sequence and process the rest, inappropriately.
       *
@@ -1812,7 +1868,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
          }
  
          while (possible_problems) { /* Handle each possible problem */
-            UV pack_warn = 0;
+            U32 pack_warn = 0;
              char * message = NULL;
              U32 this_flag_bit = 0;
  
@@ -1951,8 +2007,8 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
                           * valid, avoid as much as possible reading past the
                           * end of the buffer */
                          int printlen = (flags & _UTF8_NO_CONFIDENCE_IN_CURLEN)
-                                       ? s - s0
-                                       : send - s0;
+                                       ? (int) (s - s0)
+                                       : (int) (send - s0);
                          pack_warn = packWARN(WARN_UTF8);
                          message = Perl_form(aTHX_ "%s",
                              unexpected_non_continuation_text(s0,
@@ -2025,9 +2081,10 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
                  if (UNLIKELY(isUTF8_PERL_EXTENDED(s0))) {
                      if (  ! (flags & UTF8_CHECK_ONLY)
                          &&  (flags & (UTF8_WARN_PERL_EXTENDED|UTF8_WARN_SUPER))
-                        &&  (msgs || ckWARN_d(WARN_NON_UNICODE)))
+                        &&  (msgs || (   ckWARN_d(WARN_NON_UNICODE)
+                                      || ckWARN(WARN_PORTABLE))))
                      {
-                        pack_warn = packWARN(WARN_NON_UNICODE);
+                        pack_warn = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE);
  
                          /* If it is an overlong that evaluates to a code point
                           * that doesn't have to use the Perl extended UTF-8, it
@@ -2040,7 +2097,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
                           * */
                          if (UNICODE_IS_PERL_EXTENDED(uv)) {
                              message = Perl_form(aTHX_
-                                            perl_extended_cp_format, uv);
+                                            PL_extended_cp_format, uv);
                          }
                          else {
                              message = Perl_form(aTHX_
@@ -2228,10 +2285,7 @@ Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
  {
      PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
  
-    assert(s < send);
-
-    return utf8n_to_uvchr(s, send - s, retlen,
-                     ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+    return utf8_to_uvchr_buf_helper(s, send, retlen);
  }
  
  /* This is marked as deprecated
@@ -2240,9 +2294,9 @@ Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
  
  Only in very rare circumstances should code need to be dealing in Unicode
  (as opposed to native) code points.  In those few cases, use
-C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>> instead.  If you
-are not absolutely sure this is one of those cases, then assume it isn't and
-use plain C<utf8_to_uvchr_buf> instead.
+C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|perlapi/utf8_to_uvchr_buf>> instead.
+If you are not absolutely sure this is one of those cases, then assume it isn't
+and use plain C<utf8_to_uvchr_buf> instead.
  
  Returns the Unicode (not-native) code point of the first character in the
  string C<s> which
@@ -2255,7 +2309,8 @@ NULL) to -1.  If those warnings are off, the computed value if well-defined (or
  the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
  is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
  next possible position in C<s> that could begin a non-malformed character.
-See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
+See L<perlapi/utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
+returned.
  
  =cut
  */
@@ -2294,14 +2349,14 @@ Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
       * the bitops (especially ~) can create illegal UTF-8.
       * In other words: in Perl UTF-8 is not just for Unicode. */
  
-    if (e < s)
+    if (UNLIKELY(e < s))
         goto warn_and_return;
      while (s < e) {
          s += UTF8SKIP(s);
         len++;
      }
  
-    if (e != s) {
+    if (UNLIKELY(e != s)) {
         len--;
          warn_and_return:
         if (PL_op)
@@ -2486,8 +2541,7 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p)
  }
  
  /*
-No = here because currently externally undocumented
-for apidoc bytes_from_utf8_loc
+=for apidoc bytes_from_utf8_loc
  
  Like C<L</bytes_from_utf8>()>, but takes an extra parameter, a pointer to where
  to store the location of the first character in C<"s"> that cannot be
@@ -2505,7 +2559,7 @@ C<"s">.  C<*lenp> is set to its length, not including the terminating C<NUL>.
  If the entire input string was converted, C<*is_utf8p> is set to a FALSE value,
  and C<*first_non_downgradable> is set to C<NULL>.
  
-Otherwise, C<*first_non_downgradable> set to point to the first byte of the
+Otherwise, C<*first_non_downgradable> is set to point to the first byte of the
  first character in the original string that wasn't converted.  C<*is_utf8p> is
  unchanged.  Note that the new string may have length 0.
  
@@ -2621,7 +2675,8 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
      PERL_ARGS_ASSERT_BYTES_TO_UTF8;
      PERL_UNUSED_CONTEXT;
  
-    Newx(d, (*lenp) * 2 + 1, U8);
+    /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
+    Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8);
      dst = d;
  
      while (s < send) {
@@ -2632,9 +2687,6 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
      *d = '\0';
      *lenp = d-dst;
  
-    /* Trim unused space */
-    Renew(dst, *lenp + 1, U8);
-
      return dst;
  }
  
@@ -2654,7 +2706,7 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
   * Do not use in-place.  We optimize for native, for obvious reasons. */
  
  U8*
-Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
+Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
  {
      U8* pend;
      U8* dstart = d;
@@ -2688,16 +2740,16 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
  
          /* This assumes that most uses will be in the first Unicode plane, not
           * needing surrogates */
-       if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST
-                  && uv <= UNICODE_SURROGATE_LAST))
+       if (UNLIKELY(inRANGE(uv, UNICODE_SURROGATE_FIRST,
+                                 UNICODE_SURROGATE_LAST)))
          {
              if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
                  Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
              }
             else {
                 UV low = (p[0] << 8) + p[1];
-               if (   UNLIKELY(low < FIRST_LOW_SURROGATE)
-                    || UNLIKELY(low > LAST_LOW_SURROGATE))
+               if (UNLIKELY(! inRANGE(low, FIRST_LOW_SURROGATE,
+                                            LAST_LOW_SURROGATE)))
                  {
                     Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
                  }
@@ -2731,7 +2783,7 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
  /* Note: this one is slightly destructive of the source. */
  
  U8*
-Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
+Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
  {
      U8* s = (U8*)p;
      U8* const send = s + bytelen;
@@ -2757,21 +2809,6 @@ Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
      return _invlist_contains_cp(PL_XPosix_ptrs[classnum], c);
  }
  
-/* Internal function so we can deprecate the external one, and call
-   this one from other deprecated functions in this file */
-
-bool
-Perl__is_utf8_idstart(pTHX_ const U8 *p)
-{
-    PERL_ARGS_ASSERT__IS_UTF8_IDSTART;
-
-    if (*p == '_')
-       return TRUE;
-    return is_utf8_common(p, NULL,
-                          "This is buggy if this gets used",
-                          PL_utf8_idstart);
-}
-
  bool
  Perl__is_uni_perl_idcont(pTHX_ UV c)
  {
@@ -2926,8 +2963,7 @@ Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
         return _to_upper_title_latin1((U8) c, p, lenp, 'S');
      }
  
-    uvchr_to_utf8(p, c);
-    return CALL_UPPER_CASE(c, p, p, lenp);
+    return CALL_UPPER_CASE(c, NULL, p, lenp);
  }
  
  UV
@@ -2939,8 +2975,7 @@ Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
         return _to_upper_title_latin1((U8) c, p, lenp, 's');
      }
  
-    uvchr_to_utf8(p, c);
-    return CALL_TITLE_CASE(c, p, p, lenp);
+    return CALL_TITLE_CASE(c, NULL, p, lenp);
  }
  
  STATIC U8
@@ -2979,8 +3014,7 @@ Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
         return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ );
      }
  
-    uvchr_to_utf8(p, c);
-    return CALL_LOWER_CASE(c, p, p, lenp);
+    return CALL_LOWER_CASE(c, NULL, p, lenp);
  }
  
  UV
@@ -3058,10 +3092,10 @@ Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
      PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
  
      if (flags & FOLD_FLAGS_LOCALE) {
-        /* Treat a UTF-8 locale as not being in locale at all, except for
-         * potentially warning */
+        /* Treat a non-Turkic UTF-8 locale as not being in locale at all,
+         * except for potentially warning */
          _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
-        if (IN_UTF8_CTYPE_LOCALE) {
+        if (IN_UTF8_CTYPE_LOCALE && ! PL_in_utf8_turkic_locale) {
              flags &= ~FOLD_FLAGS_LOCALE;
          }
          else {
@@ -3076,8 +3110,7 @@ Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
  
      /* Here, above 255.  If no special needs, just use the macro */
      if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
-       uvchr_to_utf8(p, c);
-       return CALL_FOLD_CASE(c, p, p, lenp, flags & FOLD_FLAGS_FULL);
+       return CALL_FOLD_CASE(c, NULL, p, lenp, flags & FOLD_FLAGS_FULL);
      }
      else {  /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with
                the special flags. */
@@ -3091,97 +3124,29 @@ Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
  }
  
  PERL_STATIC_INLINE bool
-S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
-                const char *const swashname, SV* const invlist)
+S_is_utf8_common(pTHX_ const U8 *const p, const U8 * const e,
+                       SV* const invlist)
  {
      /* returns a boolean giving whether or not the UTF8-encoded character that
-     * starts at <p> is in the swash indicated by <swashname>.  <swash>
-     * contains a pointer to where the swash indicated by <swashname>
-     * is to be stored; which this routine will do, so that future calls will
-     * look at <*swash> and only generate a swash if it is not null.  <invlist>
-     * is NULL or an inversion list that defines the swash.  If not null, it
-     * saves time during initialization of the swash.
-     *
-     * Note that it is assumed that the buffer length of <p> is enough to
-     * contain all the bytes that comprise the character.  Thus, <*p> should
-     * have been checked before this call for mal-formedness enough to assure
-     * that. */
-
-    PERL_ARGS_ASSERT_IS_UTF8_COMMON;
-
-    /* The API should have included a length for the UTF-8 character in <p>,
-     * but it doesn't.  We therefore assume that p has been validated at least
-     * as far as there being enough bytes available in it to accommodate the
-     * character without reading beyond the end, and pass that number on to the
-     * validating routine */
-    if (! isUTF8_CHAR(p, p + UTF8SKIP(p))) {
-        _force_out_malformed_utf8_message(p, p + UTF8SKIP(p),
-                                          _UTF8_NO_CONFIDENCE_IN_CURLEN,
-                                          1 /* Die */ );
-        NOT_REACHED; /* NOTREACHED */
-    }
-
-    if (invlist) {
-        return _invlist_contains_cp(invlist, valid_utf8_to_uvchr(p, NULL));
-    }
-
-    assert(swash);
-
-    if (!*swash) {
-        U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
-        *swash = _core_swash_init("utf8",
-
-                                  /* Only use the name if there is no inversion
-                                   * list; otherwise will go out to disk */
-                                  (invlist) ? "" : swashname,
-
-                                  &PL_sv_undef, 1, 0, invlist, &flags);
-    }
-
-    return swash_fetch(*swash, p, TRUE) != 0;
-}
+     * starts at <p>, and extending no further than <e - 1> is in the inversion
+     * list <invlist>. */
  
-PERL_STATIC_INLINE bool
-S_is_utf8_common_with_len(pTHX_ const U8 *const p, const U8 * const e,
-                          SV **swash, const char *const swashname,
-                          SV* const invlist)
-{
-    /* returns a boolean giving whether or not the UTF8-encoded character that
-     * starts at <p>, and extending no further than <e - 1> is in the swash
-     * indicated by <swashname>.  <swash> contains a pointer to where the swash
-     * indicated by <swashname> is to be stored; which this routine will do, so
-     * that future calls will look at <*swash> and only generate a swash if it
-     * is not null.  <invlist> is NULL or an inversion list that defines the
-     * swash.  If not null, it saves time during initialization of the swash.
-     */
+    UV cp = utf8n_to_uvchr(p, e - p, NULL, 0);
  
-    PERL_ARGS_ASSERT_IS_UTF8_COMMON_WITH_LEN;
+    PERL_ARGS_ASSERT_IS_UTF8_COMMON;
  
-    if (! isUTF8_CHAR(p, e)) {
+    if (cp == 0 && (p >= e || *p != '\0')) {
          _force_out_malformed_utf8_message(p, e, 0, 1);
          NOT_REACHED; /* NOTREACHED */
      }
  
-    if (invlist) {
-        return _invlist_contains_cp(invlist, valid_utf8_to_uvchr(p, NULL));
-    }
-
-    assert(swash);
-
-    if (!*swash) {
-        U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
-        *swash = _core_swash_init("utf8",
-
-                                  /* Only use the name if there is no inversion
-                                   * list; otherwise will go out to disk */
-                                  (invlist) ? "" : swashname,
-
-                                  &PL_sv_undef, 1, 0, invlist, &flags);
-    }
-
-    return swash_fetch(*swash, p, TRUE) != 0;
+    assert(invlist);
+    return _invlist_contains_cp(invlist, cp);
  }
  
+#if 0  /* Not currently used, but may be needed in the future */
+PERLVAR(I, seen_deprecated_macro, HV *)
+
  STATIC void
  S_warn_on_first_deprecated_use(pTHX_ const char * const name,
                                       const char * const alternative,
@@ -3208,14 +3173,14 @@ S_warn_on_first_deprecated_use(pTHX_ const char * const name,
  
              if (instr(file, "mathoms.c")) {
                  Perl_warner(aTHX_ WARN_DEPRECATED,
-                            "In %s, line %d, starting in Perl v5.30, %s()"
+                            "In %s, line %d, starting in Perl v5.32, %s()"
                              " will be removed.  Avoid this message by"
                              " converting to use %s().\n",
                              file, line, name, alternative);
              }
              else {
                  Perl_warner(aTHX_ WARN_DEPRECATED,
-                            "In %s, line %d, starting in Perl v5.30, %s() will"
+                            "In %s, line %d, starting in Perl v5.32, %s() will"
                              " require an additional parameter.  Avoid this"
                              " message by converting to use %s().\n",
                              file, line, name, alternative);
@@ -3223,162 +3188,37 @@ S_warn_on_first_deprecated_use(pTHX_ const char * const name,
          }
      }
  }
+#endif
  
  bool
-Perl__is_utf8_FOO(pTHX_       U8   classnum,
-                        const U8   * const p,
-                        const char * const name,
-                        const char * const alternative,
-                        const bool use_utf8,
-                        const bool use_locale,
-                        const char * const file,
-                        const unsigned line)
+Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p, const U8 * const e)
  {
      PERL_ARGS_ASSERT__IS_UTF8_FOO;
  
-    warn_on_first_deprecated_use(name, alternative, use_locale, file, line);
-
-    if (use_utf8 && UTF8_IS_ABOVE_LATIN1(*p)) {
-
-        switch (classnum) {
-            case _CC_WORDCHAR:
-            case _CC_DIGIT:
-            case _CC_ALPHA:
-            case _CC_LOWER:
-            case _CC_UPPER:
-            case _CC_PUNCT:
-            case _CC_PRINT:
-            case _CC_ALPHANUMERIC:
-            case _CC_GRAPH:
-            case _CC_CASED:
-
-                return is_utf8_common(p,
-                                      NULL,
-                                      "This is buggy if this gets used",
-                                      PL_XPosix_ptrs[classnum]);
-
-            case _CC_SPACE:
-                return is_XPERLSPACE_high(p);
-            case _CC_BLANK:
-                return is_HORIZWS_high(p);
-            case _CC_XDIGIT:
-                return is_XDIGIT_high(p);
-            case _CC_CNTRL:
-                return 0;
-            case _CC_ASCII:
-                return 0;
-            case _CC_VERTSPACE:
-                return is_VERTWS_high(p);
-            case _CC_IDFIRST:
-                return is_utf8_common(p, NULL,
-                                      "This is buggy if this gets used",
-                                      PL_utf8_perl_idstart);
-            case _CC_IDCONT:
-                return is_utf8_common(p, NULL,
-                                      "This is buggy if this gets used",
-                                      PL_utf8_perl_idcont);
-        }
-    }
-
-    /* idcont is the same as wordchar below 256 */
-    if (classnum == _CC_IDCONT) {
-        classnum = _CC_WORDCHAR;
-    }
-    else if (classnum == _CC_IDFIRST) {
-        if (*p == '_') {
-            return TRUE;
-        }
-        classnum = _CC_ALPHA;
-    }
-
-    if (! use_locale) {
-        if (! use_utf8 || UTF8_IS_INVARIANT(*p)) {
-            return _generic_isCC(*p, classnum);
-        }
-
-        return _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p + 1 )), classnum);
-    }
-    else {
-        if (! use_utf8 || UTF8_IS_INVARIANT(*p)) {
-            return isFOO_lc(classnum, *p);
-        }
-
-        return isFOO_lc(classnum, EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p + 1 )));
-    }
-
-    NOT_REACHED; /* NOTREACHED */
-}
-
-bool
-Perl__is_utf8_FOO_with_len(pTHX_ const U8 classnum, const U8 *p,
-                                                            const U8 * const e)
-{
-    PERL_ARGS_ASSERT__IS_UTF8_FOO_WITH_LEN;
-
-    return is_utf8_common_with_len(p, e, NULL,
-                                   "This is buggy if this gets used",
-                                   PL_XPosix_ptrs[classnum]);
-}
-
-bool
-Perl__is_utf8_perl_idstart_with_len(pTHX_ const U8 *p, const U8 * const e)
-{
-    PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART_WITH_LEN;
-
-    return is_utf8_common_with_len(p, e, NULL,
-                                   "This is buggy if this gets used",
-                                   PL_utf8_perl_idstart);
-}
-
-bool
-Perl__is_utf8_xidstart(pTHX_ const U8 *p)
-{
-    PERL_ARGS_ASSERT__IS_UTF8_XIDSTART;
-
-    if (*p == '_')
-       return TRUE;
-    return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart", NULL);
-}
-
-bool
-Perl__is_utf8_perl_idcont_with_len(pTHX_ const U8 *p, const U8 * const e)
-{
-    PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT_WITH_LEN;
-
-    return is_utf8_common_with_len(p, e, NULL,
-                                   "This is buggy if this gets used",
-                                   PL_utf8_perl_idcont);
-}
-
-bool
-Perl__is_utf8_idcont(pTHX_ const U8 *p)
-{
-    PERL_ARGS_ASSERT__IS_UTF8_IDCONT;
-
-    return is_utf8_common(p, &PL_utf8_idcont, "IdContinue", NULL);
+    return is_utf8_common(p, e, PL_XPosix_ptrs[classnum]);
  }
  
  bool
-Perl__is_utf8_xidcont(pTHX_ const U8 *p)
+Perl__is_utf8_perl_idstart(pTHX_ const U8 *p, const U8 * const e)
  {
-    PERL_ARGS_ASSERT__IS_UTF8_XIDCONT;
+    PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
  
-    return is_utf8_common(p, &PL_utf8_xidcont, "XIdContinue", NULL);
+    return is_utf8_common(p, e, PL_utf8_perl_idstart);
  }
  
  bool
-Perl__is_utf8_mark(pTHX_ const U8 *p)
+Perl__is_utf8_perl_idcont(pTHX_ const U8 *p, const U8 * const e)
  {
-    PERL_ARGS_ASSERT__IS_UTF8_MARK;
+    PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT;
  
-    return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL);
+    return is_utf8_common(p, e, PL_utf8_perl_idcont);
  }
  
  STATIC UV
  S__to_utf8_case(pTHX_ const UV uv1, const U8 *p,
                        U8* ustrp, STRLEN *lenp,
-                      SV *invlist, const int * const invmap,
-                      const unsigned int * const * const aux_tables,
+                      SV *invlist, const I32 * const invmap,
+                      const U32 * const * const aux_tables,
                        const U8 * const aux_table_lengths,
                        const char * const normal)
  {
@@ -3461,9 +3301,8 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p,
                  }
  
                  if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
-                    if (UNLIKELY(uv1 > MAX_EXTERNALLY_LEGAL_CP)) {
-                        Perl_croak(aTHX_ cp_above_legal_max, uv1,
-                                         MAX_EXTERNALLY_LEGAL_CP);
+                    if (UNLIKELY(uv1 > MAX_LEGAL_CP)) {
+                        Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, uv1));
                      }
                      if (ckWARN_d(WARN_NON_UNICODE)) {
                          const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
@@ -3478,8 +3317,6 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p,
                      > HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C))
                  {
  
-                    /* As of Unicode 10.0, this means we avoid swash creation
-                     * for anything beyond high Plane 1 (below emojis)  */
                      goto cases_to_self;
                  }
  #endif
@@ -3492,10 +3329,13 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p,
  
      {
          unsigned int i;
-        const unsigned int * cp_list;
+        const U32 * cp_list;
          U8 * d;
+
+        /* 'index' is guaranteed to be non-negative, as this is an inversion
+         * map that covers all possible inputs.  See [perl #133365] */
          SSize_t index = _invlist_search(invlist, uv1);
-        IV base = invmap[index];
+        I32 base = invmap[index];
  
          /* The data structures are set up so that if 'base' is non-negative,
           * the case change is 1-to-1; and if 0, the change is to itself */
@@ -3532,20 +3372,24 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p,
      /* Here, there was no mapping defined, which means that the code point maps
       * to itself.  Return the inputs */
    cases_to_self:
-    len = UTF8SKIP(p);
-    if (p != ustrp) {   /* Don't copy onto itself */
-        Copy(p, ustrp, len, U8);
+    if (p) {
+        len = UTF8SKIP(p);
+        if (p != ustrp) {   /* Don't copy onto itself */
+            Copy(p, ustrp, len, U8);
+        }
+        *lenp = len;
+    }
+    else {
+       *lenp = uvchr_to_utf8(ustrp, uv1) - ustrp;
      }
-
-    *lenp = len;
  
      return uv1;
  
  }
  
  Size_t
-Perl__inverse_folds(pTHX_ const UV cp, unsigned int * first_folds_to,
-                          const unsigned int ** remaining_folds_to)
+Perl__inverse_folds(pTHX_ const UV cp, U32 * first_folds_to,
+                          const U32 ** remaining_folds_to)
  {
      /* Returns the count of the number of code points that fold to the input
       * 'cp' (besides itself).
@@ -3563,10 +3407,17 @@ Perl__inverse_folds(pTHX_ const UV cp, unsigned int * first_folds_to,
       * The reason for this convolution is to avoid having to deal with
       * allocating and freeing memory.  The lists are already constructed, so
       * the return can point to them, but single code points aren't, so would
-     * need to be constructed if we didn't employ something like this API */
+     * need to be constructed if we didn't employ something like this API
+     *
+     * The code points returned by this function are all legal Unicode, which
+     * occupy at most 21 bits, and so a U32 is sufficient, and the lists are
+     * constructed with this size (to save space and memory), and we return
+     * pointers, so they must be this size */
  
+    /* 'index' is guaranteed to be non-negative, as this is an inversion map
+     * that covers all possible inputs.  See [perl #133365] */
      SSize_t index = _invlist_search(PL_utf8_foldclosures, cp);
-    int base = _Perl_IVCF_invmap[index];
+    I32 base = _Perl_IVCF_invmap[index];
  
      PERL_ARGS_ASSERT__INVERSE_FOLDS;
  
@@ -3590,16 +3441,16 @@ Perl__inverse_folds(pTHX_ const UV cp, unsigned int * first_folds_to,
           * to 'cp', and the parallel array containing the length of the list
           * array */
          *first_folds_to = IVCF_AUX_TABLE_ptrs[-base][0];
-        *remaining_folds_to = IVCF_AUX_TABLE_ptrs[-base] + 1; /* +1 excludes
-                                                                 *first_folds_to
-                                                                */
+        *remaining_folds_to = IVCF_AUX_TABLE_ptrs[-base] + 1;
+                                                /* +1 excludes first_folds_to */
          return IVCF_AUX_TABLE_lengths[-base];
      }
  
  #endif
  
      /* Only the single code point.  This works like 'fc(G) = G - A + a' */
-    *first_folds_to = base + cp - invlist_array(PL_utf8_foldclosures)[index];
+    *first_folds_to = (U32) (base + cp
+                                  - invlist_array(PL_utf8_foldclosures)[index]);
      *remaining_folds_to = NULL;
      return 1;
  }
@@ -3663,82 +3514,117 @@ S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result,
      return original;
  }
  
-STATIC U32
-S_check_and_deprecate(pTHX_ const U8 *p,
-                            const U8 **e,
-                            const unsigned int type,    /* See below */
-                            const bool use_locale,      /* Is this a 'LC_'
-                                                           macro call? */
-                            const char * const file,
-                            const unsigned line)
+STATIC UV
+S_turkic_fc(pTHX_ const U8 * const p, const U8 * const e,
+                        U8 * ustrp, STRLEN *lenp)
  {
-    /* This is a temporary function to deprecate the unsafe calls to the case
-     * changing macros and functions.  It keeps all the special stuff in just
-     * one place.
-     *
-     * It updates *e with the pointer to the end of the input string.  If using
-     * the old-style macros, *e is NULL on input, and so this function assumes
-     * the input string is long enough to hold the entire UTF-8 sequence, and
-     * sets *e accordingly, but it then returns a flag to pass the
-     * utf8n_to_uvchr(), to tell it that this size is a guess, and to avoid
-     * using the full length if possible.
-     *
-     * It also does the assert that *e > p when *e is not NULL.  This should be
-     * migrated to the callers when this function gets deleted.
+    /* Returns 0 if the foldcase of the input UTF-8 encoded sequence from
+     * p0..e-1 according to Turkic rules is the same as for non-Turkic.
+     * Otherwise, it returns the first code point of the Turkic foldcased
+     * sequence, and the entire sequence will be stored in *ustrp.  ustrp will
+     * contain *lenp bytes
       *
-     * The 'type' parameter is used for the caller to specify which case
-     * changing function this is called from: */
-
-#       define DEPRECATE_TO_UPPER 0
-#       define DEPRECATE_TO_TITLE 1
-#       define DEPRECATE_TO_LOWER 2
-#       define DEPRECATE_TO_FOLD  3
-
-    U32 utf8n_flags = 0;
-    const char * name;
-    const char * alternative;
-
-    PERL_ARGS_ASSERT_CHECK_AND_DEPRECATE;
-
-    if (*e == NULL) {
-        utf8n_flags = _UTF8_NO_CONFIDENCE_IN_CURLEN;
-        *e = p + UTF8SKIP(p);
-
-        /* For mathoms.c calls, we use the function name we know is stored
-         * there.  It could be part of a larger path */
-        if (type == DEPRECATE_TO_UPPER) {
-            name = instr(file, "mathoms.c")
-                   ? "to_utf8_upper"
-                   : "toUPPER_utf8";
-            alternative = "toUPPER_utf8_safe";
-        }
-        else if (type == DEPRECATE_TO_TITLE) {
-            name = instr(file, "mathoms.c")
-                   ? "to_utf8_title"
-                   : "toTITLE_utf8";
-            alternative = "toTITLE_utf8_safe";
-        }
-        else if (type == DEPRECATE_TO_LOWER) {
-            name = instr(file, "mathoms.c")
-                   ? "to_utf8_lower"
-                   : "toLOWER_utf8";
-            alternative = "toLOWER_utf8_safe";
-        }
-        else if (type == DEPRECATE_TO_FOLD) {
-            name = instr(file, "mathoms.c")
-                   ? "to_utf8_fold"
-                   : "toFOLD_utf8";
-            alternative = "toFOLD_utf8_safe";
+     * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER
+     * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER
+     * DOTLESS I */
+
+    PERL_ARGS_ASSERT_TURKIC_FC;
+    assert(e > p);
+
+    if (UNLIKELY(*p == 'I')) {
+        *lenp = 2;
+        ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_SMALL_LETTER_DOTLESS_I);
+        ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_SMALL_LETTER_DOTLESS_I);
+        return LATIN_SMALL_LETTER_DOTLESS_I;
+    }
+
+    if (UNLIKELY(memBEGINs(p, e - p,
+                           LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)))
+    {
+        *lenp = 1;
+        *ustrp = 'i';
+        return 'i';
+    }
+
+    return 0;
+}
+
+STATIC UV
+S_turkic_lc(pTHX_ const U8 * const p0, const U8 * const e,
+                        U8 * ustrp, STRLEN *lenp)
+{
+    /* Returns 0 if the lowercase of the input UTF-8 encoded sequence from
+     * p0..e-1 according to Turkic rules is the same as for non-Turkic.
+     * Otherwise, it returns the first code point of the Turkic lowercased
+     * sequence, and the entire sequence will be stored in *ustrp.  ustrp will
+     * contain *lenp bytes */
+
+    PERL_ARGS_ASSERT_TURKIC_LC;
+    assert(e > p0);
+
+    /* A 'I' requires context as to what to do */
+    if (UNLIKELY(*p0 == 'I')) {
+        const U8 * p = p0 + 1;
+
+        /* According to the Unicode SpecialCasing.txt file, a capital 'I'
+         * modified by a dot above lowercases to 'i' even in turkic locales. */
+        while (p < e) {
+            UV cp;
+
+            if (memBEGINs(p, e - p, COMBINING_DOT_ABOVE_UTF8)) {
+                ustrp[0] = 'i';
+                *lenp = 1;
+                return 'i';
+            }
+
+            /* For the dot above to modify the 'I', it must be part of a
+             * combining sequence immediately following the 'I', and no other
+             * modifier with a ccc of 230 may intervene */
+            cp = utf8_to_uvchr_buf(p, e, NULL);
+            if (! _invlist_contains_cp(PL_CCC_non0_non230, cp)) {
+                break;
+            }
+
+            /* Here the combining sequence continues */
+            p += UTF8SKIP(p);
          }
-        else Perl_croak(aTHX_ "panic: Unexpected case change type");
+    }
+
+    /* In all other cases the lc is the same as the fold */
+    return turkic_fc(p0, e, ustrp, lenp);
+}
+
+STATIC UV
+S_turkic_uc(pTHX_ const U8 * const p, const U8 * const e,
+                        U8 * ustrp, STRLEN *lenp)
+{
+    /* Returns 0 if the upper or title-case of the input UTF-8 encoded sequence
+     * from p0..e-1 according to Turkic rules is the same as for non-Turkic.
+     * Otherwise, it returns the first code point of the Turkic upper or
+     * title-cased sequence, and the entire sequence will be stored in *ustrp.
+     * ustrp will contain *lenp bytes
+     *
+     * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER
+     * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER
+     * DOTLESS I */
  
-        warn_on_first_deprecated_use(name, alternative, use_locale, file, line);
+    PERL_ARGS_ASSERT_TURKIC_UC;
+    assert(e > p);
+
+    if (*p == 'i') {
+        *lenp = 2;
+        ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
+        ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
+        return LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
      }
-    else {
-        assert (p < *e);
+
+    if (memBEGINs(p, e - p, LATIN_SMALL_LETTER_DOTLESS_I_UTF8)) {
+        *lenp = 1;
+        *ustrp = 'I';
+        return 'I';
      }
  
-    return utf8n_flags;
+    return 0;
  }
  
  /* The process for changing the case is essentially the same for the four case
@@ -3767,15 +3653,24 @@ S_check_and_deprecate(pTHX_ const U8 *p,
   * the input code point calculated from the UTF-8.  The fold code needs to
   * realize all this and take it from there.
   *
+ * To deal with Turkic locales, the function specified by the parameter
+ * 'turkic' is called when appropriate.
+ *
   * If you read the two macros as sequential, it's easier to understand what's
   * going on. */
  #define CASE_CHANGE_BODY_START(locale_flags, LC_L1_change_macro, L1_func,    \
-                               L1_func_extra_param)                          \
+                               L1_func_extra_param, turkic)                  \
                                                                               \
      if (flags & (locale_flags)) {                                            \
          _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                  \
-        /* Treat a UTF-8 locale as not being in locale at all */             \
          if (IN_UTF8_CTYPE_LOCALE) {                                          \
+            if (UNLIKELY(PL_in_utf8_turkic_locale)) {                        \
+                UV ret = turkic(p, e, ustrp, lenp);                          \
+                if (ret) return ret;                                         \
+            }                                                                \
+                                                                             \
+            /* Otherwise, treat a UTF-8 locale as not being in locale at     \
+             * all */                                                        \
              flags &= ~(locale_flags);                                        \
          }                                                                    \
      }                                                                        \
@@ -3801,8 +3696,7 @@ S_check_and_deprecate(pTHX_ const U8 *p,
          STRLEN len_result;                                                   \
          result = utf8n_to_uvchr(p, e - p, &len_result, UTF8_CHECK_ONLY);     \
          if (len_result == (STRLEN) -1) {                                     \
-            _force_out_malformed_utf8_message(p, e, utf8n_flags,             \
-                                                            1 /* Die */ );   \
+            _force_out_malformed_utf8_message(p, e, 0, 1 /* Die */ );        \
          }
  
  #define CASE_CHANGE_BODY_END(locale_flags, change_macro)                     \
@@ -3827,15 +3721,8 @@ S_check_and_deprecate(pTHX_ const U8 *p,
                                                                               \
      return result;
  
-/*
-=for apidoc to_utf8_upper
-
-Instead use L</toUPPER_utf8_safe>.
-
-=cut */
-
  /* Not currently externally documented, and subject to change:
- * <flags> is set iff iff the rules from the current underlying locale are to
+ * <flags> is set iff the rules from the current underlying locale are to
   *         be used. */
  
  UV
@@ -3843,29 +3730,19 @@ Perl__to_utf8_upper_flags(pTHX_ const U8 *p,
                                  const U8 *e,
                                  U8* ustrp,
                                  STRLEN *lenp,
-                                bool flags,
-                                const char * const file,
-                                const int line)
+                                bool flags)
  {
      UV result;
-    const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_UPPER,
-                                                cBOOL(flags), file, line);
  
      PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
  
      /* ~0 makes anything non-zero in 'flags' mean we are using locale rules */
      /* 2nd char of uc(U+DF) is 'S' */
-    CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 'S');
+    CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 'S',
+                                                                    turkic_uc);
      CASE_CHANGE_BODY_END  (~0, CALL_UPPER_CASE);
  }
  
-/*
-=for apidoc to_utf8_title
-
-Instead use L</toTITLE_utf8_safe>.
-
-=cut */
-
  /* Not currently externally documented, and subject to change:
   * <flags> is set iff the rules from the current underlying locale are to be
   *         used.  Since titlecase is not defined in POSIX, for other than a
@@ -3877,30 +3754,20 @@ Perl__to_utf8_title_flags(pTHX_ const U8 *p,
                                  const U8 *e,
                                  U8* ustrp,
                                  STRLEN *lenp,
-                                bool flags,
-                                const char * const file,
-                                const int line)
+                                bool flags)
  {
      UV result;
-    const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_TITLE,
-                                                cBOOL(flags), file, line);
  
      PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
  
      /* 2nd char of ucfirst(U+DF) is 's' */
-    CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 's');
+    CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 's',
+                                                                    turkic_uc);
      CASE_CHANGE_BODY_END  (~0, CALL_TITLE_CASE);
  }
  
-/*
-=for apidoc to_utf8_lower
-
-Instead use L</toLOWER_utf8_safe>.
-
-=cut */
-
  /* Not currently externally documented, and subject to change:
- * <flags> is set iff iff the rules from the current underlying locale are to
+ * <flags> is set iff the rules from the current underlying locale are to
   *         be used.
   */
  
@@ -3909,27 +3776,17 @@ Perl__to_utf8_lower_flags(pTHX_ const U8 *p,
                                  const U8 *e,
                                  U8* ustrp,
                                  STRLEN *lenp,
-                                bool flags,
-                                const char * const file,
-                                const int line)
+                                bool flags)
  {
      UV result;
-    const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_LOWER,
-                                                cBOOL(flags), file, line);
  
      PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
  
-    CASE_CHANGE_BODY_START(~0, toLOWER_LC, to_lower_latin1, 0 /* 0 is dummy */)
+    CASE_CHANGE_BODY_START(~0, toLOWER_LC, to_lower_latin1, 0 /* 0 is dummy */,
+                                                                    turkic_lc);
      CASE_CHANGE_BODY_END  (~0, CALL_LOWER_CASE)
  }
  
-/*
-=for apidoc to_utf8_fold
-
-Instead use L</toFOLD_utf8_safe>.
-
-=cut */
-
  /* Not currently externally documented, and subject to change,
   * in <flags>
   *     bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
@@ -3945,13 +3802,9 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
                                 const U8 *e,
                                 U8* ustrp,
                                 STRLEN *lenp,
-                               U8 flags,
-                               const char * const file,
-                               const int line)
+                               U8 flags)
  {
      UV result;
-    const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_FOLD,
-                                                cBOOL(flags), file, line);
  
      PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
  
@@ -3961,7 +3814,8 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
      assert(p != ustrp); /* Otherwise overwrites */
  
      CASE_CHANGE_BODY_START(FOLD_FLAGS_LOCALE, toFOLD_LC, _to_fold_latin1,
-                 ((flags) & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)));
+                 ((flags) & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)),
+                                                                    turkic_fc);
  
         result = CALL_FOLD_CASE(result, p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
  
@@ -3973,7 +3827,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
  
              /* Special case these two characters, as what normally gets
               * returned under locale doesn't work */
-            if (memEQs((char *) p, UTF8SKIP(p), CAP_SHARP_S))
+            if (memBEGINs((char *) p, e - p, CAP_SHARP_S))
              {
                  /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
                  Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
@@ -3983,7 +3837,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
              }
              else
  #endif
-                 if (memEQs((char *) p, UTF8SKIP(p), LONG_S_T))
+                 if (memBEGINs((char *) p, e - p, LONG_S_T))
              {
                  /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
                  Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
@@ -4002,7 +3856,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
               * 255/256 boundary which is forbidden under /l, and so the code
               * wouldn't catch that they are equivalent (which they are only in
               * this release) */
-            else if (memEQs((char *) p, UTF8SKIP(p), DOTTED_I)) {
+            else if (memBEGINs((char *) p, e - p, DOTTED_I)) {
                  /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
                  Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
                                "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
@@ -4026,8 +3880,8 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
             /* Look at every character in the result; if any cross the
             * boundary, the whole thing is disallowed */
             U8* s = ustrp;
-           U8* e = ustrp + *lenp;
-           while (s < e) {
+           U8* send = ustrp + *lenp;
+           while (s < send) {
                 if (isASCII(*s)) {
                     /* Crossed, have to return the original */
                     original = valid_utf8_to_uvchr(p, lenp);
@@ -4084,7 +3938,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
       * works. */
  
      *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
-    Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
+    Copy(LATIN_SMALL_LETTER_LONG_S_UTF8   LATIN_SMALL_LETTER_LONG_S_UTF8,
          ustrp, *lenp, U8);
      return LATIN_SMALL_LETTER_LONG_S;
  
@@ -4109,1230 +3963,70 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
  
  }
  
-/* Note:
- * Returns a "swash" which is a hash described in utf8.c:Perl_swash_fetch().
- * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
- * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
- */
-
-SV*
-Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv,
-                      I32 minbits, I32 none)
-{
-    PERL_ARGS_ASSERT_SWASH_INIT;
-
-    /* Returns a copy of a swash initiated by the called function.  This is the
-     * public interface, and returning a copy prevents others from doing
-     * mischief on the original */
-
-    return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none,
-                                    NULL, NULL));
-}
-
-SV*
-Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv,
-                            I32 minbits, I32 none, SV* invlist,
-                            U8* const flags_p)
+bool
+Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
  {
+    /* May change: warns if surrogates, non-character code points, or
+     * non-Unicode code points are in 's' which has length 'len' bytes.
+     * Returns TRUE if none found; FALSE otherwise.  The only other validity
+     * check is to make sure that this won't exceed the string's length nor
+     * overflow */
  
-    /*NOTE NOTE NOTE - If you want to use "return" in this routine you MUST
-     * use the following define */
+    const U8* const e = s + len;
+    bool ok = TRUE;
  
-#define CORE_SWASH_INIT_RETURN(x)   \
-    PL_curpm= old_PL_curpm;         \
-    return x
+    PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
  
-    /* Initialize and return a swash, creating it if necessary.  It does this
-     * by calling utf8_heavy.pl in the general case.  The returned value may be
-     * the swash's inversion list instead if the input parameters allow it.
-     * Which is returned should be immaterial to callers, as the only
-     * operations permitted on a swash, swash_fetch(), _get_swash_invlist(),
-     * and swash_to_invlist() handle both these transparently.
-     *
-     * This interface should only be used by functions that won't destroy or
-     * adversely change the swash, as doing so affects all other uses of the
-     * swash in the program; the general public should use 'Perl_swash_init'
-     * instead.
-     *
-     * pkg  is the name of the package that <name> should be in.
-     * name is the name of the swash to find.  Typically it is a Unicode
-     *     property name, including user-defined ones
-     * listsv is a string to initialize the swash with.  It must be of the form
-     *     documented as the subroutine return value in
-     *     L<perlunicode/User-Defined Character Properties>
-     * minbits is the number of bits required to represent each data element.
-     *     It is '1' for binary properties.
-     * none I (khw) do not understand this one, but it is used only in tr///.
-     * invlist is an inversion list to initialize the swash with (or NULL)
-     * flags_p if non-NULL is the address of various input and output flag bits
-     *      to the routine, as follows:  ('I' means is input to the routine;
-     *      'O' means output from the routine.  Only flags marked O are
-     *      meaningful on return.)
-     *  _CORE_SWASH_INIT_USER_DEFINED_PROPERTY indicates if the swash
-     *      came from a user-defined property.  (I O)
-     *  _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking
-     *      when the swash cannot be located, to simply return NULL. (I)
-     *  _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a
-     *      return of an inversion list instead of a swash hash if this routine
-     *      thinks that would result in faster execution of swash_fetch() later
-     *      on. (I)
-     *
-     * Thus there are three possible inputs to find the swash: <name>,
-     * <listsv>, and <invlist>.  At least one must be specified.  The result
-     * will be the union of the specified ones, although <listsv>'s various
-     * actions can intersect, etc. what <name> gives.  To avoid going out to
-     * disk at all, <invlist> should specify completely what the swash should
-     * have, and <listsv> should be &PL_sv_undef and <name> should be "".
-     *
-     * <invlist> is only valid for binary properties */
-
-    PMOP *old_PL_curpm= PL_curpm; /* save away the old PL_curpm */
-
-    SV* retval = &PL_sv_undef;
-    HV* swash_hv = NULL;
-    const bool use_invlist= (flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST);
-
-    assert(listsv != &PL_sv_undef || strNE(name, "") || invlist);
-    assert(! invlist || minbits == 1);
-
-    PL_curpm= NULL; /* reset PL_curpm so that we dont get confused between the
-                       regex that triggered the swash init and the swash init
-                       perl logic itself.  See perl #122747 */
-
-    /* If data was passed in to go out to utf8_heavy to find the swash of, do
-     * so */
-    if (listsv != &PL_sv_undef || strNE(name, "")) {
-       dSP;
-       const size_t pkg_len = strlen(pkg);
-       const size_t name_len = strlen(name);
-       HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
-       SV* errsv_save;
-       GV *method;
-
-       PERL_ARGS_ASSERT__CORE_SWASH_INIT;
-
-       PUSHSTACKi(PERLSI_MAGIC);
-       ENTER;
-       SAVEHINTS();
-       save_re_context();
-       /* We might get here via a subroutine signature which uses a utf8
-        * parameter name, at which point PL_subname will have been set
-        * but not yet used. */
-       save_item(PL_subname);
-       if (PL_parser && PL_parser->error_count)
-           SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
-       method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
-       if (!method) {  /* demand load UTF-8 */
-           ENTER;
-           if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
-           GvSV(PL_errgv) = NULL;
-#ifndef NO_TAINT_SUPPORT
-           /* It is assumed that callers of this routine are not passing in
-            * any user derived data.  */
-           /* Need to do this after save_re_context() as it will set
-            * PL_tainted to 1 while saving $1 etc (see the code after getrx:
-            * in Perl_magic_get).  Even line to create errsv_save can turn on
-            * PL_tainted.  */
-           SAVEBOOL(TAINT_get);
-           TAINT_NOT;
-#endif
-           Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
-                            NULL);
-           {
-               /* Not ERRSV, as there is no need to vivify a scalar we are
-                  about to discard. */
-               SV * const errsv = GvSV(PL_errgv);
-               if (!SvTRUE(errsv)) {
-                   GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
-                   SvREFCNT_dec(errsv);
-               }
-           }
-           LEAVE;
-       }
-       SPAGAIN;
-       PUSHMARK(SP);
-       EXTEND(SP,5);
-       mPUSHp(pkg, pkg_len);
-       mPUSHp(name, name_len);
-       PUSHs(listsv);
-       mPUSHi(minbits);
-       mPUSHi(none);
-       PUTBACK;
-       if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
-       GvSV(PL_errgv) = NULL;
-       /* If we already have a pointer to the method, no need to use
-        * call_method() to repeat the lookup.  */
-       if (method
-            ? call_sv(MUTABLE_SV(method), G_SCALAR)
-           : call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR | G_METHOD))
-       {
-           retval = *PL_stack_sp--;
-           SvREFCNT_inc(retval);
+    while (s < e) {
+       if (UTF8SKIP(s) > len) {
+           Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
+                          "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
+           return FALSE;
         }
-       {
-           /* Not ERRSV.  See above. */
-           SV * const errsv = GvSV(PL_errgv);
-           if (!SvTRUE(errsv)) {
-               GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
-               SvREFCNT_dec(errsv);
+       if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) {
+           if (UNLIKELY(UTF8_IS_SUPER(s, e))) {
+                if (   ckWARN_d(WARN_NON_UNICODE)
+                    || UNLIKELY(0 < does_utf8_overflow(s, s + len,
+                                               0 /* Don't consider overlongs */
+                                               )))
+                {
+                    /* A side effect of this function will be to warn */
+                    (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_SUPER);
+                    ok = FALSE;
+                }
             }
-       }
-       LEAVE;
-       POPSTACK;
-       if (IN_PERL_COMPILETIME) {
-           CopHINTS_set(PL_curcop, PL_hints);
-       }
-       if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
-           if (SvPOK(retval)) {
-
-               /* If caller wants to handle missing properties, let them */
-               if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) {
-                    CORE_SWASH_INIT_RETURN(NULL);
+           else if (UNLIKELY(UTF8_IS_SURROGATE(s, e))) {
+               if (ckWARN_d(WARN_SURROGATE)) {
+                    /* This has a different warning than the one the called
+                     * function would output, so can't just call it, unlike we
+                     * do for the non-chars and above-unicodes */
+                   UV uv = utf8_to_uvchr_buf(s, e, NULL);
+                   Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
+                       "Unicode surrogate U+%04" UVXf " is illegal in UTF-8",
+                                             uv);
+                   ok = FALSE;
                 }
-               Perl_croak(aTHX_
-                          "Can't find Unicode property definition \"%" SVf "\"",
-                          SVfARG(retval));
-                NOT_REACHED; /* NOTREACHED */
-            }
-       }
-    } /* End of calling the module to find the swash */
-
-    /* If this operation fetched a swash, and we will need it later, get it */
-    if (retval != &PL_sv_undef
-        && (minbits == 1 || (flags_p
-                            && ! (*flags_p
-                                  & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY))))
-    {
-        swash_hv = MUTABLE_HV(SvRV(retval));
-
-        /* If we don't already know that there is a user-defined component to
-         * this swash, and the user has indicated they wish to know if there is
-         * one (by passing <flags_p>), find out */
-        if (flags_p && ! (*flags_p & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)) {
-            SV** user_defined = hv_fetchs(swash_hv, "USER_DEFINED", FALSE);
-            if (user_defined && SvUV(*user_defined)) {
-                *flags_p |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
-            }
-        }
-    }
-
-    /* Make sure there is an inversion list for binary properties */
-    if (minbits == 1) {
-       SV** swash_invlistsvp = NULL;
-       SV* swash_invlist = NULL;
-       bool invlist_in_swash_is_valid = FALSE;
-       bool swash_invlist_unclaimed = FALSE; /* whether swash_invlist has
-                                           an unclaimed reference count */
-
-        /* If this operation fetched a swash, get its already existing
-         * inversion list, or create one for it */
-
-        if (swash_hv) {
-           swash_invlistsvp = hv_fetchs(swash_hv, "V", FALSE);
-           if (swash_invlistsvp) {
-               swash_invlist = *swash_invlistsvp;
-               invlist_in_swash_is_valid = TRUE;
-           }
-           else {
-               swash_invlist = _swash_to_invlist(retval);
-               swash_invlist_unclaimed = TRUE;
-           }
-       }
-
-       /* If an inversion list was passed in, have to include it */
-       if (invlist) {
-
-            /* Any fetched swash will by now have an inversion list in it;
-             * otherwise <swash_invlist>  will be NULL, indicating that we
-             * didn't fetch a swash */
-           if (swash_invlist) {
-
-               /* Add the passed-in inversion list, which invalidates the one
-                * already stored in the swash */
-               invlist_in_swash_is_valid = FALSE;
-                SvREADONLY_off(swash_invlist);  /* Turned on again below */
-               _invlist_union(invlist, swash_invlist, &swash_invlist);
-           }
-           else {
-
-                /* Here, there is no swash already.  Set up a minimal one, if
-                 * we are going to return a swash */
-                if (! use_invlist) {
-                    swash_hv = newHV();
-                    retval = newRV_noinc(MUTABLE_SV(swash_hv));
-                }
-               swash_invlist = invlist;
             }
-       }
-
-        /* Here, we have computed the union of all the passed-in data.  It may
-         * be that there was an inversion list in the swash which didn't get
-         * touched; otherwise save the computed one */
-       if (! invlist_in_swash_is_valid && ! use_invlist) {
-           if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist))
+           else if (   UNLIKELY(UTF8_IS_NONCHAR(s, e))
+                     && (ckWARN_d(WARN_NONCHAR)))
              {
-               Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
+                /* A side effect of this function will be to warn */
+                (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_NONCHAR);
+               ok = FALSE;
             }
-           /* We just stole a reference count. */
-           if (swash_invlist_unclaimed) swash_invlist_unclaimed = FALSE;
-           else SvREFCNT_inc_simple_void_NN(swash_invlist);
         }
-
-        /* The result is immutable.  Forbid attempts to change it. */
-        SvREADONLY_on(swash_invlist);
-
-        if (use_invlist) {
-           SvREFCNT_dec(retval);
-           if (!swash_invlist_unclaimed)
-               SvREFCNT_inc_simple_void_NN(swash_invlist);
-            retval = newRV_noinc(swash_invlist);
-        }
+       s += UTF8SKIP(s);
      }
  
-    CORE_SWASH_INIT_RETURN(retval);
-#undef CORE_SWASH_INIT_RETURN
+    return ok;
  }
  
+/*
+=for apidoc pv_uni_display
  
-/* This API is wrong for special case conversions since we may need to
- * return several Unicode characters for a single Unicode character
- * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
- * the lower-level routine, and it is similarly broken for returning
- * multiple values.  --jhi
- * For those, you should use S__to_utf8_case() instead */
-/* Now SWASHGET is recasted into S_swatch_get in this file. */
-
-/* Note:
- * Returns the value of property/mapping C<swash> for the first character
- * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
- * assumed to be in well-formed UTF-8. If C<do_utf8> is false, the string C<ptr>
- * is assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
- *
- * A "swash" is a hash which contains initially the keys/values set up by
- * SWASHNEW.  The purpose is to be able to completely represent a Unicode
- * property for all possible code points.  Things are stored in a compact form
- * (see utf8_heavy.pl) so that calculation is required to find the actual
- * property value for a given code point.  As code points are looked up, new
- * key/value pairs are added to the hash, so that the calculation doesn't have
- * to ever be re-done.  Further, each calculation is done, not just for the
- * desired one, but for a whole block of code points adjacent to that one.
- * For binary properties on ASCII machines, the block is usually for 64 code
- * points, starting with a code point evenly divisible by 64.  Thus if the
- * property value for code point 257 is requested, the code goes out and
- * calculates the property values for all 64 code points between 256 and 319,
- * and stores these as a single 64-bit long bit vector, called a "swatch",
- * under the key for code point 256.  The key is the UTF-8 encoding for code
- * point 256, minus the final byte.  Thus, if the length of the UTF-8 encoding
- * for a code point is 13 bytes, the key will be 12 bytes long.  If the value
- * for code point 258 is then requested, this code realizes that it would be
- * stored under the key for 256, and would find that value and extract the
- * relevant bit, offset from 256.
- *
- * Non-binary properties are stored in as many bits as necessary to represent
- * their values (32 currently, though the code is more general than that), not
- * as single bits, but the principle is the same: the value for each key is a
- * vector that encompasses the property values for all code points whose UTF-8
- * representations are represented by the key.  That is, for all code points
- * whose UTF-8 representations are length N bytes, and the key is the first N-1
- * bytes of that.
- */
-UV
-Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
-{
-    HV *const hv = MUTABLE_HV(SvRV(swash));
-    U32 klen;
-    U32 off;
-    STRLEN slen = 0;
-    STRLEN needents;
-    const U8 *tmps = NULL;
-    SV *swatch;
-    const U8 c = *ptr;
-
-    PERL_ARGS_ASSERT_SWASH_FETCH;
-
-    /* If it really isn't a hash, it isn't really swash; must be an inversion
-     * list */
-    if (SvTYPE(hv) != SVt_PVHV) {
-        return _invlist_contains_cp((SV*)hv,
-                                    (do_utf8)
-                                     ? valid_utf8_to_uvchr(ptr, NULL)
-                                     : c);
-    }
-
-    /* We store the values in a "swatch" which is a vec() value in a swash
-     * hash.  Code points 0-255 are a single vec() stored with key length
-     * (klen) 0.  All other code points have a UTF-8 representation
-     * 0xAA..0xYY,0xZZ.  A vec() is constructed containing all of them which
-     * share 0xAA..0xYY, which is the key in the hash to that vec.  So the key
-     * length for them is the length of the encoded char - 1.  ptr[klen] is the
-     * final byte in the sequence representing the character */
-    if (!do_utf8 || UTF8_IS_INVARIANT(c)) {
-        klen = 0;
-       needents = 256;
-        off = c;
-    }
-    else if (UTF8_IS_DOWNGRADEABLE_START(c)) {
-        klen = 0;
-       needents = 256;
-        off = EIGHT_BIT_UTF8_TO_NATIVE(c, *(ptr + 1));
-    }
-    else {
-        klen = UTF8SKIP(ptr) - 1;
-
-        /* Each vec() stores 2**UTF_ACCUMULATION_SHIFT values.  The offset into
-         * the vec is the final byte in the sequence.  (In EBCDIC this is
-         * converted to I8 to get consecutive values.)  To help you visualize
-         * all this:
-         *                       Straight 1047   After final byte
-         *             UTF-8      UTF-EBCDIC     I8 transform
-         *  U+0400:  \xD0\x80    \xB8\x41\x41    \xB8\x41\xA0
-         *  U+0401:  \xD0\x81    \xB8\x41\x42    \xB8\x41\xA1
-         *    ...
-         *  U+0409:  \xD0\x89    \xB8\x41\x4A    \xB8\x41\xA9
-         *  U+040A:  \xD0\x8A    \xB8\x41\x51    \xB8\x41\xAA
-         *    ...
-         *  U+0412:  \xD0\x92    \xB8\x41\x59    \xB8\x41\xB2
-         *  U+0413:  \xD0\x93    \xB8\x41\x62    \xB8\x41\xB3
-         *    ...
-         *  U+041B:  \xD0\x9B    \xB8\x41\x6A    \xB8\x41\xBB
-         *  U+041C:  \xD0\x9C    \xB8\x41\x70    \xB8\x41\xBC
-         *    ...
-         *  U+041F:  \xD0\x9F    \xB8\x41\x73    \xB8\x41\xBF
-         *  U+0420:  \xD0\xA0    \xB8\x42\x41    \xB8\x42\x41
-         *
-         * (There are no discontinuities in the elided (...) entries.)
-         * The UTF-8 key for these 33 code points is '\xD0' (which also is the
-         * key for the next 31, up through U+043F, whose UTF-8 final byte is
-         * \xBF).  Thus in UTF-8, each key is for a vec() for 64 code points.
-         * The final UTF-8 byte, which ranges between \x80 and \xBF, is an
-         * index into the vec() swatch (after subtracting 0x80, which we
-         * actually do with an '&').
-         * In UTF-EBCDIC, each key is for a 32 code point vec().  The first 32
-         * code points above have key '\xB8\x41'. The final UTF-EBCDIC byte has
-         * dicontinuities which go away by transforming it into I8, and we
-         * effectively subtract 0xA0 to get the index. */
-       needents = (1 << UTF_ACCUMULATION_SHIFT);
-       off      = NATIVE_UTF8_TO_I8(ptr[klen]) & UTF_CONTINUATION_MASK;
-    }
-
-    /*
-     * This single-entry cache saves about 1/3 of the UTF-8 overhead in test
-     * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
-     * it's nothing to sniff at.)  Pity we usually come through at least
-     * two function calls to get here...
-     *
-     * NB: this code assumes that swatches are never modified, once generated!
-     */
-
-    if (hv   == PL_last_swash_hv &&
-       klen == PL_last_swash_klen &&
-       (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
-    {
-       tmps = PL_last_swash_tmps;
-       slen = PL_last_swash_slen;
-    }
-    else {
-       /* Try our second-level swatch cache, kept in a hash. */
-       SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
-
-       /* If not cached, generate it via swatch_get */
-       if (!svp || !SvPOK(*svp)
-                || !(tmps = (const U8*)SvPV_const(*svp, slen)))
-        {
-            if (klen) {
-                const UV code_point = valid_utf8_to_uvchr(ptr, NULL);
-                swatch = swatch_get(swash,
-                                    code_point & ~((UV)needents - 1),
-                                   needents);
-            }
-            else {  /* For the first 256 code points, the swatch has a key of
-                       length 0 */
-                swatch = swatch_get(swash, 0, needents);
-            }
-
-           if (IN_PERL_COMPILETIME)
-               CopHINTS_set(PL_curcop, PL_hints);
-
-           svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
-
-           if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
-                    || (slen << 3) < needents)
-               Perl_croak(aTHX_ "panic: swash_fetch got improper swatch, "
-                          "svp=%p, tmps=%p, slen=%" UVuf ", needents=%" UVuf,
-                          svp, tmps, (UV)slen, (UV)needents);
-       }
-
-       PL_last_swash_hv = hv;
-       assert(klen <= sizeof(PL_last_swash_key));
-       PL_last_swash_klen = (U8)klen;
-       /* FIXME change interpvar.h?  */
-       PL_last_swash_tmps = (U8 *) tmps;
-       PL_last_swash_slen = slen;
-       if (klen)
-           Copy(ptr, PL_last_swash_key, klen, U8);
-    }
-
-    switch ((int)((slen << 3) / needents)) {
-    case 1:
-       return ((UV) tmps[off >> 3] & (1 << (off & 7))) != 0;
-    case 8:
-       return ((UV) tmps[off]);
-    case 16:
-       off <<= 1;
-       return
-            ((UV) tmps[off    ] << 8) +
-            ((UV) tmps[off + 1]);
-    case 32:
-       off <<= 2;
-       return
-            ((UV) tmps[off    ] << 24) +
-            ((UV) tmps[off + 1] << 16) +
-            ((UV) tmps[off + 2] <<  8) +
-            ((UV) tmps[off + 3]);
-    }
-    Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, "
-              "slen=%" UVuf ", needents=%" UVuf, (UV)slen, (UV)needents);
-    NORETURN_FUNCTION_END;
-}
-
-/* Read a single line of the main body of the swash input text.  These are of
- * the form:
- * 0053        0056    0073
- * where each number is hex.  The first two numbers form the minimum and
- * maximum of a range, and the third is the value associated with the range.
- * Not all swashes should have a third number
- *
- * On input: l   points to the beginning of the line to be examined; it points
- *               to somewhere in the string of the whole input text, and is
- *               terminated by a \n or the null string terminator.
- *          lend   points to the null terminator of that string
- *          wants_value    is non-zero if the swash expects a third number
- *          typestr is the name of the swash's mapping, like 'ToLower'
- * On output: *min, *max, and *val are set to the values read from the line.
- *           returns a pointer just beyond the line examined.  If there was no
- *           valid min number on the line, returns lend+1
- */
-
-STATIC U8*
-S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
-                            const bool wants_value, const U8* const typestr)
-{
-    const int  typeto  = typestr[0] == 'T' && typestr[1] == 'o';
-    STRLEN numlen;         /* Length of the number */
-    I32 flags = PERL_SCAN_SILENT_ILLDIGIT
-               | PERL_SCAN_DISALLOW_PREFIX
-               | PERL_SCAN_SILENT_NON_PORTABLE;
-
-    /* nl points to the next \n in the scan */
-    U8* const nl = (U8*)memchr(l, '\n', lend - l);
-
-    PERL_ARGS_ASSERT_SWASH_SCAN_LIST_LINE;
-
-    /* Get the first number on the line: the range minimum */
-    numlen = lend - l;
-    *min = grok_hex((char *)l, &numlen, &flags, NULL);
-    *max = *min;    /* So can never return without setting max */
-    if (numlen)            /* If found a hex number, position past it */
-       l += numlen;
-    else if (nl) {         /* Else, go handle next line, if any */
-       return nl + 1;  /* 1 is length of "\n" */
-    }
-    else {             /* Else, no next line */
-       return lend + 1;        /* to LIST's end at which \n is not found */
-    }
-
-    /* The max range value follows, separated by a BLANK */
-    if (isBLANK(*l)) {
-       ++l;
-       flags = PERL_SCAN_SILENT_ILLDIGIT
-               | PERL_SCAN_DISALLOW_PREFIX
-               | PERL_SCAN_SILENT_NON_PORTABLE;
-       numlen = lend - l;
-       *max = grok_hex((char *)l, &numlen, &flags, NULL);
-       if (numlen)
-           l += numlen;
-       else    /* If no value here, it is a single element range */
-           *max = *min;
-
-       /* Non-binary tables have a third entry: what the first element of the
-        * range maps to.  The map for those currently read here is in hex */
-       if (wants_value) {
-           if (isBLANK(*l)) {
-               ++l;
-                flags = PERL_SCAN_SILENT_ILLDIGIT
-                    | PERL_SCAN_DISALLOW_PREFIX
-                    | PERL_SCAN_SILENT_NON_PORTABLE;
-                numlen = lend - l;
-                *val = grok_hex((char *)l, &numlen, &flags, NULL);
-                if (numlen)
-                    l += numlen;
-                else
-                    *val = 0;
-           }
-           else {
-               *val = 0;
-               if (typeto) {
-                   /* diag_listed_as: To%s: illegal mapping '%s' */
-                   Perl_croak(aTHX_ "%s: illegal mapping '%s'",
-                                    typestr, l);
-               }
-           }
-       }
-       else
-           *val = 0; /* bits == 1, then any val should be ignored */
-    }
-    else { /* Nothing following range min, should be single element with no
-             mapping expected */
-       if (wants_value) {
-           *val = 0;
-           if (typeto) {
-               /* diag_listed_as: To%s: illegal mapping '%s' */
-               Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
-           }
-       }
-       else
-           *val = 0; /* bits == 1, then val should be ignored */
-    }
-
-    /* Position to next line if any, or EOF */
-    if (nl)
-       l = nl + 1;
-    else
-       l = lend;
-
-    return l;
-}
-
-/* Note:
- * Returns a swatch (a bit vector string) for a code point sequence
- * that starts from the value C<start> and comprises the number C<span>.
- * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
- * Should be used via swash_fetch, which will cache the swatch in C<swash>.
- */
-STATIC SV*
-S_swatch_get(pTHX_ SV* swash, UV start, UV span)
-{
-    SV *swatch;
-    U8 *l, *lend, *x, *xend, *s, *send;
-    STRLEN lcur, xcur, scur;
-    HV *const hv = MUTABLE_HV(SvRV(swash));
-    SV** const invlistsvp = hv_fetchs(hv, "V", FALSE);
-
-    SV** listsvp = NULL; /* The string containing the main body of the table */
-    SV** extssvp = NULL;
-    SV** invert_it_svp = NULL;
-    U8* typestr = NULL;
-    STRLEN bits;
-    STRLEN octets; /* if bits == 1, then octets == 0 */
-    UV  none;
-    UV  end = start + span;
-
-    if (invlistsvp == NULL) {
-        SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
-        SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
-        SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
-        extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
-        listsvp = hv_fetchs(hv, "LIST", FALSE);
-        invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
-
-       bits  = SvUV(*bitssvp);
-       none  = SvUV(*nonesvp);
-       typestr = (U8*)SvPV_nolen(*typesvp);
-    }
-    else {
-       bits = 1;
-       none = 0;
-    }
-    octets = bits >> 3; /* if bits == 1, then octets == 0 */
-
-    PERL_ARGS_ASSERT_SWATCH_GET;
-
-    if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
-       Perl_croak(aTHX_ "panic: swatch_get doesn't expect bits %" UVuf,
-                                                (UV)bits);
-    }
-
-    /* If overflowed, use the max possible */
-    if (end < start) {
-       end = UV_MAX;
-       span = end - start;
-    }
-
-    /* create and initialize $swatch */
-    scur   = octets ? (span * octets) : (span + 7) / 8;
-    swatch = newSV(scur);
-    SvPOK_on(swatch);
-    s = (U8*)SvPVX(swatch);
-    if (octets && none) {
-       const U8* const e = s + scur;
-       while (s < e) {
-           if (bits == 8)
-               *s++ = (U8)(none & 0xff);
-           else if (bits == 16) {
-               *s++ = (U8)((none >>  8) & 0xff);
-               *s++ = (U8)( none        & 0xff);
-           }
-           else if (bits == 32) {
-               *s++ = (U8)((none >> 24) & 0xff);
-               *s++ = (U8)((none >> 16) & 0xff);
-               *s++ = (U8)((none >>  8) & 0xff);
-               *s++ = (U8)( none        & 0xff);
-           }
-       }
-       *s = '\0';
-    }
-    else {
-       (void)memzero((U8*)s, scur + 1);
-    }
-    SvCUR_set(swatch, scur);
-    s = (U8*)SvPVX(swatch);
-
-    if (invlistsvp) {  /* If has an inversion list set up use that */
-       _invlist_populate_swatch(*invlistsvp, start, end, s);
-        return swatch;
-    }
-
-    /* read $swash->{LIST} */
-    l = (U8*)SvPV(*listsvp, lcur);
-    lend = l + lcur;
-    while (l < lend) {
-       UV min, max, val, upper;
-       l = swash_scan_list_line(l, lend, &min, &max, &val,
-                                                        cBOOL(octets), typestr);
-       if (l > lend) {
-           break;
-       }
-
-       /* If looking for something beyond this range, go try the next one */
-       if (max < start)
-           continue;
-
-       /* <end> is generally 1 beyond where we want to set things, but at the
-        * platform's infinity, where we can't go any higher, we want to
-        * include the code point at <end> */
-        upper = (max < end)
-                ? max
-                : (max != UV_MAX || end != UV_MAX)
-                  ? end - 1
-                  : end;
-
-       if (octets) {
-           UV key;
-           if (min < start) {
-               if (!none || val < none) {
-                   val += start - min;
-               }
-               min = start;
-           }
-           for (key = min; key <= upper; key++) {
-               STRLEN offset;
-               /* offset must be non-negative (start <= min <= key < end) */
-               offset = octets * (key - start);
-               if (bits == 8)
-                   s[offset] = (U8)(val & 0xff);
-               else if (bits == 16) {
-                   s[offset    ] = (U8)((val >>  8) & 0xff);
-                   s[offset + 1] = (U8)( val        & 0xff);
-               }
-               else if (bits == 32) {
-                   s[offset    ] = (U8)((val >> 24) & 0xff);
-                   s[offset + 1] = (U8)((val >> 16) & 0xff);
-                   s[offset + 2] = (U8)((val >>  8) & 0xff);
-                   s[offset + 3] = (U8)( val        & 0xff);
-               }
-
-               if (!none || val < none)
-                   ++val;
-           }
-       }
-       else { /* bits == 1, then val should be ignored */
-           UV key;
-           if (min < start)
-               min = start;
-
-           for (key = min; key <= upper; key++) {
-               const STRLEN offset = (STRLEN)(key - start);
-               s[offset >> 3] |= 1 << (offset & 7);
-           }
-       }
-    } /* while */
-
-    /* Invert if the data says it should be.  Assumes that bits == 1 */
-    if (invert_it_svp && SvUV(*invert_it_svp)) {
-
-       /* Unicode properties should come with all bits above PERL_UNICODE_MAX
-        * be 0, and their inversion should also be 0, as we don't succeed any
-        * Unicode property matches for non-Unicode code points */
-       if (start <= PERL_UNICODE_MAX) {
-
-           /* The code below assumes that we never cross the
-            * Unicode/above-Unicode boundary in a range, as otherwise we would
-            * have to figure out where to stop flipping the bits.  Since this
-            * boundary is divisible by a large power of 2, and swatches comes
-            * in small powers of 2, this should be a valid assumption */
-           assert(start + span - 1 <= PERL_UNICODE_MAX);
-
-           send = s + scur;
-           while (s < send) {
-               *s = ~(*s);
-               s++;
-           }
-       }
-    }
-
-    /* read $swash->{EXTRAS}
-     * This code also copied to swash_to_invlist() below */
-    x = (U8*)SvPV(*extssvp, xcur);
-    xend = x + xcur;
-    while (x < xend) {
-       STRLEN namelen;
-       U8 *namestr;
-       SV** othersvp;
-       HV* otherhv;
-       STRLEN otherbits;
-       SV **otherbitssvp, *other;
-       U8 *s, *o, *nl;
-       STRLEN slen, olen;
-
-       const U8 opc = *x++;
-       if (opc == '\n')
-           continue;
-
-       nl = (U8*)memchr(x, '\n', xend - x);
-
-       if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
-           if (nl) {
-               x = nl + 1; /* 1 is length of "\n" */
-               continue;
-           }
-           else {
-               x = xend; /* to EXTRAS' end at which \n is not found */
-               break;
-           }
-       }
-
-       namestr = x;
-       if (nl) {
-           namelen = nl - namestr;
-           x = nl + 1;
-       }
-       else {
-           namelen = xend - namestr;
-           x = xend;
-       }
-
-       othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
-       otherhv = MUTABLE_HV(SvRV(*othersvp));
-       otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
-       otherbits = (STRLEN)SvUV(*otherbitssvp);
-       if (bits < otherbits)
-           Perl_croak(aTHX_ "panic: swatch_get found swatch size mismatch, "
-                      "bits=%" UVuf ", otherbits=%" UVuf, (UV)bits, (UV)otherbits);
-
-       /* The "other" swatch must be destroyed after. */
-       other = swatch_get(*othersvp, start, span);
-       o = (U8*)SvPV(other, olen);
-
-       if (!olen)
-           Perl_croak(aTHX_ "panic: swatch_get got improper swatch");
-
-       s = (U8*)SvPV(swatch, slen);
-       if (bits == 1 && otherbits == 1) {
-           if (slen != olen)
-               Perl_croak(aTHX_ "panic: swatch_get found swatch length "
-                          "mismatch, slen=%" UVuf ", olen=%" UVuf,
-                          (UV)slen, (UV)olen);
-
-           switch (opc) {
-           case '+':
-               while (slen--)
-                   *s++ |= *o++;
-               break;
-           case '!':
-               while (slen--)
-                   *s++ |= ~*o++;
-               break;
-           case '-':
-               while (slen--)
-                   *s++ &= ~*o++;
-               break;
-           case '&':
-               while (slen--)
-                   *s++ &= *o++;
-               break;
-           default:
-               break;
-           }
-       }
-       else {
-           STRLEN otheroctets = otherbits >> 3;
-           STRLEN offset = 0;
-           U8* const send = s + slen;
-
-           while (s < send) {
-               UV otherval = 0;
-
-               if (otherbits == 1) {
-                   otherval = (o[offset >> 3] >> (offset & 7)) & 1;
-                   ++offset;
-               }
-               else {
-                   STRLEN vlen = otheroctets;
-                   otherval = *o++;
-                   while (--vlen) {
-                       otherval <<= 8;
-                       otherval |= *o++;
-                   }
-               }
-
-               if (opc == '+' && otherval)
-                   NOOP;   /* replace with otherval */
-               else if (opc == '!' && !otherval)
-                   otherval = 1;
-               else if (opc == '-' && otherval)
-                   otherval = 0;
-               else if (opc == '&' && !otherval)
-                   otherval = 0;
-               else {
-                   s += octets; /* no replacement */
-                   continue;
-               }
-
-               if (bits == 8)
-                   *s++ = (U8)( otherval & 0xff);
-               else if (bits == 16) {
-                   *s++ = (U8)((otherval >>  8) & 0xff);
-                   *s++ = (U8)( otherval        & 0xff);
-               }
-               else if (bits == 32) {
-                   *s++ = (U8)((otherval >> 24) & 0xff);
-                   *s++ = (U8)((otherval >> 16) & 0xff);
-                   *s++ = (U8)((otherval >>  8) & 0xff);
-                   *s++ = (U8)( otherval        & 0xff);
-               }
-           }
-       }
-       sv_free(other); /* through with it! */
-    } /* while */
-    return swatch;
-}
-
-SV*
-Perl__swash_to_invlist(pTHX_ SV* const swash)
-{
-
-   /* Subject to change or removal.  For use only in one place in regcomp.c.
-    * Ownership is given to one reference count in the returned SV* */
-
-    U8 *l, *lend;
-    char *loc;
-    STRLEN lcur;
-    HV *const hv = MUTABLE_HV(SvRV(swash));
-    UV elements = 0;    /* Number of elements in the inversion list */
-    U8 empty[] = "";
-    SV** listsvp;
-    SV** typesvp;
-    SV** bitssvp;
-    SV** extssvp;
-    SV** invert_it_svp;
-
-    U8* typestr;
-    STRLEN bits;
-    STRLEN octets; /* if bits == 1, then octets == 0 */
-    U8 *x, *xend;
-    STRLEN xcur;
-
-    SV* invlist;
-
-    PERL_ARGS_ASSERT__SWASH_TO_INVLIST;
-
-    /* If not a hash, it must be the swash's inversion list instead */
-    if (SvTYPE(hv) != SVt_PVHV) {
-        return SvREFCNT_inc_simple_NN((SV*) hv);
-    }
-
-    /* The string containing the main body of the table */
-    listsvp = hv_fetchs(hv, "LIST", FALSE);
-    typesvp = hv_fetchs(hv, "TYPE", FALSE);
-    bitssvp = hv_fetchs(hv, "BITS", FALSE);
-    extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
-    invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
-
-    typestr = (U8*)SvPV_nolen(*typesvp);
-    bits  = SvUV(*bitssvp);
-    octets = bits >> 3; /* if bits == 1, then octets == 0 */
-
-    /* read $swash->{LIST} */
-    if (SvPOK(*listsvp)) {
-       l = (U8*)SvPV(*listsvp, lcur);
-    }
-    else {
-       /* LIST legitimately doesn't contain a string during compilation phases
-        * of Perl itself, before the Unicode tables are generated.  In this
-        * case, just fake things up by creating an empty list */
-       l = empty;
-       lcur = 0;
-    }
-    loc = (char *) l;
-    lend = l + lcur;
-
-    if (*l == 'V') {    /*  Inversion list format */
-        const char *after_atou = (char *) lend;
-        UV element0;
-        UV* other_elements_ptr;
-
-        /* The first number is a count of the rest */
-        l++;
-        if (!grok_atoUV((const char *)l, &elements, &after_atou)) {
-            Perl_croak(aTHX_ "panic: Expecting a valid count of elements"
-                             " at start of inversion list");
-        }
-        if (elements == 0) {
-            invlist = _new_invlist(0);
-        }
-        else {
-            l = (U8 *) after_atou;
-
-            /* Get the 0th element, which is needed to setup the inversion list
-             * */
-            while (isSPACE(*l)) l++;
-            after_atou = (char *) lend;
-            if (!grok_atoUV((const char *)l, &element0, &after_atou)) {
-                Perl_croak(aTHX_ "panic: Expecting a valid 0th element for"
-                                 " inversion list");
-            }
-            l = (U8 *) after_atou;
-            invlist = _setup_canned_invlist(elements, element0,
-                                            &other_elements_ptr);
-            elements--;
-
-            /* Then just populate the rest of the input */
-            while (elements-- > 0) {
-                if (l > lend) {
-                    Perl_croak(aTHX_ "panic: Expecting %" UVuf " more"
-                                     " elements than available", elements);
-                }
-                while (isSPACE(*l)) l++;
-                after_atou = (char *) lend;
-                if (!grok_atoUV((const char *)l, other_elements_ptr++,
-                                 &after_atou))
-                {
-                    Perl_croak(aTHX_ "panic: Expecting a valid element"
-                                     " in inversion list");
-                }
-                l = (U8 *) after_atou;
-            }
-        }
-    }
-    else {
-
-        /* Scan the input to count the number of lines to preallocate array
-         * size based on worst possible case, which is each line in the input
-         * creates 2 elements in the inversion list: 1) the beginning of a
-         * range in the list; 2) the beginning of a range not in the list.  */
-        while ((loc = (char *) memchr(loc, '\n', lend - (U8 *) loc)) != NULL) {
-            elements += 2;
-            loc++;
-        }
-
-        /* If the ending is somehow corrupt and isn't a new line, add another
-         * element for the final range that isn't in the inversion list */
-        if (! (*lend == '\n'
-            || (*lend == '\0' && (lcur == 0 || *(lend - 1) == '\n'))))
-        {
-            elements++;
-        }
-
-        invlist = _new_invlist(elements);
-
-        /* Now go through the input again, adding each range to the list */
-        while (l < lend) {
-            UV start, end;
-            UV val;            /* Not used by this function */
-
-            l = swash_scan_list_line(l, lend, &start, &end, &val,
-                                                        cBOOL(octets), typestr);
-
-            if (l > lend) {
-                break;
-            }
-
-            invlist = _add_range_to_invlist(invlist, start, end);
-        }
-    }
-
-    /* Invert if the data says it should be */
-    if (invert_it_svp && SvUV(*invert_it_svp)) {
-       _invlist_invert(invlist);
-    }
-
-    /* This code is copied from swatch_get()
-     * read $swash->{EXTRAS} */
-    x = (U8*)SvPV(*extssvp, xcur);
-    xend = x + xcur;
-    while (x < xend) {
-       STRLEN namelen;
-       U8 *namestr;
-       SV** othersvp;
-       HV* otherhv;
-       STRLEN otherbits;
-       SV **otherbitssvp, *other;
-       U8 *nl;
-
-       const U8 opc = *x++;
-       if (opc == '\n')
-           continue;
-
-       nl = (U8*)memchr(x, '\n', xend - x);
-
-       if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
-           if (nl) {
-               x = nl + 1; /* 1 is length of "\n" */
-               continue;
-           }
-           else {
-               x = xend; /* to EXTRAS' end at which \n is not found */
-               break;
-           }
-       }
-
-       namestr = x;
-       if (nl) {
-           namelen = nl - namestr;
-           x = nl + 1;
-       }
-       else {
-           namelen = xend - namestr;
-           x = xend;
-       }
-
-       othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
-       otherhv = MUTABLE_HV(SvRV(*othersvp));
-       otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
-       otherbits = (STRLEN)SvUV(*otherbitssvp);
-
-       if (bits != otherbits || bits != 1) {
-           Perl_croak(aTHX_ "panic: _swash_to_invlist only operates on boolean "
-                      "properties, bits=%" UVuf ", otherbits=%" UVuf,
-                      (UV)bits, (UV)otherbits);
-       }
-
-       /* The "other" swatch must be destroyed after. */
-       other = _swash_to_invlist((SV *)*othersvp);
-
-       /* End of code copied from swatch_get() */
-       switch (opc) {
-       case '+':
-           _invlist_union(invlist, other, &invlist);
-           break;
-       case '!':
-            _invlist_union_maybe_complement_2nd(invlist, other, TRUE, &invlist);
-           break;
-       case '-':
-           _invlist_subtract(invlist, other, &invlist);
-           break;
-       case '&':
-           _invlist_intersection(invlist, other, &invlist);
-           break;
-       default:
-           break;
-       }
-       sv_free(other); /* through with it! */
-    }
-
-    SvREADONLY_on(invlist);
-    return invlist;
-}
-
-SV*
-Perl__get_swash_invlist(pTHX_ SV* const swash)
-{
-    SV** ptr;
-
-    PERL_ARGS_ASSERT__GET_SWASH_INVLIST;
-
-    if (! SvROK(swash)) {
-        return NULL;
-    }
-
-    /* If it really isn't a hash, it isn't really swash; must be an inversion
-     * list */
-    if (SvTYPE(SvRV(swash)) != SVt_PVHV) {
-        return SvRV(swash);
-    }
-
-    ptr = hv_fetchs(MUTABLE_HV(SvRV(swash)), "V", FALSE);
-    if (! ptr) {
-        return NULL;
-    }
-
-    return *ptr;
-}
-
-bool
-Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
-{
-    /* May change: warns if surrogates, non-character code points, or
-     * non-Unicode code points are in 's' which has length 'len' bytes.
-     * Returns TRUE if none found; FALSE otherwise.  The only other validity
-     * check is to make sure that this won't exceed the string's length nor
-     * overflow */
-
-    const U8* const e = s + len;
-    bool ok = TRUE;
-
-    PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
-
-    while (s < e) {
-       if (UTF8SKIP(s) > len) {
-           Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
-                          "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
-           return FALSE;
-       }
-       if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) {
-           if (UNLIKELY(UTF8_IS_SUPER(s, e))) {
-                if (   ckWARN_d(WARN_NON_UNICODE)
-                    || UNLIKELY(0 < does_utf8_overflow(s, s + len,
-                                               0 /* Don't consider overlongs */
-                                               )))
-                {
-                    /* A side effect of this function will be to warn */
-                    (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_SUPER);
-                    ok = FALSE;
-                }
-           }
-           else if (UNLIKELY(UTF8_IS_SURROGATE(s, e))) {
-               if (ckWARN_d(WARN_SURROGATE)) {
-                    /* This has a different warning than the one the called
-                     * function would output, so can't just call it, unlike we
-                     * do for the non-chars and above-unicodes */
-                   UV uv = utf8_to_uvchr_buf(s, e, NULL);
-                   Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
-                       "Unicode surrogate U+%04" UVXf " is illegal in UTF-8",
-                                             uv);
-                   ok = FALSE;
-               }
-           }
-           else if (   UNLIKELY(UTF8_IS_NONCHAR(s, e))
-                     && (ckWARN_d(WARN_NONCHAR)))
-            {
-                /* A side effect of this function will be to warn */
-                (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_NONCHAR);
-               ok = FALSE;
-           }
-       }
-       s += UTF8SKIP(s);
-    }
-
-    return ok;
-}
-
-/*
-=for apidoc pv_uni_display
-
-Build to the scalar C<dsv> a displayable version of the string C<spv>,
-length C<len>, the displayable version being at most C<pvlim> bytes long
-(if longer, the rest is truncated and C<"..."> will be appended).
+Build to the scalar C<dsv> a displayable version of the UTF-8 encoded string
+C<spv>, length C<len>, the displayable version being at most C<pvlim> bytes
+long (if longer, the rest is truncated and C<"..."> will be appended).
  
  The C<flags> argument can have C<UNI_DISPLAY_ISPRINT> set to display
  C<isPRINT()>able characters as themselves, C<UNI_DISPLAY_BACKSLASH>
@@ -5341,6 +4035,9 @@ to display the C<\\[nrfta\\]> as the backslashed versions (like C<"\n">)
  C<UNI_DISPLAY_QQ> (and its alias C<UNI_DISPLAY_REGEX>) have both
  C<UNI_DISPLAY_BACKSLASH> and C<UNI_DISPLAY_ISPRINT> turned on.
  
+Additionally, there is now C<UNI_DISPLAY_BACKSPACE> which allows C<\b> for a
+backspace, but only when C<UNI_DISPLAY_BACKSLASH> also is set.
+
  The pointer to the PV of the C<dsv> is returned.
  
  See also L</sv_uni_display>.
@@ -5359,10 +4056,7 @@ Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim,
      SvUTF8_off(dsv);
      for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
          UV u;
-         /* This serves double duty as a flag and a character to print after
-            a \ when flags & UNI_DISPLAY_BACKSLASH is true.
-         */
-        char ok = 0;
+        bool ok = 0;
  
          if (pvlim && SvCUR(dsv) >= pvlim) {
               truncated++;
@@ -5372,27 +4066,19 @@ Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim,
          if (u < 256) {
              const unsigned char c = (unsigned char)u & 0xFF;
              if (flags & UNI_DISPLAY_BACKSLASH) {
-                switch (c) {
-                case '\n':
-                    ok = 'n'; break;
-                case '\r':
-                    ok = 'r'; break;
-                case '\t':
-                    ok = 't'; break;
-                case '\f':
-                    ok = 'f'; break;
-                case '\a':
-                    ok = 'a'; break;
-                case '\\':
-                    ok = '\\'; break;
-                default: break;
-                }
-                if (ok) {
-                    const char string = ok;
-                    sv_catpvs(dsv, "\\");
-                    sv_catpvn(dsv, &string, 1);
-                }
-            }
+                 if (    isMNEMONIC_CNTRL(c)
+                     && (   c != '\b'
+                         || (flags & UNI_DISPLAY_BACKSPACE)))
+                 {
+                    const char * mnemonic = cntrl_to_mnemonic(c);
+                    sv_catpvn(dsv, mnemonic, strlen(mnemonic));
+                    ok = 1;
+                 }
+                 else if (c == '\\') {
+                    sv_catpvs(dsv, "\\\\");
+                    ok = 1;
+                 }
+             }
              /* isPRINT() is the locale-blind version. */
              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
                  const char string = c;
@@ -5474,7 +4160,7 @@ beyond what was matched.  Correspondingly for C<pe2> and C<s2>.
  
  For case-insensitiveness, the "casefolding" of Unicode is used
  instead of upper/lowercasing both the characters, see
-L<http://www.unicode.org/unicode/reports/tr21/> (Case Mappings).
+L<https://www.unicode.org/unicode/reports/tr21/> (Case Mappings).
  
  =cut */
  
@@ -5496,7 +4182,13 @@ L<http://www.unicode.org/unicode/reports/tr21/> (Case Mappings).
   *                          that effect.  However, if the caller knows what
   *                          it's doing, it can pass this flag to indicate that,
   *                          and the assertion is skipped.
- *  FOLDEQ_S2_ALREADY_FOLDED  Similarly.
+ *  FOLDEQ_S2_ALREADY_FOLDED  Similar to FOLDEQ_S1_ALREADY_FOLDED, but applies
+ *                          to s2, and s2 doesn't have to be UTF-8 encoded.
+ *                          This introduces an asymmetry to save a few branches
+ *                          in a loop.  Currently, this is not a problem, as
+ *                          never are both inputs pre-folded.  Simply call this
+ *                          function with the pre-folded one as the second
+ *                          string.
   *  FOLDEQ_S2_FOLDS_SANE
   */
  I32
@@ -5519,11 +4211,11 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1,
  
      PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
  
-    assert( ! ((flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE))
-               && (((flags & FOLDEQ_S1_ALREADY_FOLDED)
-                     && !(flags & FOLDEQ_S1_FOLDS_SANE))
-                   || ((flags & FOLDEQ_S2_ALREADY_FOLDED)
-                       && !(flags & FOLDEQ_S2_FOLDS_SANE)))));
+    assert( ! (             (flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE))
+               && ((        (flags &  FOLDEQ_S1_ALREADY_FOLDED)
+                        && !(flags &  FOLDEQ_S1_FOLDS_SANE))
+                    || (    (flags &  FOLDEQ_S2_ALREADY_FOLDED)
+                        && !(flags &  FOLDEQ_S2_FOLDS_SANE)))));
      /* The algorithm is to trial the folds without regard to the flags on
       * the first line of the above assert(), and then see if the result
       * violates them.  This means that the inputs can't be pre-folded to a
@@ -5537,12 +4229,20 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1,
  
      if (flags & FOLDEQ_LOCALE) {
          if (IN_UTF8_CTYPE_LOCALE) {
-            flags &= ~FOLDEQ_LOCALE;
+            if (UNLIKELY(PL_in_utf8_turkic_locale)) {
+                flags_for_folder |= FOLD_FLAGS_LOCALE;
+            }
+            else {
+                flags &= ~FOLDEQ_LOCALE;
+            }
          }
          else {
              flags_for_folder |= FOLD_FLAGS_LOCALE;
          }
      }
+    if (flags & FOLDEQ_UTF8_NOMIX_ASCII) {
+        flags_for_folder |= FOLD_FLAGS_NOMIX_ASCII;
+    }
  
      if (pe1) {
          e1 = *(U8**)pe1;
@@ -5627,9 +4327,23 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1,
  
          if (n2 == 0) {    /* Same for s2 */
             if (flags & FOLDEQ_S2_ALREADY_FOLDED) {
-               f2 = (U8 *) p2;
-                assert(u2);
-               n2 = UTF8SKIP(f2);
+
+                /* Point to the already-folded character.  But for non-UTF-8
+                 * variants, convert to UTF-8 for the algorithm below */
+               if (UTF8_IS_INVARIANT(*p2)) {
+                    f2 = (U8 *) p2;
+                    n2 = 1;
+                }
+                else if (u2) {
+                    f2 = (U8 *) p2;
+                    n2 = UTF8SKIP(f2);
+                }
+                else {
+                    foldbuf2[0] = UTF8_EIGHT_BIT_HI(*p2);
+                    foldbuf2[1] = UTF8_EIGHT_BIT_LO(*p2);
+                    f2 = foldbuf2;
+                    n2 = 2;
+                }
             }
             else {
                  if (isASCII(*p2) && ! (flags & FOLDEQ_LOCALE)) {
@@ -5700,648 +4414,6 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1,
      return 1;
  }
  
-/* XXX The next two functions should likely be moved to mathoms.c once all
- * occurrences of them are removed from the core; some cpan-upstream modules
- * still use them */
-
-U8 *
-Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
-{
-    PERL_ARGS_ASSERT_UVUNI_TO_UTF8;
-
-    return uvoffuni_to_utf8_flags(d, uv, 0);
-}
-
-/*
-=for apidoc utf8n_to_uvuni
-
-Instead use L</utf8_to_uvchr_buf>, or rarely, L</utf8n_to_uvchr>.
-
-This function was useful for code that wanted to handle both EBCDIC and
-ASCII platforms with Unicode properties, but starting in Perl v5.20, the
-distinctions between the platforms have mostly been made invisible to most
-code, so this function is quite unlikely to be what you want.  If you do need
-this precise functionality, use instead
-C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>>
-or C<L<NATIVE_TO_UNI(utf8n_to_uvchr(...))|/utf8n_to_uvchr>>.
-
-=cut
-*/
-
-UV
-Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
-{
-    PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
-
-    return NATIVE_TO_UNI(utf8n_to_uvchr(s, curlen, retlen, flags));
-}
-
-/*
-=for apidoc uvuni_to_utf8_flags
-
-Instead you almost certainly want to use L</uvchr_to_utf8> or
-L</uvchr_to_utf8_flags>.
-
-This function is a deprecated synonym for L</uvoffuni_to_utf8_flags>,
-which itself, while not deprecated, should be used only in isolated
-circumstances.  These functions were useful for code that wanted to handle
-both EBCDIC and ASCII platforms with Unicode properties, but starting in Perl
-v5.20, the distinctions between the platforms have mostly been made invisible
-to most code, so this function is quite unlikely to be what you want.
-
-=cut
-*/
-
-U8 *
-Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
-{
-    PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
-
-    return uvoffuni_to_utf8_flags(d, uv, flags);
-}
-
-void
-Perl_init_uniprops(pTHX)
-{
-    /* Set up the inversion list global variables */
-
-    PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_ASCII]);
-    PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXALNUM]);
-    PL_XPosix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXALPHA]);
-    PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXBLANK]);
-    PL_XPosix_ptrs[_CC_CASED] =  _new_invlist_C_array(PL_uni_prop_ptrs[PL_CASED]);
-    PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXCNTRL]);
-    PL_XPosix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXDIGIT]);
-    PL_XPosix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXGRAPH]);
-    PL_XPosix_ptrs[_CC_LOWER] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXLOWER]);
-    PL_XPosix_ptrs[_CC_PRINT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXPRINT]);
-    PL_XPosix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXPUNCT]);
-    PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXSPACE]);
-    PL_XPosix_ptrs[_CC_UPPER] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXUPPER]);
-    PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_VERTSPACE]);
-    PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXWORD]);
-    PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_XPOSIXXDIGIT]);
-
-    PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_ASCII]);
-    PL_Posix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXALNUM]);
-    PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXALPHA]);
-    PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXBLANK]);
-    PL_Posix_ptrs[_CC_CASED] =  _new_invlist_C_array(PL_uni_prop_ptrs[PL_CASED]);
-    PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXCNTRL]);
-    PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXDIGIT]);
-    PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXGRAPH]);
-    PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXLOWER]);
-    PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXPRINT]);
-    PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXPUNCT]);
-    PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXSPACE]);
-    PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXUPPER]);
-    PL_Posix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_VERTSPACE]);
-    PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXWORD]);
-    PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PL_uni_prop_ptrs[PL_POSIXXDIGIT]);
-
-    PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
-    PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
-    PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
-    PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
-    PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
-
-    PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
-    PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
-    PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
-
-    PL_Assigned_invlist = _new_invlist_C_array(PL_uni_prop_ptrs[PL_ASSIGNED]);
-
-    PL_utf8_perl_idstart = _new_invlist_C_array(PL_uni_prop_ptrs[PL__PERL_IDSTART]);
-    PL_utf8_perl_idcont = _new_invlist_C_array(PL_uni_prop_ptrs[PL__PERL_IDCONT]);
-
-    PL_utf8_charname_begin = _new_invlist_C_array(PL_uni_prop_ptrs[PL__PERL_CHARNAME_BEGIN]);
-    PL_utf8_charname_continue = _new_invlist_C_array(PL_uni_prop_ptrs[PL__PERL_CHARNAME_CONTINUE]);
-
-    PL_utf8_foldable = _new_invlist_C_array(PL_uni_prop_ptrs[PL__PERL_ANY_FOLDS]);
-    PL_HasMultiCharFold = _new_invlist_C_array(PL_uni_prop_ptrs[
-                                            PL__PERL_FOLDS_TO_MULTI_CHAR]);
-    PL_NonL1NonFinalFold = _new_invlist_C_array(
-                                            NonL1_Perl_Non_Final_Folds_invlist);
-
-    PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
-    PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
-    PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
-    PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
-    PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
-    PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
-}
-
-SV *
-Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t len, const bool to_fold, bool * invert)
-{
-    /* Parse the interior meat of \p{} passed to this in 'name' with length 'len',
-     * and return an inversion list if a property with 'name' is found, or NULL
-     * if not.  'name' point to the input with leading and trailing space trimmed.
-     * 'to_fold' indicates if /i is in effect.
-     *
-     * When the return is an inversion list, '*invert' will be set to a boolean
-     * indicating if it should be inverted or not
-     *
-     * This currently doesn't handle all cases.  A NULL return indicates the
-     * caller should try a different approach
-     */
-
-    char* lookup_name;
-    bool stricter = FALSE;
-    bool is_nv_type = FALSE;         /* nv= or numeric_value=, or possibly one
-                                        of the cjk numeric properties (though
-                                        it requires extra effort to compile
-                                        them) */
-    unsigned int i;
-    unsigned int j = 0, lookup_len;
-    int equals_pos = -1;        /* Where the '=' is found, or negative if none */
-    int slash_pos = -1;        /* Where the '/' is found, or negative if none */
-    int table_index = 0;
-    bool starts_with_In_or_Is = FALSE;
-    Size_t lookup_offset = 0;
-
-    PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
-
-    /* The input will be modified into 'lookup_name' */
-    Newx(lookup_name, len, char);
-    SAVEFREEPV(lookup_name);
-
-    /* Parse the input. */
-    for (i = 0; i < len; i++) {
-        char cur = name[i];
-
-        /* These characters can be freely ignored in most situations.  Later it
-         * may turn out we shouldn't have ignored them, and we have to reparse,
-         * but we don't have enough information yet to make that decision */
-        if (cur == '-' || cur == '_' || isSPACE(cur)) {
-            continue;
-        }
-
-        /* Case differences are also ignored.  Our lookup routine assumes
-         * everything is lowercase */
-        if (isUPPER(cur)) {
-            lookup_name[j++] = toLOWER(cur);
-            continue;
-        }
-
-        /* A double colon is either an error, or a package qualifier to a
-         * subroutine user-defined property; neither of which do we currently
-         * handle
-         *
-         * But a single colon is a synonym for '=' */
-        if (cur == ':') {
-            if (i < len - 1 && name[i+1] == ':') {
-                return NULL;
-            }
-            cur = '=';
-        }
-
-        /* Otherwise, this character is part of the name. */
-        lookup_name[j++] = cur;
-
-        /* Only the equals sign needs further processing */
-        if (cur == '=') {
-            equals_pos = j; /* Note where it occurred in the input */
-            break;
-        }
-    }
-
-    /* Here, we are either done with the whole property name, if it was simple;
-     * or are positioned just after the '=' if it is compound. */
-
-    if (equals_pos >= 0) {
-        assert(! stricter); /* We shouldn't have set this yet */
-
-        /* Space immediately after the '=' is ignored */
-        i++;
-        for (; i < len; i++) {
-            if (! isSPACE(name[i])) {
-                break;
-            }
-        }
-
-        /* Certain properties need special handling.  They may optionally be
-         * prefixed by 'is'.  Ignore that prefix for the purposes of checking
-         * if this is one of those properties */
-        if (memBEGINPs(lookup_name, len, "is")) {
-            lookup_offset = 2;
-        }
-
-        /* Then check if it is one of these properties.  This is hard-coded
-         * because easier this way, and the list is unlikely to change.  There
-         * are several properties like this in the Unihan DB, which is unlikely
-         * to be compiled, and they all end with 'numeric'.  The interiors
-         * aren't checked for the precise property.  This would stop working if
-         * a cjk property were to be created that ended with 'numeric' and
-         * wasn't a numeric type */
-        is_nv_type = memEQs(lookup_name + lookup_offset,
-                       j - 1 - lookup_offset, "numericvalue")
-                  || memEQs(lookup_name + lookup_offset,
-                      j - 1 - lookup_offset, "nv")
-                  || (   memENDPs(lookup_name + lookup_offset,
-                            j - 1 - lookup_offset, "numeric")
-                      && (   memBEGINPs(lookup_name + lookup_offset,
-                                      j - 1 - lookup_offset, "cjk")
-                          || memBEGINPs(lookup_name + lookup_offset,
-                                      j - 1 - lookup_offset, "k")));
-        if (   is_nv_type
-            || memEQs(lookup_name + lookup_offset,
-                      j - 1 - lookup_offset, "canonicalcombiningclass")
-            || memEQs(lookup_name + lookup_offset,
-                      j - 1 - lookup_offset, "ccc")
-            || memEQs(lookup_name + lookup_offset,
-                      j - 1 - lookup_offset, "age")
-            || memEQs(lookup_name + lookup_offset,
-                      j - 1 - lookup_offset, "in")
-            || memEQs(lookup_name + lookup_offset,
-                      j - 1 - lookup_offset, "presentin"))
-        {
-            unsigned int k;
-
-            /* What makes these properties special is that the stuff after the
-             * '=' is a number.  Therefore, we can't throw away '-'
-             * willy-nilly, as those could be a minus sign.  Other stricter
-             * rules also apply.  However, these properties all can have the
-             * rhs not be a number, in which case they contain at least one
-             * alphabetic.  In those cases, the stricter rules don't apply.
-             * But the numeric type properties can have the alphas [Ee] to
-             * signify an exponent, and it is still a number with stricter
-             * rules.  So look for an alpha that signifys not-strict */
-            stricter = TRUE;
-            for (k = i; k < len; k++) {
-                if (   isALPHA(name[k])
-                    && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
-                {
-                    stricter = FALSE;
-                    break;
-                }
-            }
-        }
-
-        if (stricter) {
-
-            /* A number may have a leading '+' or '-'.  The latter is retained
-             * */
-            if (name[i] == '+') {
-                i++;
-            }
-            else if (name[i] == '-') {
-                lookup_name[j++] = '-';
-                i++;
-            }
-
-            /* Skip leading zeros including single underscores separating the
-             * zeros, or between the final leading zero and the first other
-             * digit */
-            for (; i < len - 1; i++) {
-                if (   name[i] != '0'
-                    && (name[i] != '_' || ! isDIGIT(name[i+1])))
-                {
-                    break;
-                }
-            }
-        }
-    }
-    else {  /* No '=' */
-
-       /* We are now in a position to determine if this property should have
-        * been parsed using stricter rules.  Only a few are like that, and
-        * unlikely to change. */
-        if (   memBEGINPs(lookup_name, j, "perl")
-            && memNEs(lookup_name + 4, j - 4, "space")
-            && memNEs(lookup_name + 4, j - 4, "word"))
-        {
-            stricter = TRUE;
-
-            /* We set the inputs back to 0 and the code below will reparse,
-             * using strict */
-            i = j = 0;
-        }
-    }
-
-    /* Here, we have either finished the property, or are positioned to parse
-     * the remainder, and we know if stricter rules apply.  Finish out, if not
-     * already done */
-    for (; i < len; i++) {
-        char cur = name[i];
-
-        /* In all instances, case differences are ignored, and we normalize to
-         * lowercase */
-        if (isUPPER(cur)) {
-            lookup_name[j++] = toLOWER(cur);
-            continue;
-        }
-
-        /* An underscore is skipped, but not under strict rules unless it
-         * separates two digits */
-        if (cur == '_') {
-            if (    stricter
-                && (     i == 0 || (int) i == equals_pos || i == len- 1
-                    || ! isDIGIT(name[i-1]) || ! isDIGIT(name[i+1])))
-            {
-                lookup_name[j++] = '_';
-            }
-            continue;
-        }
-
-        /* Hyphens are skipped except under strict */
-        if (cur == '-' && ! stricter) {
-            continue;
-        }
-
-        /* XXX Bug in documentation.  It says white space skipped adjacent to
-         * non-word char.  Maybe we should, but shouldn't skip it next to a dot
-         * in a number */
-        if (isSPACE(cur) && ! stricter) {
-            continue;
-        }
-
-        lookup_name[j++] = cur;
-
-        /* Unless this is a non-trailing slash, we are done with it */
-        if (i >= len - 1 || cur != '/') {
-            continue;
-        }
-
-        slash_pos = j;
-
-        /* A slash in the 'numeric value' property indicates that what follows
-         * is a denominator.  It can have a leading '+' and '0's that should be
-         * skipped.  But we have never allowed a negative denominator, so treat
-         * a minus like every other character.  (No need to rule out a second
-         * '/', as that won't match anything anyway */
-        if (is_nv_type) {
-            i++;
-            if (i < len && name[i] == '+') {
-                i++;
-            }
-
-            /* Skip leading zeros including underscores separating digits */
-            for (; i < len - 1; i++) {
-                if (   name[i] != '0'
-                    && (name[i] != '_' || ! isDIGIT(name[i+1])))
-                {
-                    break;
-                }
-            }
-
-            /* Store the first real character in the denominator */
-            lookup_name[j++] = name[i];
-        }
-    }
-
-    /* Here are completely done parsing the input 'name', and 'lookup_name'
-     * contains a copy, normalized.
-     *
-     * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and
-     * different from without the underscores.  */
-    if (  (   UNLIKELY(memEQs(lookup_name, j, "l"))
-           || UNLIKELY(memEQs(lookup_name, j, "gc=l")))
-        && UNLIKELY(name[len-1] == '_'))
-    {
-        lookup_name[j++] = '&';
-    }
-    else if (len > 2 && name[0] == 'I' && (   name[1] == 'n' || name[1] == 's'))
-    {
-
-        /* Also, if the original input began with 'In' or 'Is', it could be a
-         * subroutine call instead of a property names, which currently isn't
-         * handled by this function.  Subroutine calls can't happen if there is
-         * an '=' in the name */
-        if (equals_pos < 0 && get_cvn_flags(name, len, GV_NOTQUAL) != NULL) {
-            return NULL;
-        }
-
-        starts_with_In_or_Is = TRUE;
-    }
-
-    lookup_len = j;     /* Use a more mnemonic name starting here */
-
-    /* Get the index into our pointer table of the inversion list corresponding
-     * to the property */
-    table_index = match_uniprop((U8 *) lookup_name, lookup_len);
-
-    /* If it didn't find the property */
-    if (table_index == 0) {
-
-        /* If didn't find the property, we try again stripping off any initial
-         * 'In' or 'Is' */
-        if (starts_with_In_or_Is) {
-            lookup_name += 2;
-            lookup_len -= 2;
-            equals_pos -= 2;
-            slash_pos -= 2;
-
-            table_index = match_uniprop((U8 *) lookup_name, lookup_len);
-        }
-
-        if (table_index == 0) {
-            char * canonical;
-
-            /* If not found, and not a numeric type property, isn't a legal
-             * property */
-            if (! is_nv_type) {
-                return NULL;
-            }
-
-            /* But the numeric type properties need more work to decide.  What
-             * we do is make sure we have the number in canonical form and look
-             * that up. */
-
-            if (slash_pos < 0) {    /* No slash */
-
-                /* When it isn't a rational, take the input, convert it to a
-                 * NV, then create a canonical string representation of that
-                 * NV. */
-
-                NV value;
-
-                /* Get the value */
-                if (my_atof3(lookup_name + equals_pos, &value,
-                             lookup_len - equals_pos)
-                          != lookup_name + lookup_len)
-                {
-                    return NULL;
-                }
-
-                /* If the value is an integer, the canonical value is integral */
-                if (Perl_ceil(value) == value) {
-                    canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
-                                                equals_pos, lookup_name, value);
-                }
-                else {  /* Otherwise, it is %e with a known precision */
-                    canonical = Perl_form(aTHX_ "%.*s%.*" NVef,
-                                                equals_pos, lookup_name,
-                                                PL_E_FORMAT_PRECISION, value);
-                }
-            }
-            else {  /* Has a slash.  Create a rational in canonical form  */
-                UV numerator, denominator, gcd, trial;
-                const char * end_ptr;
-                const char * sign = "";
-
-                /* We can't just find the numerator, denominator, and do the
-                 * division, then use the method above, because that is
-                 * inexact.  And the input could be a rational that is within
-                 * epsilon (given our precision) of a valid rational, and would
-                 * then incorrectly compare valid.
-                 *
-                 * We're only interested in the part after the '=' */
-                const char * this_lookup_name = lookup_name + equals_pos;
-                lookup_len -= equals_pos;
-                slash_pos -= equals_pos;
-
-                /* Handle any leading minus */
-                if (this_lookup_name[0] == '-') {
-                    sign = "-";
-                    this_lookup_name++;
-                    lookup_len--;
-                    slash_pos--;
-                }
-
-                /* Convert the numerator to numeric */
-                end_ptr = this_lookup_name + slash_pos;
-                if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
-                    return NULL;
-                }
-
-                /* It better have included all characters before the slash */
-                if (*end_ptr != '/') {
-                    return NULL;
-                }
-
-                /* Set to look at just the denominator */
-                this_lookup_name += slash_pos;
-                lookup_len -= slash_pos;
-                end_ptr = this_lookup_name + lookup_len;
-
-                /* Convert the denominator to numeric */
-                if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
-                    return NULL;
-                }
-
-                /* It better be the rest of the characters, and don't divide by
-                 * 0 */
-                if (   end_ptr != this_lookup_name + lookup_len
-                    || denominator == 0)
-                {
-                    return NULL;
-                }
-
-                /* Get the greatest common denominator using
-                   http://en.wikipedia.org/wiki/Euclidean_algorithm */
-                gcd = numerator;
-                trial = denominator;
-                while (trial != 0) {
-                    UV temp = trial;
-                    trial = gcd % trial;
-                    gcd = temp;
-                }
-
-                /* If already in lowest possible terms, we have already tried
-                 * looking this up */
-                if (gcd == 1) {
-                    return NULL;
-                }
-
-                /* Reduce the rational, which should put it in canonical form.
-                 * Then look it up */
-                numerator /= gcd;
-                denominator /= gcd;
-
-                canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
-                        equals_pos, lookup_name, sign, numerator, denominator);
-            }
-
-            /* Here, we have the number in canonical form.  Try that */
-            table_index = match_uniprop((U8 *) canonical, strlen(canonical));
-            if (table_index == 0) {
-                return NULL;
-            }
-        }
-    }
-
-    /* The return is an index into a table of ptrs.  A negative return
-     * signifies that the real index is the absolute value, but the result
-     * needs to be inverted */
-    if (table_index < 0) {
-        *invert = TRUE;
-        table_index = -table_index;
-    }
-    else {
-        *invert = FALSE;
-    }
-
-    /* Out-of band indices indicate a deprecated property.  The proper index is
-     * modulo it with the table size.  And dividing by the table size yields
-     * an offset into a table constructed to contain the corresponding warning
-     * message */
-    if (table_index > MAX_UNI_KEYWORD_INDEX) {
-        Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
-        table_index %= MAX_UNI_KEYWORD_INDEX;
-        Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),
-                "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s",
-                (int) len, name, deprecated_property_msgs[warning_offset]);
-    }
-
-    /* In a few properties, a different property is used under /i.  These are
-     * unlikely to change, so are hard-coded here. */
-    if (to_fold) {
-        if (   table_index == PL_XPOSIXUPPER
-            || table_index == PL_XPOSIXLOWER
-            || table_index == PL_TITLE)
-        {
-            table_index = PL_CASED;
-        }
-        else if (   table_index == PL_UPPERCASELETTER
-                 || table_index == PL_LOWERCASELETTER
-#ifdef PL_TITLECASELETTER   /* Missing from early Unicodes */
-                 || table_index == PL_TITLECASELETTER
-#endif
-        ) {
-            table_index = PL_CASEDLETTER;
-        }
-        else if (  table_index == PL_POSIXUPPER
-                || table_index == PL_POSIXLOWER)
-        {
-            table_index = PL_POSIXALPHA;
-        }
-    }
-
-    /* Create and return the inversion list */
-    return _new_invlist_C_array(PL_uni_prop_ptrs[table_index]);
-}
-
-/*
-=for apidoc utf8_to_uvchr
-
-Returns the native code point of the first character in the string C<s>
-which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
-length, in bytes, of that character.
-
-Some, but not all, UTF-8 malformations are detected, and in fact, some
-malformed input could cause reading beyond the end of the input buffer, which
-is why this function is deprecated.  Use L</utf8_to_uvchr_buf> instead.
-
-If C<s> points to one of the detected malformations, and UTF8 warnings are
-enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
-C<NULL>) to -1.  If those warnings are off, the computed value if well-defined (or
-the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
-is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
-next possible position in C<s> that could begin a non-malformed character.
-See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
-
-=cut
-*/
-
-UV
-Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
-{
-    PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
-
-    return utf8_to_uvchr_buf(s, s + UTF8_MAXBYTES, retlen);
-}
-
  /*
   * ex: set ts=8 sts=4 sw=4 et:
   */