utf8.c: Improve debug message

[perl5.git] / utf8.c
diff --git a/utf8.c b/utf8.c

index 1570fc8..df71252 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -32,22 +32,15 @@
  #define PERL_IN_UTF8_C
  #include "perl.h"
  #include "inline_invlist.c"
-
-#ifndef EBCDIC
-/* Separate prototypes needed because in ASCII systems these are
- * usually macros but they still are compiled as code, too. */
-PERL_CALLCONV UV       Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
-PERL_CALLCONV U8*      Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
-#endif
+#include "charclass_invlists.h"
  
  static const char unees[] =
      "Malformed UTF-8 character (unexpected end of string)";
  
  /*
  =head1 Unicode Support
-
-This file contains various utility functions for manipulating UTF8-encoded
-strings. For the uninitiated, this is a method of representing arbitrary
+These are various utility functions for manipulating UTF8-encoded
+strings.  For the uninitiated, this is a method of representing arbitrary
  Unicode characters as a variable number of bytes, in such a way that
  characters in the ASCII range are unmodified, and a zero byte never appears
  within non-zero characters.
@@ -63,7 +56,9 @@ or not the string is encoded in UTF-8 (or UTF-EBCDIC on EBCDIC machines).  That
  is, if they are invariant.  On ASCII-ish machines, only ASCII characters
  fit this definition, hence the function's name.
  
-If C<len> is 0, it will be calculated using C<strlen(s)>.  
+If C<len> is 0, it will be calculated using C<strlen(s)> (which means if you
+use this option, that C<s> can't have embedded C<NUL> characters and has to
+have a terminating C<NUL> byte).
  
  See also L</is_utf8_string>(), L</is_utf8_string_loclen>(), and L</is_utf8_string_loc>().
  
@@ -90,48 +85,14 @@ Perl_is_ascii_string(const U8 *s, STRLEN len)
  =for apidoc uvoffuni_to_utf8_flags
  
  THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
+Instead, B<almost all code should use L</uvchr_to_utf8> or
+L</uvchr_to_utf8_flags>>.
  
-It adds the UTF-8 representation of the Unicode code point C<uv> to the end
-of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
-bytes available. The return value is the pointer to the byte after the
-end of the new character. In other words,
-
-    d = uvoffuni_to_utf8_flags(d, uv, flags);
-
-or, in most cases,
-
-    d = uvoffuni_to_utf8_flags(d, uv, 0);
-
-This is the Unicode-aware way of saying
-
-    *(d++) = uv;
-
-where uv is a code point expressed in Latin-1 or above, not the platform's
-native character set.  B<Almost all code should instead use L</uvchr_to_utf8>
-or L</uvchr_to_utf8_flags>>.
-
-This function will convert to UTF-8 (and not warn) even code points that aren't
-legal Unicode or are problematic, unless C<flags> contains one or more of the
-following flags:
-
-If C<uv> is a Unicode surrogate code point and UNICODE_WARN_SURROGATE is set,
-the function will raise a warning, provided UTF8 warnings are enabled.  If instead
-UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL.
-If both flags are set, the function will both warn and return NULL.
-
-The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags correspondingly
-affect how the function handles a Unicode non-character.  And likewise, the
-UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags, affect the handling of
-code points that are
-above the Unicode maximum of 0x10FFFF.  Code points above 0x7FFF_FFFF (which are
-even less portable) can be warned and/or disallowed even if other above-Unicode
-code points are accepted by the UNICODE_WARN_FE_FF and UNICODE_DISALLOW_FE_FF
-flags.
-
-And finally, the flag UNICODE_WARN_ILLEGAL_INTERCHANGE selects all four of the
-above WARN flags; and UNICODE_DISALLOW_ILLEGAL_INTERCHANGE selects all four
-DISALLOW flags.
+This function is like them, but the input is a strict Unicode
+(as opposed to native) code point.  Only in very rare circumstances should code
+not be using the native code point.
  
+For details, see the description for L</uvchr_to_utf8_flags>.
  
  =cut
  */
@@ -141,9 +102,14 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  {
      PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
  
+    if (UNI_IS_INVARIANT(uv)) {
+       *d++ = (U8) LATIN1_TO_NATIVE(uv);
+       return d;
+    }
+
      /* The first problematic code point is the first surrogate */
      if (uv >= UNICODE_SURROGATE_FIRST
-        && ckWARN4_d(WARN_UTF8, WARN_SURROGATE, WARN_NON_UNICODE, WARN_NONCHAR))
+        && ckWARN3_d(WARN_SURROGATE, WARN_NON_UNICODE, WARN_NONCHAR))
      {
         if (UNICODE_IS_SURROGATE(uv)) {
             if (flags & UNICODE_WARN_SURROGATE) {
@@ -178,12 +144,9 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
             }
         }
      }
-    if (UNI_IS_INVARIANT(uv)) {
-       *d++ = (U8) LATIN1_TO_NATIVE(uv);
-       return d;
-    }
+
  #if defined(EBCDIC)
-    else {
+    {
         STRLEN len  = OFFUNISKIP(uv);
         U8 *p = d+len-1;
         while (p > d) {
@@ -229,7 +192,7 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
         *d++ = (U8)(( uv        & 0x3f) | 0x80);
         return d;
      }
-#ifdef HAS_QUAD
+#ifdef UTF8_QUAD_MAX
      if (uv < UTF8_QUAD_MAX)
  #endif
      {
@@ -242,7 +205,7 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
         *d++ = (U8)(( uv        & 0x3f) | 0x80);
         return d;
      }
-#ifdef HAS_QUAD
+#ifdef UTF8_QUAD_MAX
      {
         *d++ =                            0xff;         /* Can't match U+FFFE! */
         *d++ =                            0x80;         /* 6 Reserved bits */
@@ -262,108 +225,111 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  #endif
  #endif /* Non loop style */
  }
-
  /*
+=for apidoc uvchr_to_utf8
  
-Tests if the first C<len> bytes of string C<s> form a valid UTF-8
-character.  Note that an INVARIANT (i.e. ASCII) character is a valid
-UTF-8 character.  The number of bytes in the UTF-8 character
-will be returned if it is valid, otherwise 0.
-
-This is the "slow" version as opposed to the "fast" version which is
-the "unrolled" IS_UTF8_CHAR().  E.g. for t/uni/class.t the speed
-difference is a factor of 2 to 3.  For lengths (UTF8SKIP(s)) of four
-or less you should use the IS_UTF8_CHAR(), for lengths of five or more
-you should use the _slow().  In practice this means that the _slow()
-will be used very rarely, since the maximum Unicode code point (as of
-Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes.  Only
-the "Perl extended UTF-8" (e.g, the infamous 'v-strings') will encode into
-five bytes or more.
+Adds the UTF-8 representation of the native code point C<uv> to the end
+of the string C<d>; C<d> should have at least C<UNISKIP(uv)+1> (up to
+C<UTF8_MAXBYTES+1>) free bytes available.  The return value is the pointer to
+the byte after the end of the new character.  In other words,
  
-=cut */
-PERL_STATIC_INLINE STRLEN
-S_is_utf8_char_slow(const U8 *s, const STRLEN len)
-{
-    dTHX;   /* The function called below requires thread context */
+    d = uvchr_to_utf8(d, uv);
  
-    STRLEN actual_len;
+is the recommended wide native character-aware way of saying
+
+    *(d++) = uv;
+
+This function accepts any UV as input.  To forbid or warn on non-Unicode code
+points, or those that may be problematic, see L</uvchr_to_utf8_flags>.
  
-    PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
+=cut
+*/
  
-    utf8n_to_uvchr(s, len, &actual_len, UTF8_CHECK_ONLY);
+/* This is also a macro */
+PERL_CALLCONV U8*       Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
  
-    return (actual_len == (STRLEN) -1) ? 0 : actual_len;
+U8 *
+Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
+{
+    return uvchr_to_utf8(d, uv);
  }
  
  /*
-=for apidoc is_utf8_char_buf
+=for apidoc uvchr_to_utf8_flags
  
-Returns the number of bytes that comprise the first UTF-8 encoded character in
-buffer C<buf>.  C<buf_end> should point to one position beyond the end of the
-buffer.  0 is returned if C<buf> does not point to a complete, valid UTF-8
-encoded character.
+Adds the UTF-8 representation of the native code point C<uv> to the end
+of the string C<d>; C<d> should have at least C<UNISKIP(uv)+1> (up to
+C<UTF8_MAXBYTES+1>) free bytes available.  The return value is the pointer to
+the byte after the end of the new character.  In other words,
  
-Note that an INVARIANT character (i.e. ASCII on non-EBCDIC
-machines) is a valid UTF-8 character.
+    d = uvchr_to_utf8_flags(d, uv, flags);
  
-=cut */
+or, in most cases,
  
-STRLEN
-Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end)
-{
+    d = uvchr_to_utf8_flags(d, uv, 0);
+
+This is the Unicode-aware way of saying
  
-    STRLEN len;
+    *(d++) = uv;
  
-    PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF;
+This function will convert to UTF-8 (and not warn) even code points that aren't
+legal Unicode or are problematic, unless C<flags> contains one or more of the
+following flags:
  
-    if (buf_end <= buf) {
-       return 0;
-    }
+If C<uv> is a Unicode surrogate code point and UNICODE_WARN_SURROGATE is set,
+the function will raise a warning, provided UTF8 warnings are enabled.  If instead
+UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL.
+If both flags are set, the function will both warn and return NULL.
  
-    len = buf_end - buf;
-    if (len > UTF8SKIP(buf)) {
-       len = UTF8SKIP(buf);
-    }
+The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags
+affect how the function handles a Unicode non-character.  And likewise, the
+UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags affect the handling of
+code points that are
+above the Unicode maximum of 0x10FFFF.  Code points above 0x7FFF_FFFF (which are
+even less portable) can be warned and/or disallowed even if other above-Unicode
+code points are accepted, by the UNICODE_WARN_FE_FF and UNICODE_DISALLOW_FE_FF
+flags.
+
+And finally, the flag UNICODE_WARN_ILLEGAL_INTERCHANGE selects all four of the
+above WARN flags; and UNICODE_DISALLOW_ILLEGAL_INTERCHANGE selects all four
+DISALLOW flags.
+
+=cut
+*/
+
+/* This is also a macro */
+PERL_CALLCONV U8*       Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
  
-#ifdef IS_UTF8_CHAR
-    if (IS_UTF8_CHAR_FAST(len))
-        return IS_UTF8_CHAR(buf, len) ? len : 0;
-#endif /* #ifdef IS_UTF8_CHAR */
-    return is_utf8_char_slow(buf, len);
+U8 *
+Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
+{
+    return uvchr_to_utf8_flags(d, uv, flags);
  }
  
  /*
-=for apidoc is_utf8_char
-
-Tests if some arbitrary number of bytes begins in a valid UTF-8
-character.  Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
-character is a valid UTF-8 character.  The actual number of bytes in the UTF-8
-character will be returned if it is valid, otherwise 0.
+=for apidoc is_utf8_char_buf
  
-This function is deprecated due to the possibility that malformed input could
-cause reading beyond the end of the input buffer.  Use L</is_utf8_char_buf>
-instead.
+This is identical to the macro L</isUTF8_CHAR>.
  
  =cut */
  
  STRLEN
-Perl_is_utf8_char(const U8 *s)
+Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end)
  {
-    PERL_ARGS_ASSERT_IS_UTF8_CHAR;
  
-    /* Assumes we have enough space, which is why this is deprecated */
-    return is_utf8_char_buf(s, s + UTF8SKIP(s));
-}
+    PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF;
  
+    return isUTF8_CHAR(buf, buf_end);
+}
  
  /*
  =for apidoc is_utf8_string
  
  Returns true if the first C<len> bytes of string C<s> form a valid
  UTF-8 string, false otherwise.  If C<len> is 0, it will be calculated
-using C<strlen(s)> (which means if you use this option, that C<s> has to have a
-terminating NUL byte).  Note that all characters being ASCII constitute 'a
-valid UTF-8 string'.
+using C<strlen(s)> (which means if you use this option, that C<s> can't have
+embedded C<NUL> characters and has to have a terminating C<NUL> byte).  Note
+that all characters being ASCII constitute 'a valid UTF-8 string'.
  
  See also L</is_ascii_string>(), L</is_utf8_string_loclen>(), and L</is_utf8_string_loc>().
  
@@ -379,28 +345,11 @@ Perl_is_utf8_string(const U8 *s, STRLEN len)
      PERL_ARGS_ASSERT_IS_UTF8_STRING;
  
      while (x < send) {
-        /* Inline the easy bits of is_utf8_char() here for speed... */
-        if (UTF8_IS_INVARIANT(*x)) {
-           x++;
-        }
-        else {
-             /* ... and call is_utf8_char() only if really needed. */
-            const STRLEN c = UTF8SKIP(x);
-            const U8* const next_char_ptr = x + c;
-
-            if (next_char_ptr > send) {
-                return FALSE;
-            }
-
-            if (IS_UTF8_CHAR_FAST(c)) {
-                if (!IS_UTF8_CHAR(x, c))
-                    return FALSE;
-            }
-            else if (! is_utf8_char_slow(x, c)) {
-                return FALSE;
-            }
-            x = next_char_ptr;
-        }
+        STRLEN len = isUTF8_CHAR(x, send);
+        if (UNLIKELY(! len)) {
+            return FALSE;
+        }
+        x += len;
      }
  
      return TRUE;
@@ -434,34 +383,17 @@ Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
  {
      const U8* const send = s + (len ? len : strlen((const char *)s));
      const U8* x = s;
-    STRLEN c;
      STRLEN outlen = 0;
  
      PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
  
      while (x < send) {
-        const U8* next_char_ptr;
-
-        /* Inline the easy bits of is_utf8_char() here for speed... */
-        if (UTF8_IS_INVARIANT(*x))
-            next_char_ptr = x + 1;
-        else {
-            /* ... and call is_utf8_char() only if really needed. */
-            c = UTF8SKIP(x);
-            next_char_ptr = c + x;
-            if (next_char_ptr > send) {
-                goto out;
-            }
-            if (IS_UTF8_CHAR_FAST(c)) {
-                if (!IS_UTF8_CHAR(x, c))
-                    c = 0;
-            } else
-                c = is_utf8_char_slow(x, c);
-            if (!c)
-                goto out;
-        }
-         x = next_char_ptr;
-        outlen++;
+        STRLEN len = isUTF8_CHAR(x, send);
+        if (UNLIKELY(! len)) {
+            goto out;
+        }
+        x += len;
+        outlen++;
      }
  
   out:
@@ -475,13 +407,13 @@ Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
  
  /*
  
-=for apidoc utf8n_to_uvoffuni
+=for apidoc utf8n_to_uvchr
  
  THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
+Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
  
  Bottom level UTF-8 decode routine.
-Returns the official Unicode (not native) code point value of the first
-character in the string C<s>,
+Returns the native code point value of the first character in the string C<s>,
  which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
  C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
  the length, in bytes, of that character.
@@ -508,11 +440,11 @@ flags) malformation is found.  If this flag is set, the routine assumes that
  the caller will raise a warning, and this function will silently just set
  C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
  
-Note that this API requires disambiguation between successful decoding a NUL
+Note that this API requires disambiguation between successful decoding a C<NUL>
  character, and an error return (unless the UTF8_CHECK_ONLY flag is set), as
  in both cases, 0 is returned.  To disambiguate, upon a zero return, see if the
-first byte of C<s> is 0 as well.  If so, the input was a NUL; if not, the input
-had an error.
+first byte of C<s> is 0 as well.  If so, the input was a C<NUL>; if not, the
+input had an error.
  
  Certain code points are considered problematic.  These are Unicode surrogates,
  Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
@@ -543,22 +475,20 @@ The UTF-8 encoding on ASCII platforms for these large code points begins with a
  byte containing 0xFE or 0xFF.  The UTF8_DISALLOW_FE_FF flag will cause them to
  be treated as malformations, while allowing smaller above-Unicode code points.
  (Of course UTF8_DISALLOW_SUPER will treat all above-Unicode code points,
-including these, as malformations.) Similarly, UTF8_WARN_FE_FF acts just like
+including these, as malformations.)
+Similarly, UTF8_WARN_FE_FF acts just like
  the other WARN flags, but applies just to these code points.
  
  All other code points corresponding to Unicode characters, including private
  use and those yet to be assigned, are never considered malformed and never
  warn.
  
-Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
-
  =cut
  */
  
  UV
-Perl_utf8n_to_uvoffuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
+Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
  {
-    dVAR;
      const U8 * const s0 = s;
      U8 overflow_byte = '\0';   /* Save byte in case of overflow */
      U8 * send;
@@ -574,7 +504,7 @@ Perl_utf8n_to_uvoffuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 fla
  
      const char* const malformed_text = "Malformed UTF-8 character";
  
-    PERL_ARGS_ASSERT_UTF8N_TO_UVOFFUNI;
+    PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
  
      /* The order of malformation tests here is important.  We should consume as
       * few bytes as possible in order to not skip any valid character.  This is
@@ -591,7 +521,7 @@ Perl_utf8n_to_uvoffuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 fla
       * We also should not consume too few bytes, otherwise someone could inject
       * things.  For example, an input could be deliberately designed to
       * overflow, and if this code bailed out immediately upon discovering that,
-     * returning to the caller *retlen pointing to the very next byte (one
+     * returning to the caller C<*retlen> pointing to the very next byte (one
       * which is actually part of of the overflowing sequence), that could look
       * legitimate to the caller, which could discard the initial partial
       * sequence and process the rest, inappropriately */
@@ -623,7 +553,7 @@ Perl_utf8n_to_uvoffuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 fla
  
      /* An invariant is trivially well-formed */
      if (UTF8_IS_INVARIANT(uv)) {
-       return NATIVE_TO_LATIN1(uv);
+       return uv;
      }
  
      /* A continuation character can't start a valid sequence */
@@ -740,32 +670,8 @@ Perl_utf8n_to_uvoffuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 fla
         }
      }
  
-#ifndef EBCDIC /* EBCDIC allows FE, FF, can't overflow */
-    if ((*s0 & 0xFE) == 0xFE   /* matches both FE, FF */
-       && (flags & (UTF8_WARN_FE_FF|UTF8_DISALLOW_FE_FF)))
-    {
-       /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
-        * generation of the sv, since no warnings are raised under CHECK */
-       if ((flags & (UTF8_WARN_FE_FF|UTF8_CHECK_ONLY)) == UTF8_WARN_FE_FF
-           && ckWARN_d(WARN_UTF8))
-       {
-           /* This message is deliberately not of the same syntax as the other
-            * messages for malformations, for backwards compatibility in the
-            * unlikely event that code is relying on its precise earlier text
-            */
-           sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s Code point beginning with byte 0x%02X is not Unicode, and not portable", malformed_text, *s0));
-           pack_warn = packWARN(WARN_UTF8);
-       }
-       if (flags & UTF8_DISALLOW_FE_FF) {
-           goto malformed;
-       }
-    }
+#ifndef EBCDIC /* EBCDIC can't overflow */
      if (UNLIKELY(overflowed)) {
-
-       /* If the first byte is FF, it will overflow a 32-bit word.  If the
-        * first byte is FE, it will overflow a signed 32-bit word.  The
-        * above preserves backward compatibility, since its message was used
-        * in earlier versions of this code in preference to overflow */
         sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (overflow at byte 0x%02x, after start byte 0x%02x)", malformed_text, overflow_byte, *s0));
         goto malformed;
      }
@@ -785,18 +691,21 @@ Perl_utf8n_to_uvoffuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 fla
         goto malformed;
      }
  
-    /* Here, the input is considered to be well-formed , but could be a
+    /* Here, the input is considered to be well-formed, but it still could be a
       * problematic code point that is not allowed by the input parameters. */
      if (uv >= UNICODE_SURROGATE_FIRST /* isn't problematic if < this */
         && (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE
                      |UTF8_WARN_ILLEGAL_INTERCHANGE)))
      {
         if (UNICODE_IS_SURROGATE(uv)) {
+
+            /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
+             * generation of the sv, since no warnings are raised under CHECK */
             if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE
-               && ckWARN2_d(WARN_UTF8, WARN_SURROGATE))
+               && ckWARN_d(WARN_SURROGATE))
             {
                 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv));
-               pack_warn = packWARN2(WARN_UTF8, WARN_SURROGATE);
+               pack_warn = packWARN(WARN_SURROGATE);
             }
             if (flags & UTF8_DISALLOW_SURROGATE) {
                 goto disallowed;
@@ -804,21 +713,42 @@ Perl_utf8n_to_uvoffuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 fla
         }
         else if ((uv > PERL_UNICODE_MAX)) {
             if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER
-               && ckWARN2_d(WARN_UTF8, WARN_NON_UNICODE))
+                && ckWARN_d(WARN_NON_UNICODE))
             {
                 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv));
-               pack_warn = packWARN2(WARN_UTF8, WARN_NON_UNICODE);
+               pack_warn = packWARN(WARN_NON_UNICODE);
             }
+#ifndef EBCDIC /* EBCDIC always allows FE, FF */
+
+            /* The first byte being 0xFE or 0xFF is a subset of the SUPER code
+             * points.  We test for these after the regular SUPER ones, and
+             * before possibly bailing out, so that the more dire warning
+             * overrides the regular one, if applicable */
+            if ((*s0 & 0xFE) == 0xFE   /* matches both FE, FF */
+                && (flags & (UTF8_WARN_FE_FF|UTF8_DISALLOW_FE_FF)))
+            {
+                if ((flags & (UTF8_WARN_FE_FF|UTF8_CHECK_ONLY))
+                                                            == UTF8_WARN_FE_FF
+                    && ckWARN_d(WARN_UTF8))
+                {
+                    sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%"UVXf" is not Unicode, and not portable", uv));
+                    pack_warn = packWARN(WARN_UTF8);
+                }
+                if (flags & UTF8_DISALLOW_FE_FF) {
+                    goto disallowed;
+                }
+            }
+#endif
             if (flags & UTF8_DISALLOW_SUPER) {
                 goto disallowed;
             }
         }
         else if (UNICODE_IS_NONCHAR(uv)) {
             if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR
-               && ckWARN2_d(WARN_UTF8, WARN_NONCHAR))
+               && ckWARN_d(WARN_NONCHAR))
             {
                 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv));
-               pack_warn = packWARN2(WARN_UTF8, WARN_NONCHAR);
+               pack_warn = packWARN(WARN_NONCHAR);
             }
             if (flags & UTF8_DISALLOW_NONCHAR) {
                 goto disallowed;
@@ -826,7 +756,9 @@ Perl_utf8n_to_uvoffuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 fla
         }
  
         if (sv) {
-           outlier_ret = uv;
+            outlier_ret = uv;   /* Note we don't bother to convert to native,
+                                   as all the outlier code points are the same
+                                   in both ASCII and EBCDIC */
             goto do_warn;
         }
  
@@ -834,7 +766,7 @@ Perl_utf8n_to_uvoffuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 fla
          * to return it */
      }
  
-    return uv;
+    return UNI_TO_NATIVE(uv);
  
      /* There are three cases which get to beyond this point.  In all 3 cases:
       * <sv>        if not null points to a string to print as a warning.
@@ -908,7 +840,7 @@ NULL) to -1.  If those warnings are off, the computed value, if well-defined
  (or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
  C<*retlen> is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is
  the next possible position in C<s> that could begin a non-malformed character.
-See L</utf8n_to_uvoffuni> for details on when the REPLACEMENT CHARACTER is
+See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
  returned.
  
  =cut
@@ -918,8 +850,6 @@ returned.
  UV
  Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
  {
-    PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
-
      assert(s < send);
  
      return utf8n_to_uvchr(s, send - s, retlen,
@@ -928,8 +858,7 @@ Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
  
  /* Like L</utf8_to_uvchr_buf>(), but should only be called when it is known that
   * there are no malformations in the input UTF-8 string C<s>.  surrogates,
- * non-character code points, and non-Unicode code points are allowed.  A macro
- * in utf8.h is used to normally avoid this function wrapper */
+ * non-character code points, and non-Unicode code points are allowed. */
  
  UV
  Perl_valid_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
@@ -939,6 +868,7 @@ Perl_valid_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
      UV uv = *s;
  
      PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
+    PERL_UNUSED_CONTEXT;
  
      if (retlen) {
          *retlen = expectlen;
@@ -969,40 +899,11 @@ Perl_valid_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
  }
  
  /*
-=for apidoc utf8_to_uvchr
-
-Returns the native code point of the first character in the string C<s>
-which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
-length, in bytes, of that character.
-
-Some, but not all, UTF-8 malformations are detected, and in fact, some
-malformed input could cause reading beyond the end of the input buffer, which
-is why this function is deprecated.  Use L</utf8_to_uvchr_buf> instead.
-
-If C<s> points to one of the detected malformations, and UTF8 warnings are
-enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
-NULL) to -1.  If those warnings are off, the computed value if well-defined (or
-the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
-is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
-next possible position in C<s> that could begin a non-malformed character.
-See L</utf8n_to_uvoffuni> for details on when the REPLACEMENT CHARACTER is returned.
-
-=cut
-*/
-
-UV
-Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
-{
-    PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
-
-    return utf8_to_uvchr_buf(s, s + UTF8_MAXBYTES, retlen);
-}
-
-/*
  =for apidoc utf8_to_uvuni_buf
  
-Only in very rare circumstances should code need to be dealing in the Unicode
-code point.  Use L</utf8_to_uvchr_buf> instead.
+Only in very rare circumstances should code need to be dealing in Unicode
+(as opposed to native) code points.  In those few cases, use
+C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>> instead.
  
  Returns the Unicode (not-native) code point of the first character in the
  string C<s> which
@@ -1015,7 +916,7 @@ NULL) to -1.  If those warnings are off, the computed value if well-defined (or
  the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
  is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
  next possible position in C<s> that could begin a non-malformed character.
-See L</utf8n_to_uvoffuni> for details on when the REPLACEMENT CHARACTER is returned.
+See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
  
  =cut
  */
@@ -1028,53 +929,8 @@ Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
      assert(send > s);
  
      /* Call the low level routine asking for checks */
-    return Perl_utf8n_to_uvoffuni(aTHX_ s, send -s, retlen,
-                              ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
-}
-
-/* DEPRECATED!
- * Like L</utf8_to_uvuni_buf>(), but should only be called when it is known that
- * there are no malformations in the input UTF-8 string C<s>.  Surrogates,
- * non-character code points, and non-Unicode code points are allowed */
-
-UV
-Perl_valid_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
-{
-    PERL_ARGS_ASSERT_VALID_UTF8_TO_UVUNI;
-
-    return NATIVE_TO_UNI(valid_utf8_to_uvchr(s, retlen));
-}
-
-/*
-=for apidoc utf8_to_uvuni
-
-Returns the Unicode code point of the first character in the string C<s>
-which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
-length, in bytes, of that character.
-
-Some, but not all, UTF-8 malformations are detected, and in fact, some
-malformed input could cause reading beyond the end of the input buffer, which
-is one reason why this function is deprecated.  The other is that only in
-extremely limited circumstances should the Unicode versus native code point be
-of any interest to you.  Use L</utf8_to_uvchr_buf> instead.
-
-If C<s> points to one of the detected malformations, and UTF8 warnings are
-enabled, zero is returned and C<*retlen> is set (if C<retlen> doesn't point to
-NULL) to -1.  If those warnings are off, the computed value if well-defined (or
-the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
-is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
-next possible position in C<s> that could begin a non-malformed character.
-See L</utf8n_to_uvoffuni> for details on when the REPLACEMENT CHARACTER is returned.
-
-=cut
-*/
-
-UV
-Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
-{
-    PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
-
-    return NATIVE_TO_UNI(valid_utf8_to_uvchr(s, retlen));
+    return NATIVE_TO_UNI(Perl_utf8n_to_uvchr(aTHX_ s, send -s, retlen,
+                              ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY));
  }
  
  /*
@@ -1090,7 +946,6 @@ up past C<e>, croaks.
  STRLEN
  Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
  {
-    dVAR;
      STRLEN len = 0;
  
      PERL_ARGS_ASSERT_UTF8_LENGTH;
@@ -1153,11 +1008,10 @@ on the first byte of character or just after the last byte of a character.
  */
  
  U8 *
-Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
+Perl_utf8_hop(const U8 *s, I32 off)
  {
      PERL_ARGS_ASSERT_UTF8_HOP;
  
-    PERL_UNUSED_CONTEXT;
      /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
       * the bitops (especially ~) can create illegal UTF-8.
       * In other words: in Perl UTF-8 is not just for Unicode. */
@@ -1180,12 +1034,14 @@ Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
  =for apidoc bytes_cmp_utf8
  
  Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
-sequence of characters (stored as UTF-8) in C<u>, C<ulen>. Returns 0 if they are
+sequence of characters (stored as UTF-8)
+in C<u>, C<ulen>.  Returns 0 if they are
  equal, -1 or -2 if the first string is less than the second string, +1 or +2
  if the first string is greater than the second string.
  
  -1 or +1 is returned if the shorter string was identical to the start of the
-longer string. -2 or +2 is returned if the was a difference between characters
+longer string.  -2 or +2 is returned if
+there was a difference between characters
  within the strings.
  
  =cut
@@ -1199,8 +1055,6 @@ Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
  
      PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
  
-    PERL_UNUSED_CONTEXT;
-
      while (b < bend && u < uend) {
          U8 c = *u++;
         if (!UTF8_IS_INVARIANT(c)) {
@@ -1265,6 +1119,7 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
      U8 *d;
  
      PERL_ARGS_ASSERT_UTF8_TO_BYTES;
+    PERL_UNUSED_CONTEXT;
  
      /* ensure valid UTF-8 and chars < 256 before updating string */
      while (s < send) {
@@ -1280,9 +1135,13 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
  
      d = s = save;
      while (s < send) {
-        STRLEN ulen;
-        *d++ = (U8)utf8_to_uvchr_buf(s, send, &ulen);
-        s += ulen;
+       U8 c = *s++;
+       if (! UTF8_IS_INVARIANT(c)) {
+           /* Then it is two-byte encoded */
+           c = TWO_BYTE_UTF8_TO_NATIVE(c, *s);
+            s++;
+       }
+       *d++ = c;
      }
      *d = '\0';
      *len = d - save;
@@ -1296,7 +1155,7 @@ Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
  Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, returns a pointer to
  the newly-created string, and updates C<len> to contain the new
  length.  Returns the original string if no conversion occurs, C<len>
-is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
+is unchanged.  Do nothing if C<is_utf8> points to 0.  Sets C<is_utf8> to
  0 if C<s> is converted or consisted entirely of characters that are invariant
  in utf8 (i.e., US-ASCII on non-EBCDIC machines).
  
@@ -1312,7 +1171,6 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
      I32 count = 0;
  
      PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
-
      PERL_UNUSED_CONTEXT;
      if (!*is_utf8)
          return (U8 *)start;
@@ -1335,9 +1193,10 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
      s = start; start = d;
      while (s < send) {
         U8 c = *s++;
-       if (!UTF8_IS_INVARIANT(c)) {
+       if (! UTF8_IS_INVARIANT(c)) {
             /* Then it is two-byte encoded */
-           c = TWO_BYTE_UTF8_TO_NATIVE(c, *s++);
+           c = TWO_BYTE_UTF8_TO_NATIVE(c, *s);
+            s++;
         }
         *d++ = c;
      }
@@ -1354,7 +1213,7 @@ UTF-8.
  Returns a pointer to the newly-created string, and sets C<len> to
  reflect the new length in bytes.
  
-A NUL character will be written after the end of the string.
+A C<NUL> character will be written after the end of the string.
  
  If you want to convert to UTF-8 from encodings other than
  the native (Latin1 or EBCDIC),
@@ -1490,46 +1349,17 @@ Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
      return _is_utf8_FOO(classnum, tmpbuf);
  }
  
-/* for now these are all defined (inefficiently) in terms of the utf8 versions.
- * Note that the macros in handy.h that call these short-circuit calling them
- * for Latin-1 range inputs */
-
-bool
-Perl_is_uni_alnum(pTHX_ UV c)
-{
-    U8 tmpbuf[UTF8_MAXBYTES+1];
-    uvchr_to_utf8(tmpbuf, c);
-    return _is_utf8_FOO(_CC_WORDCHAR, tmpbuf);
-}
-
-bool
-Perl_is_uni_alnumc(pTHX_ UV c)
-{
-    U8 tmpbuf[UTF8_MAXBYTES+1];
-    uvchr_to_utf8(tmpbuf, c);
-    return _is_utf8_FOO(_CC_ALPHANUMERIC, tmpbuf);
-}
-
  /* Internal function so we can deprecate the external one, and call
     this one from other deprecated functions in this file */
  
-PERL_STATIC_INLINE bool
-S_is_utf8_idfirst(pTHX_ const U8 *p)
+bool
+Perl__is_utf8_idstart(pTHX_ const U8 *p)
  {
-    dVAR;
+    PERL_ARGS_ASSERT__IS_UTF8_IDSTART;
  
      if (*p == '_')
         return TRUE;
-    /* is_utf8_idstart would be more logical. */
-    return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
-}
-
-bool
-Perl_is_uni_idfirst(pTHX_ UV c)
-{
-    U8 tmpbuf[UTF8_MAXBYTES+1];
-    uvchr_to_utf8(tmpbuf, c);
-    return S_is_utf8_idfirst(aTHX_ tmpbuf);
+    return is_utf8_common(p, &PL_utf8_idstart, "IdStart", NULL);
  }
  
  bool
@@ -1548,113 +1378,27 @@ Perl__is_uni_perl_idstart(pTHX_ UV c)
      return _is_utf8_perl_idstart(tmpbuf);
  }
  
-bool
-Perl_is_uni_alpha(pTHX_ UV c)
-{
-    U8 tmpbuf[UTF8_MAXBYTES+1];
-    uvchr_to_utf8(tmpbuf, c);
-    return _is_utf8_FOO(_CC_ALPHA, tmpbuf);
-}
-
-bool
-Perl_is_uni_ascii(pTHX_ UV c)
-{
-    return isASCII(c);
-}
-
-bool
-Perl_is_uni_blank(pTHX_ UV c)
-{
-    return isBLANK_uni(c);
-}
-
-bool
-Perl_is_uni_space(pTHX_ UV c)
+UV
+Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s)
  {
-    return isSPACE_uni(c);
-}
+    /* We have the latin1-range values compiled into the core, so just use
+     * those, converting the result to utf8.  The only difference between upper
+     * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
+     * either "SS" or "Ss".  Which one to use is passed into the routine in
+     * 'S_or_s' to avoid a test */
  
-bool
-Perl_is_uni_digit(pTHX_ UV c)
-{
-    U8 tmpbuf[UTF8_MAXBYTES+1];
-    uvchr_to_utf8(tmpbuf, c);
-    return _is_utf8_FOO(_CC_DIGIT, tmpbuf);
-}
+    UV converted = toUPPER_LATIN1_MOD(c);
  
-bool
-Perl_is_uni_upper(pTHX_ UV c)
-{
-    U8 tmpbuf[UTF8_MAXBYTES+1];
-    uvchr_to_utf8(tmpbuf, c);
-    return _is_utf8_FOO(_CC_UPPER, tmpbuf);
-}
+    PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
  
-bool
-Perl_is_uni_lower(pTHX_ UV c)
-{
-    U8 tmpbuf[UTF8_MAXBYTES+1];
-    uvchr_to_utf8(tmpbuf, c);
-    return _is_utf8_FOO(_CC_LOWER, tmpbuf);
-}
+    assert(S_or_s == 'S' || S_or_s == 's');
  
-bool
-Perl_is_uni_cntrl(pTHX_ UV c)
-{
-    return isCNTRL_L1(c);
-}
-
-bool
-Perl_is_uni_graph(pTHX_ UV c)
-{
-    U8 tmpbuf[UTF8_MAXBYTES+1];
-    uvchr_to_utf8(tmpbuf, c);
-    return _is_utf8_FOO(_CC_GRAPH, tmpbuf);
-}
-
-bool
-Perl_is_uni_print(pTHX_ UV c)
-{
-    U8 tmpbuf[UTF8_MAXBYTES+1];
-    uvchr_to_utf8(tmpbuf, c);
-    return _is_utf8_FOO(_CC_PRINT, tmpbuf);
-}
-
-bool
-Perl_is_uni_punct(pTHX_ UV c)
-{
-    U8 tmpbuf[UTF8_MAXBYTES+1];
-    uvchr_to_utf8(tmpbuf, c);
-    return _is_utf8_FOO(_CC_PUNCT, tmpbuf);
-}
-
-bool
-Perl_is_uni_xdigit(pTHX_ UV c)
-{
-    return isXDIGIT_uni(c);
-}
-
-UV
-Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s)
-{
-    /* We have the latin1-range values compiled into the core, so just use
-     * those, converting the result to utf8.  The only difference between upper
-     * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
-     * either "SS" or "Ss".  Which one to use is passed into the routine in
-     * 'S_or_s' to avoid a test */
-
-    UV converted = toUPPER_LATIN1_MOD(c);
-
-    PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
-
-    assert(S_or_s == 'S' || S_or_s == 's');
-
-    if (NATIVE_IS_INVARIANT(converted)) { /* No difference between the two for
-                                            characters in this range */
-       *p = (U8) converted;
-       *lenp = 1;
-       return converted;
-    }
+    if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
+                                            characters in this range */
+       *p = (U8) converted;
+       *lenp = 1;
+       return converted;
+    }
  
      /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
       * which it maps to one of them, so as to only have to have one check for
@@ -1693,20 +1437,18 @@ Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_
   * LENP will be set to the length in bytes of the string of changed characters
   *
   * The functions return the ordinal of the first character in the string of OUTP */
-#define CALL_UPPER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_toupper, "ToUc", "utf8::ToSpecUc")
-#define CALL_TITLE_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_totitle, "ToTc", "utf8::ToSpecTc")
-#define CALL_LOWER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tolower, "ToLc", "utf8::ToSpecLc")
+#define CALL_UPPER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_toupper, "ToUc", "")
+#define CALL_TITLE_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_totitle, "ToTc", "")
+#define CALL_LOWER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tolower, "ToLc", "")
  
  /* This additionally has the input parameter SPECIALS, which if non-zero will
   * cause this to use the SPECIALS hash for folding (meaning get full case
   * folding); otherwise, when zero, this implies a simple case fold */
-#define CALL_FOLD_CASE(INP, OUTP, LENP, SPECIALS) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tofold, "ToCf", (SPECIALS) ? "utf8::ToSpecCf" : NULL)
+#define CALL_FOLD_CASE(INP, OUTP, LENP, SPECIALS) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tofold, "ToCf", (SPECIALS) ? "" : NULL)
  
  UV
  Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
  {
-    dVAR;
-
      /* Convert the Unicode character whose ordinal is <c> to its uppercase
       * version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
       * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
@@ -1728,8 +1470,6 @@ Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
  UV
  Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
  {
-    dVAR;
-
      PERL_ARGS_ASSERT_TO_UNI_TITLE;
  
      if (c < 256) {
@@ -1741,7 +1481,7 @@ Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
  }
  
  STATIC U8
-S_to_lower_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp)
+S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp)
  {
      /* We have the latin1-range values compiled into the core, so just use
       * those, converting the result to utf8.  Since the result is always just
@@ -1750,13 +1490,15 @@ S_to_lower_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp)
      U8 converted = toLOWER_LATIN1(c);
  
      if (p != NULL) {
-       if (NATIVE_IS_INVARIANT(converted)) {
+       if (NATIVE_BYTE_IS_INVARIANT(converted)) {
             *p = converted;
             *lenp = 1;
         }
         else {
-           *p = UTF8_TWO_BYTE_HI(converted);
-           *(p+1) = UTF8_TWO_BYTE_LO(converted);
+            /* Result is known to always be < 256, so can use the EIGHT_BIT
+             * macros */
+           *p = UTF8_EIGHT_BIT_HI(converted);
+           *(p+1) = UTF8_EIGHT_BIT_LO(converted);
             *lenp = 2;
         }
      }
@@ -1766,8 +1508,6 @@ S_to_lower_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp)
  UV
  Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
  {
-    dVAR;
-
      PERL_ARGS_ASSERT_TO_UNI_LOWER;
  
      if (c < 256) {
@@ -1791,592 +1531,236 @@ Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int f
      UV converted;
  
      PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
+    PERL_UNUSED_CONTEXT;
  
      assert (! (flags & FOLD_FLAGS_LOCALE));
  
      if (c == MICRO_SIGN) {
         converted = GREEK_SMALL_LETTER_MU;
      }
-    else if ((flags & FOLD_FLAGS_FULL) && c == LATIN_SMALL_LETTER_SHARP_S) {
-
-        /* If can't cross 127/128 boundary, can't return "ss"; instead return
-         * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
-         * under those circumstances. */
-        if (flags & FOLD_FLAGS_NOMIX_ASCII) {
-            *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
-            Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
-                 p, *lenp, U8);
-            return LATIN_SMALL_LETTER_LONG_S;
-        }
-        else {
-            *(p)++ = 's';
-            *p = 's';
-            *lenp = 2;
-            return 's';
-        }
-    }
-    else { /* In this range the fold of all other characters is their lower
-              case */
-       converted = toLOWER_LATIN1(c);
-    }
-
-    if (NATIVE_IS_INVARIANT(converted)) {
-       *p = (U8) converted;
-       *lenp = 1;
-    }
-    else {
-       *(p)++ = UTF8_TWO_BYTE_HI(converted);
-       *p = UTF8_TWO_BYTE_LO(converted);
-       *lenp = 2;
-    }
-
-    return converted;
-}
-
-UV
-Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, const U8 flags)
-{
-
-    /* Not currently externally documented, and subject to change
-     *  <flags> bits meanings:
-     *     FOLD_FLAGS_FULL  iff full folding is to be used;
-     *     FOLD_FLAGS_LOCALE iff in locale
-     *     FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
-     */
-
-    PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
-
-    if (c < 256) {
-       UV result = _to_fold_latin1((U8) c, p, lenp,
-                             flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
-       /* It is illegal for the fold to cross the 255/256 boundary under
-        * locale; in this case return the original */
-       return (result > 256 && flags & FOLD_FLAGS_LOCALE)
-              ? c
-              : result;
-    }
-
-    /* If no special needs, just use the macro */
-    if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
-       uvchr_to_utf8(p, c);
-       return CALL_FOLD_CASE(p, p, lenp, flags & FOLD_FLAGS_FULL);
-    }
-    else {  /* Otherwise, _to_utf8_fold_flags has the intelligence to deal with
-              the special flags. */
-       U8 utf8_c[UTF8_MAXBYTES + 1];
-       uvchr_to_utf8(utf8_c, c);
-       return _to_utf8_fold_flags(utf8_c, p, lenp, flags, NULL);
-    }
-}
-
-bool
-Perl_is_uni_alnum_lc(pTHX_ UV c)
-{
-    if (c < 256) {
-        return isALNUM_LC(c);
-    }
-    return _is_uni_FOO(_CC_WORDCHAR, c);
-}
-
-bool
-Perl_is_uni_alnumc_lc(pTHX_ UV c)
-{
-    if (c < 256) {
-        return isALPHANUMERIC_LC(c);
-    }
-    return _is_uni_FOO(_CC_ALPHANUMERIC, c);
-}
-
-bool
-Perl_is_uni_idfirst_lc(pTHX_ UV c)
-{
-    if (c < 256) {
-        return isIDFIRST_LC(c);
-    }
-    return _is_uni_perl_idstart(c);
-}
-
-bool
-Perl_is_uni_alpha_lc(pTHX_ UV c)
-{
-    if (c < 256) {
-        return isALPHA_LC(c);
-    }
-    return _is_uni_FOO(_CC_ALPHA, c);
-}
-
-bool
-Perl_is_uni_ascii_lc(pTHX_ UV c)
-{
-    if (c < 256) {
-        return isASCII_LC(c);
-    }
-    return 0;
-}
-
-bool
-Perl_is_uni_blank_lc(pTHX_ UV c)
-{
-    if (c < 256) {
-        return isBLANK_LC(c);
-    }
-    return isBLANK_uni(c);
-}
-
-bool
-Perl_is_uni_space_lc(pTHX_ UV c)
-{
-    if (c < 256) {
-        return isSPACE_LC(c);
-    }
-    return isSPACE_uni(c);
-}
-
-bool
-Perl_is_uni_digit_lc(pTHX_ UV c)
-{
-    if (c < 256) {
-        return isDIGIT_LC(c);
-    }
-    return _is_uni_FOO(_CC_DIGIT, c);
-}
-
-bool
-Perl_is_uni_upper_lc(pTHX_ UV c)
-{
-    if (c < 256) {
-        return isUPPER_LC(c);
-    }
-    return _is_uni_FOO(_CC_UPPER, c);
-}
-
-bool
-Perl_is_uni_lower_lc(pTHX_ UV c)
-{
-    if (c < 256) {
-        return isLOWER_LC(c);
-    }
-    return _is_uni_FOO(_CC_LOWER, c);
-}
-
-bool
-Perl_is_uni_cntrl_lc(pTHX_ UV c)
-{
-    if (c < 256) {
-        return isCNTRL_LC(c);
-    }
-    return 0;
-}
-
-bool
-Perl_is_uni_graph_lc(pTHX_ UV c)
-{
-    if (c < 256) {
-        return isGRAPH_LC(c);
-    }
-    return _is_uni_FOO(_CC_GRAPH, c);
-}
-
-bool
-Perl_is_uni_print_lc(pTHX_ UV c)
-{
-    if (c < 256) {
-        return isPRINT_LC(c);
-    }
-    return _is_uni_FOO(_CC_PRINT, c);
-}
-
-bool
-Perl_is_uni_punct_lc(pTHX_ UV c)
-{
-    if (c < 256) {
-        return isPUNCT_LC(c);
-    }
-    return _is_uni_FOO(_CC_PUNCT, c);
-}
-
-bool
-Perl_is_uni_xdigit_lc(pTHX_ UV c)
-{
-    if (c < 256) {
-       return isXDIGIT_LC(c);
-    }
-    return isXDIGIT_uni(c);
-}
-
-U32
-Perl_to_uni_upper_lc(pTHX_ U32 c)
-{
-    /* XXX returns only the first character -- do not use XXX */
-    /* XXX no locale support yet */
-    STRLEN len;
-    U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
-    return (U32)to_uni_upper(c, tmpbuf, &len);
-}
-
-U32
-Perl_to_uni_title_lc(pTHX_ U32 c)
-{
-    /* XXX returns only the first character XXX -- do not use XXX */
-    /* XXX no locale support yet */
-    STRLEN len;
-    U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
-    return (U32)to_uni_title(c, tmpbuf, &len);
-}
-
-U32
-Perl_to_uni_lower_lc(pTHX_ U32 c)
-{
-    /* XXX returns only the first character -- do not use XXX */
-    /* XXX no locale support yet */
-    STRLEN len;
-    U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
-    return (U32)to_uni_lower(c, tmpbuf, &len);
-}
-
-PERL_STATIC_INLINE bool
-S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
-                const char *const swashname)
-{
-    /* returns a boolean giving whether or not the UTF8-encoded character that
-     * starts at <p> is in the swash indicated by <swashname>.  <swash>
-     * contains a pointer to where the swash indicated by <swashname>
-     * is to be stored; which this routine will do, so that future calls will
-     * look at <*swash> and only generate a swash if it is not null
-     *
-     * Note that it is assumed that the buffer length of <p> is enough to
-     * contain all the bytes that comprise the character.  Thus, <*p> should
-     * have been checked before this call for mal-formedness enough to assure
-     * that. */
-
-    dVAR;
-
-    PERL_ARGS_ASSERT_IS_UTF8_COMMON;
-
-    /* The API should have included a length for the UTF-8 character in <p>,
-     * but it doesn't.  We therefore assume that p has been validated at least
-     * as far as there being enough bytes available in it to accommodate the
-     * character without reading beyond the end, and pass that number on to the
-     * validating routine */
-    if (! is_utf8_char_buf(p, p + UTF8SKIP(p))) {
-        if (ckWARN_d(WARN_UTF8)) {
-            Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED,WARN_UTF8),
-                   "Passing malformed UTF-8 to \"%s\" is deprecated", swashname);
-            if (ckWARN(WARN_UTF8)) {    /* This will output details as to the
-                                           what the malformation is */
-                utf8_to_uvchr_buf(p, p + UTF8SKIP(p), NULL);
-            }
-        }
-        return FALSE;
-    }
-    if (!*swash) {
-        U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
-        *swash = _core_swash_init("utf8", swashname, &PL_sv_undef, 1, 0, NULL, &flags);
-    }
-
-    return swash_fetch(*swash, p, TRUE) != 0;
-}
-
-bool
-Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p)
-{
-    dVAR;
-
-    PERL_ARGS_ASSERT__IS_UTF8_FOO;
-
-    assert(classnum < _FIRST_NON_SWASH_CC);
-
-    return is_utf8_common(p, &PL_utf8_swash_ptrs[classnum], swash_property_names[classnum]);
-}
-
-bool
-Perl_is_utf8_alnum(pTHX_ const U8 *p)
-{
-    dVAR;
-
-    PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
-
-    /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
-     * descendant of isalnum(3), in other words, it doesn't
-     * contain the '_'. --jhi */
-    return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_WORDCHAR], "IsWord");
-}
-
-bool
-Perl_is_utf8_alnumc(pTHX_ const U8 *p)
-{
-    dVAR;
-
-    PERL_ARGS_ASSERT_IS_UTF8_ALNUMC;
-
-    return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_ALPHANUMERIC], "IsAlnum");
-}
-
-bool
-Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
-{
-    dVAR;
-
-    PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
-
-    return S_is_utf8_idfirst(aTHX_ p);
-}
-
-bool
-Perl_is_utf8_xidfirst(pTHX_ const U8 *p) /* The naming is historical. */
-{
-    dVAR;
-
-    PERL_ARGS_ASSERT_IS_UTF8_XIDFIRST;
-
-    if (*p == '_')
-       return TRUE;
-    /* is_utf8_idstart would be more logical. */
-    return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart");
-}
-
-bool
-Perl__is_utf8_perl_idstart(pTHX_ const U8 *p)
-{
-    dVAR;
-
-    PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
-
-    return is_utf8_common(p, &PL_utf8_perl_idstart, "_Perl_IDStart");
-}
-
-bool
-Perl__is_utf8_perl_idcont(pTHX_ const U8 *p)
-{
-    dVAR;
-
-    PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT;
-
-    return is_utf8_common(p, &PL_utf8_perl_idcont, "_Perl_IDCont");
-}
-
-
-bool
-Perl_is_utf8_idcont(pTHX_ const U8 *p)
-{
-    dVAR;
-
-    PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
-
-    return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
-}
-
-bool
-Perl_is_utf8_xidcont(pTHX_ const U8 *p)
-{
-    dVAR;
-
-    PERL_ARGS_ASSERT_IS_UTF8_XIDCONT;
-
-    return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue");
-}
-
-bool
-Perl_is_utf8_alpha(pTHX_ const U8 *p)
-{
-    dVAR;
-
-    PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
-
-    return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_ALPHA], "IsAlpha");
-}
-
-bool
-Perl_is_utf8_ascii(pTHX_ const U8 *p)
-{
-    dVAR;
-
-    PERL_ARGS_ASSERT_IS_UTF8_ASCII;
-
-    /* ASCII characters are the same whether in utf8 or not.  So the macro
-     * works on both utf8 and non-utf8 representations. */
-    return isASCII(*p);
-}
-
-bool
-Perl_is_utf8_blank(pTHX_ const U8 *p)
-{
-    dVAR;
-
-    PERL_ARGS_ASSERT_IS_UTF8_BLANK;
-
-    return isBLANK_utf8(p);
-}
-
-bool
-Perl_is_utf8_space(pTHX_ const U8 *p)
-{
-    dVAR;
-
-    PERL_ARGS_ASSERT_IS_UTF8_SPACE;
-
-    return isSPACE_utf8(p);
-}
+    else if ((flags & FOLD_FLAGS_FULL) && c == LATIN_SMALL_LETTER_SHARP_S) {
  
-bool
-Perl_is_utf8_perl_space(pTHX_ const U8 *p)
-{
-    dVAR;
+        /* If can't cross 127/128 boundary, can't return "ss"; instead return
+         * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
+         * under those circumstances. */
+        if (flags & FOLD_FLAGS_NOMIX_ASCII) {
+            *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
+            Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
+                 p, *lenp, U8);
+            return LATIN_SMALL_LETTER_LONG_S;
+        }
+        else {
+            *(p)++ = 's';
+            *p = 's';
+            *lenp = 2;
+            return 's';
+        }
+    }
+    else { /* In this range the fold of all other characters is their lower
+              case */
+       converted = toLOWER_LATIN1(c);
+    }
  
-    PERL_ARGS_ASSERT_IS_UTF8_PERL_SPACE;
+    if (UVCHR_IS_INVARIANT(converted)) {
+       *p = (U8) converted;
+       *lenp = 1;
+    }
+    else {
+       *(p)++ = UTF8_TWO_BYTE_HI(converted);
+       *p = UTF8_TWO_BYTE_LO(converted);
+       *lenp = 2;
+    }
  
-    /* Only true if is an ASCII space-like character, and ASCII is invariant
-     * under utf8, so can just use the macro */
-    return isSPACE_A(*p);
+    return converted;
  }
  
-bool
-Perl_is_utf8_perl_word(pTHX_ const U8 *p)
+UV
+Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
  {
-    dVAR;
  
-    PERL_ARGS_ASSERT_IS_UTF8_PERL_WORD;
+    /* Not currently externally documented, and subject to change
+     *  <flags> bits meanings:
+     *     FOLD_FLAGS_FULL  iff full folding is to be used;
+     *     FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
+     *                       locale are to be used.
+     *     FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
+     */
  
-    /* Only true if is an ASCII word character, and ASCII is invariant
-     * under utf8, so can just use the macro */
-    return isWORDCHAR_A(*p);
-}
+    PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
  
-bool
-Perl_is_utf8_digit(pTHX_ const U8 *p)
-{
-    dVAR;
+    /* Treat a UTF-8 locale as not being in locale at all */
+    if (IN_UTF8_CTYPE_LOCALE) {
+        flags &= ~FOLD_FLAGS_LOCALE;
+    }
  
-    PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
+    if (c < 256) {
+       UV result = _to_fold_latin1((U8) c, p, lenp,
+                           flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
+       /* It is illegal for the fold to cross the 255/256 boundary under
+        * locale; in this case return the original */
+       return (result > 256 && flags & FOLD_FLAGS_LOCALE)
+              ? c
+              : result;
+    }
  
-    return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_DIGIT], "IsDigit");
+    /* If no special needs, just use the macro */
+    if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
+       uvchr_to_utf8(p, c);
+       return CALL_FOLD_CASE(p, p, lenp, flags & FOLD_FLAGS_FULL);
+    }
+    else {  /* Otherwise, _to_utf8_fold_flags has the intelligence to deal with
+              the special flags. */
+       U8 utf8_c[UTF8_MAXBYTES + 1];
+       uvchr_to_utf8(utf8_c, c);
+       return _to_utf8_fold_flags(utf8_c, p, lenp, flags);
+    }
  }
  
-bool
-Perl_is_utf8_posix_digit(pTHX_ const U8 *p)
+PERL_STATIC_INLINE bool
+S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
+                const char *const swashname, SV* const invlist)
  {
-    dVAR;
+    /* returns a boolean giving whether or not the UTF8-encoded character that
+     * starts at <p> is in the swash indicated by <swashname>.  <swash>
+     * contains a pointer to where the swash indicated by <swashname>
+     * is to be stored; which this routine will do, so that future calls will
+     * look at <*swash> and only generate a swash if it is not null.  <invlist>
+     * is NULL or an inversion list that defines the swash.  If not null, it
+     * saves time during initialization of the swash.
+     *
+     * Note that it is assumed that the buffer length of <p> is enough to
+     * contain all the bytes that comprise the character.  Thus, <*p> should
+     * have been checked before this call for mal-formedness enough to assure
+     * that. */
  
-    PERL_ARGS_ASSERT_IS_UTF8_POSIX_DIGIT;
+    PERL_ARGS_ASSERT_IS_UTF8_COMMON;
  
-    /* Only true if is an ASCII digit character, and ASCII is invariant
-     * under utf8, so can just use the macro */
-    return isDIGIT_A(*p);
-}
+    /* The API should have included a length for the UTF-8 character in <p>,
+     * but it doesn't.  We therefore assume that p has been validated at least
+     * as far as there being enough bytes available in it to accommodate the
+     * character without reading beyond the end, and pass that number on to the
+     * validating routine */
+    if (! isUTF8_CHAR(p, p + UTF8SKIP(p))) {
+        if (ckWARN_d(WARN_UTF8)) {
+            Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED,WARN_UTF8),
+                   "Passing malformed UTF-8 to \"%s\" is deprecated", swashname);
+            if (ckWARN(WARN_UTF8)) {    /* This will output details as to the
+                                           nature of the malformation */
+                utf8_to_uvchr_buf(p, p + UTF8SKIP(p), NULL);
+            }
+        }
+        return FALSE;
+    }
+    if (!*swash) {
+        U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
+        *swash = _core_swash_init("utf8",
  
-bool
-Perl_is_utf8_upper(pTHX_ const U8 *p)
-{
-    dVAR;
+                                  /* Only use the name if there is no inversion
+                                   * list; otherwise will go out to disk */
+                                  (invlist) ? "" : swashname,
  
-    PERL_ARGS_ASSERT_IS_UTF8_UPPER;
+                                  &PL_sv_undef, 1, 0, invlist, &flags);
+    }
  
-    return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_UPPER], "IsUppercase");
+    return swash_fetch(*swash, p, TRUE) != 0;
  }
  
  bool
-Perl_is_utf8_lower(pTHX_ const U8 *p)
+Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p)
  {
-    dVAR;
+    PERL_ARGS_ASSERT__IS_UTF8_FOO;
  
-    PERL_ARGS_ASSERT_IS_UTF8_LOWER;
+    assert(classnum < _FIRST_NON_SWASH_CC);
  
-    return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_LOWER], "IsLowercase");
+    return is_utf8_common(p,
+                          &PL_utf8_swash_ptrs[classnum],
+                          swash_property_names[classnum],
+                          PL_XPosix_ptrs[classnum]);
  }
  
  bool
-Perl_is_utf8_cntrl(pTHX_ const U8 *p)
+Perl__is_utf8_perl_idstart(pTHX_ const U8 *p)
  {
-    dVAR;
+    SV* invlist = NULL;
  
-    PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
+    PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
  
-    return isCNTRL_utf8(p);
+    if (! PL_utf8_perl_idstart) {
+        invlist = _new_invlist_C_array(_Perl_IDStart_invlist);
+    }
+    return is_utf8_common(p, &PL_utf8_perl_idstart, "_Perl_IDStart", invlist);
  }
  
  bool
-Perl_is_utf8_graph(pTHX_ const U8 *p)
+Perl__is_utf8_xidstart(pTHX_ const U8 *p)
  {
-    dVAR;
-
-    PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
+    PERL_ARGS_ASSERT__IS_UTF8_XIDSTART;
  
-    return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_GRAPH], "IsGraph");
+    if (*p == '_')
+       return TRUE;
+    return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart", NULL);
  }
  
  bool
-Perl_is_utf8_print(pTHX_ const U8 *p)
+Perl__is_utf8_perl_idcont(pTHX_ const U8 *p)
  {
-    dVAR;
+    SV* invlist = NULL;
  
-    PERL_ARGS_ASSERT_IS_UTF8_PRINT;
+    PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT;
  
-    return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_PRINT], "IsPrint");
+    if (! PL_utf8_perl_idcont) {
+        invlist = _new_invlist_C_array(_Perl_IDCont_invlist);
+    }
+    return is_utf8_common(p, &PL_utf8_perl_idcont, "_Perl_IDCont", invlist);
  }
  
  bool
-Perl_is_utf8_punct(pTHX_ const U8 *p)
+Perl__is_utf8_idcont(pTHX_ const U8 *p)
  {
-    dVAR;
-
-    PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
+    PERL_ARGS_ASSERT__IS_UTF8_IDCONT;
  
-    return is_utf8_common(p, &PL_utf8_swash_ptrs[_CC_PUNCT], "IsPunct");
+    return is_utf8_common(p, &PL_utf8_idcont, "IdContinue", NULL);
  }
  
  bool
-Perl_is_utf8_xdigit(pTHX_ const U8 *p)
+Perl__is_utf8_xidcont(pTHX_ const U8 *p)
  {
-    dVAR;
+    PERL_ARGS_ASSERT__IS_UTF8_XIDCONT;
  
-    PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
-
-    return is_XDIGIT_utf8(p);
+    return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue", NULL);
  }
  
  bool
  Perl__is_utf8_mark(pTHX_ const U8 *p)
  {
-    dVAR;
-
      PERL_ARGS_ASSERT__IS_UTF8_MARK;
  
-    return is_utf8_common(p, &PL_utf8_mark, "IsM");
-}
-
-
-bool
-Perl_is_utf8_mark(pTHX_ const U8 *p)
-{
-    dVAR;
-
-    PERL_ARGS_ASSERT_IS_UTF8_MARK;
-
-    return _is_utf8_mark(p);
+    return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL);
  }
  
  /*
  =for apidoc to_utf8_case
  
-The C<p> contains the pointer to the UTF-8 string encoding
+C<p> contains the pointer to the UTF-8 string encoding
  the character that is being converted.  This routine assumes that the character
  at C<p> is well-formed.
  
-The C<ustrp> is a pointer to the character buffer to put the
-conversion result to.  The C<lenp> is a pointer to the length
+C<ustrp> is a pointer to the character buffer to put the
+conversion result to.  C<lenp> is a pointer to the length
  of the result.
  
-The C<swashp> is a pointer to the swash to use.
+C<swashp> is a pointer to the swash to use.
  
  Both the special and normal mappings are stored in F<lib/unicore/To/Foo.pl>,
-and loaded by SWASHNEW, using F<lib/utf8_heavy.pl>.  The C<special> (usually,
+and loaded by SWASHNEW, using F<lib/utf8_heavy.pl>.  C<special> (usually,
  but not always, a multicharacter mapping), is tried first.
  
-The C<special> is a string like "utf8::ToSpecLower", which means the
-hash %utf8::ToSpecLower.  The access to the hash is through
-Perl_to_utf8_case().
+C<special> is a string, normally C<NULL> or C<"">.  C<NULL> means to not use
+any special mappings; C<""> means to use the special mappings.  Values other
+than these two are treated as the name of the hash containing the special
+mappings, like C<"utf8::ToSpecLower">.
  
-The C<normal> is a string like "ToLower" which means the swash
+C<normal> is a string like "ToLower" which means the swash
  %utf8::ToLower.
  
  =cut */
@@ -2385,7 +1769,6 @@ UV
  Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
                         SV **swashp, const char *normal, const char *special)
  {
-    dVAR;
      STRLEN len = 0;
      const UV uv1 = valid_utf8_to_uvchr(p, NULL);
  
@@ -2419,12 +1802,25 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
      if (special) {
           /* It might be "special" (sometimes, but not always,
           * a multicharacter mapping) */
-        HV * const hv = get_hv(special, 0);
+         HV *hv = NULL;
          SV **svp;
  
-        if (hv &&
-            (svp = hv_fetch(hv, (const char*)p, UNISKIP(uv1), FALSE)) &&
-            (*svp)) {
+        /* If passed in the specials name, use that; otherwise use any
+         * given in the swash */
+         if (*special != '\0') {
+            hv = get_hv(special, 0);
+        }
+        else {
+            svp = hv_fetchs(MUTABLE_HV(SvRV(*swashp)), "SPECIALS", 0);
+            if (svp) {
+                hv = MUTABLE_HV(SvRV(*svp));
+            }
+        }
+
+        if (hv
+             && (svp = hv_fetch(hv, (const char*)p, UNISKIP(uv1), FALSE))
+             && (*svp))
+         {
              const char *s;
  
               s = SvPV_const(*svp, len);
@@ -2471,9 +1867,10 @@ STATIC UV
  S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
  {
      /* This is called when changing the case of a utf8-encoded character above
-     * the Latin1 range, and the operation is in locale.  If the result
-     * contains a character that crosses the 255/256 boundary, disallow the
-     * change, and return the original code point.  See L<perlfunc/lc> for why;
+     * the Latin1 range, and the operation is in a non-UTF-8 locale.  If the
+     * result contains a character that crosses the 255/256 boundary, disallow
+     * the change, and return the original code point.  See L<perlfunc/lc> for
+     * why;
       *
       * p       points to the original string whose case was changed; assumed
       *          by this routine to be well-formed
@@ -2522,19 +1919,20 @@ Instead use L</toUPPER_utf8>.
  =cut */
  
  /* Not currently externally documented, and subject to change:
- * <flags> is set iff locale semantics are to be used for code points < 256
- * <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
- *              were used in the calculation; otherwise unchanged. */
+ * <flags> is set iff the rules from the current underlying locale are to
+ *         be used. */
  
  UV
-Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
+Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
  {
-    dVAR;
-
      UV result;
  
      PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
  
+    if (flags && IN_UTF8_CTYPE_LOCALE) {
+        flags = FALSE;
+    }
+
      if (UTF8_IS_INVARIANT(*p)) {
         if (flags) {
             result = toUPPER_LC(*p);
@@ -2545,7 +1943,8 @@ Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool
      }
      else if UTF8_IS_DOWNGRADEABLE_START(*p) {
         if (flags) {
-           result = toUPPER_LC(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)));
+            U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
+           result = toUPPER_LC(c);
         }
         else {
             return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
@@ -2567,14 +1966,11 @@ Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool
         *lenp = 1;
      }
      else {
-       *ustrp = UTF8_EIGHT_BIT_HI(result);
-       *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
+       *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
+       *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
         *lenp = 2;
      }
  
-    if (tainted_ptr) {
-       *tainted_ptr = TRUE;
-    }
      return result;
  }
  
@@ -2586,21 +1982,22 @@ Instead use L</toTITLE_utf8>.
  =cut */
  
  /* Not currently externally documented, and subject to change:
- * <flags> is set iff locale semantics are to be used for code points < 256
- *        Since titlecase is not defined in POSIX, uppercase is used instead
- *        for these/
- * <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
- *              were used in the calculation; otherwise unchanged. */
+ * <flags> is set iff the rules from the current underlying locale are to be
+ *         used.  Since titlecase is not defined in POSIX, for other than a
+ *         UTF-8 locale, uppercase is used instead for code points < 256.
+ */
  
  UV
-Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
+Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
  {
-    dVAR;
-
      UV result;
  
      PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
  
+    if (flags && IN_UTF8_CTYPE_LOCALE) {
+        flags = FALSE;
+    }
+
      if (UTF8_IS_INVARIANT(*p)) {
         if (flags) {
             result = toUPPER_LC(*p);
@@ -2611,7 +2008,8 @@ Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool
      }
      else if UTF8_IS_DOWNGRADEABLE_START(*p) {
         if (flags) {
-           result = toUPPER_LC(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)));
+            U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
+           result = toUPPER_LC(c);
         }
         else {
             return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
@@ -2633,14 +2031,11 @@ Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool
         *lenp = 1;
      }
      else {
-       *ustrp = UTF8_EIGHT_BIT_HI(result);
-       *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
+       *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
+       *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
         *lenp = 2;
      }
  
-    if (tainted_ptr) {
-       *tainted_ptr = TRUE;
-    }
      return result;
  }
  
@@ -2652,19 +2047,21 @@ Instead use L</toLOWER_utf8>.
  =cut */
  
  /* Not currently externally documented, and subject to change:
- * <flags> is set iff locale semantics are to be used for code points < 256
- * <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
- *              were used in the calculation; otherwise unchanged. */
+ * <flags> is set iff the rules from the current underlying locale are to
+ *         be used.
+ */
  
  UV
-Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
+Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
  {
      UV result;
  
-    dVAR;
-
      PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
  
+    if (flags && IN_UTF8_CTYPE_LOCALE) {
+        flags = FALSE;
+    }
+
      if (UTF8_IS_INVARIANT(*p)) {
         if (flags) {
             result = toLOWER_LC(*p);
@@ -2675,7 +2072,8 @@ Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool
      }
      else if UTF8_IS_DOWNGRADEABLE_START(*p) {
         if (flags) {
-           result = toLOWER_LC(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)));
+            U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
+           result = toLOWER_LC(c);
         }
         else {
             return to_lower_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
@@ -2698,14 +2096,11 @@ Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool
         *lenp = 1;
      }
      else {
-       *ustrp = UTF8_EIGHT_BIT_HI(result);
-       *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
+       *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
+       *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
         *lenp = 2;
      }
  
-    if (tainted_ptr) {
-       *tainted_ptr = TRUE;
-    }
      return result;
  }
  
@@ -2718,21 +2113,17 @@ Instead use L</toFOLD_utf8>.
  
  /* Not currently externally documented, and subject to change,
   * in <flags>
- *     bit FOLD_FLAGS_LOCALE is set iff locale semantics are to be used for code
- *                           points < 256.  Since foldcase is not defined in
- *                           POSIX, lowercase is used instead
+ *     bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
+ *                           locale are to be used.
   *      bit FOLD_FLAGS_FULL   is set iff full case folds are to be used;
   *                           otherwise simple folds
   *      bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
   *                           prohibited
- * <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
- *              were used in the calculation; otherwise unchanged. */
+ */
  
  UV
-Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, bool* tainted_ptr)
+Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
  {
-    dVAR;
-
      UV result;
  
      PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
@@ -2742,6 +2133,10 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b
  
      assert(p != ustrp); /* Otherwise overwrites */
  
+    if (flags & FOLD_FLAGS_LOCALE && IN_UTF8_CTYPE_LOCALE) {
+        flags &= ~FOLD_FLAGS_LOCALE;
+    }
+
      if (UTF8_IS_INVARIANT(*p)) {
         if (flags & FOLD_FLAGS_LOCALE) {
             result = toFOLD_LC(*p);
@@ -2753,7 +2148,8 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b
      }
      else if UTF8_IS_DOWNGRADEABLE_START(*p) {
         if (flags & FOLD_FLAGS_LOCALE) {
-           result = toFOLD_LC(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)));
+            U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
+           result = toFOLD_LC(c);
         }
         else {
             return _to_fold_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
@@ -2766,14 +2162,20 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b
  
         if (flags & FOLD_FLAGS_LOCALE) {
  
-            /* Special case this character, as what normally gets returned
-             * under locale doesn't work */
+            /* Special case these two characters, as what normally gets
+             * returned under locale doesn't work */
              if (UTF8SKIP(p) == sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1
                  && memEQ((char *) p, LATIN_CAPITAL_LETTER_SHARP_S_UTF8,
                            sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1))
              {
                  goto return_long_s;
              }
+            else if (UTF8SKIP(p) == sizeof(LATIN_SMALL_LIGATURE_LONG_S_T) - 1
+                && memEQ((char *) p, LATIN_SMALL_LIGATURE_LONG_S_T_UTF8,
+                          sizeof(LATIN_SMALL_LIGATURE_LONG_S_T_UTF8) - 1))
+            {
+                goto return_ligature_st;
+            }
             return check_locale_boundary_crossing(p, result, ustrp, lenp);
         }
         else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
@@ -2781,8 +2183,8 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b
         }
         else {
             /* This is called when changing the case of a utf8-encoded
-            * character above the Latin1 range, and the result should not
-            * contain an ASCII character. */
+             * character above the ASCII range, and the result should not
+             * contain an ASCII character. */
  
             UV original;    /* To store the first code point of <p> */
  
@@ -2795,11 +2197,16 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b
                     /* Crossed, have to return the original */
                     original = valid_utf8_to_uvchr(p, lenp);
  
-                    /* But in this one instance, there is an alternative we can
+                    /* But in these instances, there is an alternative we can
                       * return that is valid */
-                    if (original == LATIN_CAPITAL_LETTER_SHARP_S) {
+                    if (original == LATIN_CAPITAL_LETTER_SHARP_S
+                        || original == LATIN_SMALL_LETTER_SHARP_S)
+                    {
                          goto return_long_s;
                      }
+                    else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
+                        goto return_ligature_st;
+                    }
                     Copy(p, ustrp, *lenp, char);
                     return original;
                 }
@@ -2817,14 +2224,11 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b
         *lenp = 1;
      }
      else {
-       *ustrp = UTF8_EIGHT_BIT_HI(result);
-       *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
+       *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
+       *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
         *lenp = 2;
      }
  
-    if (tainted_ptr) {
-       *tainted_ptr = TRUE;
-    }
      return result;
  
    return_long_s:
@@ -2838,6 +2242,14 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b
      Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
          ustrp, *lenp, U8);
      return LATIN_SMALL_LETTER_LONG_S;
+
+  return_ligature_st:
+    /* Two folds to 'st' are prohibited by the options; instead we pick one and
+     * have the other one fold to it */
+
+    *lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1;
+    Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
+    return LATIN_SMALL_LIGATURE_ST;
  }
  
  /* Note:
@@ -2861,6 +2273,14 @@ Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits
  SV*
  Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none, SV* invlist, U8* const flags_p)
  {
+
+    /*NOTE NOTE NOTE - If you want to use "return" in this routine you MUST
+     * use the following define */
+
+#define CORE_SWASH_INIT_RETURN(x)   \
+    PL_curpm= old_PL_curpm;         \
+    return x
+
      /* Initialize and return a swash, creating it if necessary.  It does this
       * by calling utf8_heavy.pl in the general case.  The returned value may be
       * the swash's inversion list instead if the input parameters allow it.
@@ -2899,11 +2319,14 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m
       * Thus there are three possible inputs to find the swash: <name>,
       * <listsv>, and <invlist>.  At least one must be specified.  The result
       * will be the union of the specified ones, although <listsv>'s various
-     * actions can intersect, etc. what <name> gives.
+     * actions can intersect, etc. what <name> gives.  To avoid going out to
+     * disk at all, <invlist> should specify completely what the swash should
+     * have, and <listsv> should be &PL_sv_undef and <name> should be "".
       *
       * <invlist> is only valid for binary properties */
  
-    dVAR;
+    PMOP *old_PL_curpm= PL_curpm; /* save away the old PL_curpm */
+
      SV* retval = &PL_sv_undef;
      HV* swash_hv = NULL;
      const int invlist_swash_boundary =
@@ -2915,6 +2338,10 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m
      assert(listsv != &PL_sv_undef || strNE(name, "") || invlist);
      assert(! invlist || minbits == 1);
  
+    PL_curpm= NULL; /* reset PL_curpm so that we dont get confused between the regex
+                       that triggered the swash init and the swash init perl logic itself.
+                       See perl #122747 */
+
      /* If data was passed in to go out to utf8_heavy to find the swash of, do
       * so */
      if (listsv != &PL_sv_undef || strNE(name, "")) {
@@ -2930,7 +2357,6 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m
         PUSHSTACKi(PERLSI_MAGIC);
         ENTER;
         SAVEHINTS();
-       save_re_context();
         /* We might get here via a subroutine signature which uses a utf8
          * parameter name, at which point PL_subname will have been set
          * but not yet used. */
@@ -2942,13 +2368,9 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m
             ENTER;
             if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
             GvSV(PL_errgv) = NULL;
+#ifndef NO_TAINT_SUPPORT
             /* It is assumed that callers of this routine are not passing in
              * any user derived data.  */
-           /* Need to do this after save_re_context() as it will set
-            * PL_tainted to 1 while saving $1 etc (see the code after getrx:
-            * in Perl_magic_get).  Even line to create errsv_save can turn on
-            * PL_tainted.  */
-#ifndef NO_TAINT_SUPPORT
             SAVEBOOL(TAINT_get);
             TAINT_NOT;
  #endif
@@ -3003,12 +2425,12 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m
  
                 /* If caller wants to handle missing properties, let them */
                 if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) {
-                   return NULL;
+                    CORE_SWASH_INIT_RETURN(NULL);
                 }
                 Perl_croak(aTHX_
                            "Can't find Unicode property definition \"%"SVf"\"",
                            SVfARG(retval));
-           Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
+                NOT_REACHED; /* NOTREACHED */
         }
      } /* End of calling the module to find the swash */
  
@@ -3081,7 +2503,7 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m
  
          /* Here, we have computed the union of all the passed-in data.  It may
           * be that there was an inversion list in the swash which didn't get
-         * touched; otherwise save the one computed one */
+         * touched; otherwise save the computed one */
         if (! invlist_in_swash_is_valid
              && (int) _invlist_len(swash_invlist) > invlist_swash_boundary)
          {
@@ -3094,6 +2516,8 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m
             else SvREFCNT_inc_simple_void_NN(swash_invlist);
         }
  
+        SvREADONLY_on(swash_invlist);
+
          /* Use the inversion list stand-alone if small enough */
          if ((int) _invlist_len(swash_invlist) <= invlist_swash_boundary) {
             SvREFCNT_dec(retval);
@@ -3103,7 +2527,8 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m
          }
      }
  
-    return retval;
+    CORE_SWASH_INIT_RETURN(retval);
+#undef CORE_SWASH_INIT_RETURN
  }
  
  
@@ -3152,14 +2577,12 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m
  UV
  Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
  {
-    dVAR;
      HV *const hv = MUTABLE_HV(SvRV(swash));
      U32 klen;
      U32 off;
-    STRLEN slen;
+    STRLEN slen = 0;
      STRLEN needents;
      const U8 *tmps = NULL;
-    U32 bit;
      SV *swatch;
      const U8 c = *ptr;
  
@@ -3289,17 +2712,21 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
  
      switch ((int)((slen << 3) / needents)) {
      case 1:
-       bit = 1 << (off & 7);
-       off >>= 3;
-       return (tmps[off] & bit) != 0;
+       return ((UV) tmps[off >> 3] & (1 << (off & 7))) != 0;
      case 8:
-       return tmps[off];
+       return ((UV) tmps[off]);
      case 16:
         off <<= 1;
-       return (tmps[off] << 8) + tmps[off + 1] ;
+       return
+            ((UV) tmps[off    ] << 8) +
+            ((UV) tmps[off + 1]);
      case 32:
         off <<= 2;
-       return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
+       return
+            ((UV) tmps[off    ] << 24) +
+            ((UV) tmps[off + 1] << 16) +
+            ((UV) tmps[off + 2] <<  8) +
+            ((UV) tmps[off + 3]);
      }
      Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, "
                "slen=%"UVuf", needents=%"UVuf, (UV)slen, (UV)needents);
@@ -3337,9 +2764,12 @@ S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
      /* nl points to the next \n in the scan */
      U8* const nl = (U8*)memchr(l, '\n', lend - l);
  
+    PERL_ARGS_ASSERT_SWASH_SCAN_LIST_LINE;
+
      /* Get the first number on the line: the range minimum */
      numlen = lend - l;
      *min = grok_hex((char *)l, &numlen, &flags, NULL);
+    *max = *min;    /* So can never return without setting max */
      if (numlen)            /* If found a hex number, position past it */
         l += numlen;
      else if (nl) {         /* Else, go handle next line, if any */
@@ -3363,30 +2793,19 @@ S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
             *max = *min;
  
         /* Non-binary tables have a third entry: what the first element of the
-        * range maps to */
+        * range maps to.  The map for those currently read here is in hex */
         if (wants_value) {
             if (isBLANK(*l)) {
                 ++l;
-
-               /* The ToLc, etc table mappings are not in hex, and must be
-                * corrected by adding the code point to them */
-               if (typeto) {
-                   char *after_strtol = (char *) lend;
-                   *val = Strtol((char *)l, &after_strtol, 10);
-                   l = (U8 *) after_strtol;
-               }
-               else { /* Other tables are in hex, and are the correct result
-                         without tweaking */
-                   flags = PERL_SCAN_SILENT_ILLDIGIT
-                       | PERL_SCAN_DISALLOW_PREFIX
-                       | PERL_SCAN_SILENT_NON_PORTABLE;
-                   numlen = lend - l;
-                   *val = grok_hex((char *)l, &numlen, &flags, NULL);
-                   if (numlen)
-                       l += numlen;
-                   else
-                       *val = 0;
-               }
+                flags = PERL_SCAN_SILENT_ILLDIGIT
+                    | PERL_SCAN_DISALLOW_PREFIX
+                    | PERL_SCAN_SILENT_NON_PORTABLE;
+                numlen = lend - l;
+                *val = grok_hex((char *)l, &numlen, &flags, NULL);
+                if (numlen)
+                    l += numlen;
+                else
+                    *val = 0;
             }
             else {
                 *val = 0;
@@ -3402,7 +2821,6 @@ S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
      }
      else { /* Nothing following range min, should be single element with no
               mapping expected */
-       *max = *min;
         if (wants_value) {
             *val = 0;
             if (typeto) {
@@ -3517,8 +2935,8 @@ S_swatch_get(pTHX_ SV* swash, UV start, UV span)
      lend = l + lcur;
      while (l < lend) {
         UV min, max, val, upper;
-       l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
-                                        cBOOL(octets), typestr);
+       l = swash_scan_list_line(l, lend, &min, &max, &val,
+                                                        cBOOL(octets), typestr);
         if (l > lend) {
             break;
         }
@@ -3758,6 +3176,9 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash)
      * have two elements, the utf8 for itself, and for 004C.  For 006B, there
      * would be three elements in its array, the utf8 for 006B, 004B and 212A.
      *
+    * Note that there are no elements in the hash for 004B, 004C, 212A.  The
+    * keys are only code points that are folded-to, so it isn't a full closure.
+    *
      * Essentially, for any code point, it gives all the code points that map to
      * it, or the list of 'froms' for that point.
      *
@@ -3877,12 +3298,12 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash)
         while ((from_list = (AV *) hv_iternextsv(specials_inverse,
                                                  &char_to, &to_len)))
         {
-           if (av_len(from_list) > 0) {
+           if (av_tindex(from_list) > 0) {
                 SSize_t i;
  
                 /* We iterate over all combinations of i,j to place each code
                  * point on each list */
-               for (i = 0; i <= av_len(from_list); i++) {
+               for (i = 0; i <= av_tindex(from_list); i++) {
                     SSize_t j;
                     AV* i_list = newAV();
                     SV** entryp = av_fetch(from_list, i, FALSE);
@@ -3898,8 +3319,8 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash)
                         Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
                     }
  
-                   /* For debugging: UV u = valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/
-                   for (j = 0; j <= av_len(from_list); j++) {
+                   /* For DEBUG_U: UV u = valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/
+                   for (j = 0; j <= av_tindex(from_list); j++) {
                         entryp = av_fetch(from_list, j, FALSE);
                         if (entryp == NULL) {
                             Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
@@ -3926,8 +3347,8 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash)
      while (l < lend) {
         UV min, max, val;
         UV inverse;
-       l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
-                                        cBOOL(octets), typestr);
+       l = swash_scan_list_line(l, lend, &min, &max, &val,
+                                                     cBOOL(octets), typestr);
         if (l > lend) {
             break;
         }
@@ -3958,18 +3379,20 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash)
  
             /* Look through list to see if this inverse mapping already is
              * listed, or if there is a mapping to itself already */
-           for (i = 0; i <= av_len(list); i++) {
+           for (i = 0; i <= av_tindex(list); i++) {
                 SV** entryp = av_fetch(list, i, FALSE);
                 SV* entry;
+               UV uv;
                 if (entryp == NULL) {
                     Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
                 }
                 entry = *entryp;
-               /*DEBUG_U(PerlIO_printf(Perl_debug_log, "list for %"UVXf" contains %"UVXf"\n", val, SvUV(entry)));*/
-               if (SvUV(entry) == val) {
+               uv = SvUV(entry);
+               /*DEBUG_U(PerlIO_printf(Perl_debug_log, "list for %"UVXf" contains %"UVXf"\n", val, uv));*/
+               if (uv == val) {
                     found_key = TRUE;
                 }
-               if (SvUV(entry) == inverse) {
+               if (uv == inverse) {
                     found_inverse = TRUE;
                 }
  
@@ -4069,43 +3492,79 @@ Perl__swash_to_invlist(pTHX_ SV* const swash)
      loc = (char *) l;
      lend = l + lcur;
  
-    /* Scan the input to count the number of lines to preallocate array size
-     * based on worst possible case, which is each line in the input creates 2
-     * elements in the inversion list: 1) the beginning of a range in the list;
-     * 2) the beginning of a range not in the list.  */
-    while ((loc = (strchr(loc, '\n'))) != NULL) {
-       elements += 2;
-       loc++;
-    }
+    if (*l == 'V') {    /*  Inversion list format */
+        const char *after_atou = (char *) lend;
+        UV element0;
+        UV* other_elements_ptr;
  
-    /* If the ending is somehow corrupt and isn't a new line, add another
-     * element for the final range that isn't in the inversion list */
-    if (! (*lend == '\n'
-       || (*lend == '\0' && (lcur == 0 || *(lend - 1) == '\n'))))
-    {
-       elements++;
+        /* The first number is a count of the rest */
+        l++;
+        elements = grok_atou((const char *)l, &after_atou);
+        if (elements == 0) {
+            invlist = _new_invlist(0);
+        }
+        else {
+            while (isSPACE(*l)) l++;
+            l = (U8 *) after_atou;
+
+            /* Get the 0th element, which is needed to setup the inversion list */
+            while (isSPACE(*l)) l++;
+            element0 = (UV) grok_atou((const char *)l, &after_atou);
+            l = (U8 *) after_atou;
+            invlist = _setup_canned_invlist(elements, element0, &other_elements_ptr);
+            elements--;
+
+            /* Then just populate the rest of the input */
+            while (elements-- > 0) {
+                if (l > lend) {
+                    Perl_croak(aTHX_ "panic: Expecting %"UVuf" more elements than available", elements);
+                }
+                while (isSPACE(*l)) l++;
+                *other_elements_ptr++ = (UV) grok_atou((const char *)l, &after_atou);
+                l = (U8 *) after_atou;
+            }
+        }
      }
+    else {
  
-    invlist = _new_invlist(elements);
+        /* Scan the input to count the number of lines to preallocate array
+         * size based on worst possible case, which is each line in the input
+         * creates 2 elements in the inversion list: 1) the beginning of a
+         * range in the list; 2) the beginning of a range not in the list.  */
+        while ((loc = (strchr(loc, '\n'))) != NULL) {
+            elements += 2;
+            loc++;
+        }
  
-    /* Now go through the input again, adding each range to the list */
-    while (l < lend) {
-       UV start, end;
-       UV val;         /* Not used by this function */
+        /* If the ending is somehow corrupt and isn't a new line, add another
+         * element for the final range that isn't in the inversion list */
+        if (! (*lend == '\n'
+            || (*lend == '\0' && (lcur == 0 || *(lend - 1) == '\n'))))
+        {
+            elements++;
+        }
  
-       l = S_swash_scan_list_line(aTHX_ l, lend, &start, &end, &val,
-                                        cBOOL(octets), typestr);
+        invlist = _new_invlist(elements);
  
-       if (l > lend) {
-           break;
-       }
+        /* Now go through the input again, adding each range to the list */
+        while (l < lend) {
+            UV start, end;
+            UV val;            /* Not used by this function */
+
+            l = swash_scan_list_line(l, lend, &start, &end, &val,
+                                                        cBOOL(octets), typestr);
+
+            if (l > lend) {
+                break;
+            }
  
-       invlist = _add_range_to_invlist(invlist, start, end);
+            invlist = _add_range_to_invlist(invlist, start, end);
+        }
      }
  
      /* Invert if the data says it should be */
      if (invert_it_svp && SvUV(*invert_it_svp)) {
-       _invlist_invert_prop(invlist);
+       _invlist_invert(invlist);
      }
  
      /* This code is copied from swatch_get()
@@ -4182,6 +3641,7 @@ Perl__swash_to_invlist(pTHX_ SV* const swash)
         sv_free(other); /* through with it! */
      }
  
+    SvREADONLY_on(invlist);
      return invlist;
  }
  
@@ -4210,68 +3670,6 @@ Perl__get_swash_invlist(pTHX_ SV* const swash)
      return *ptr;
  }
  
-/*
-=for apidoc uvchr_to_utf8
-
-Adds the UTF-8 representation of the Native code point C<uv> to the end
-of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
-bytes available. The return value is the pointer to the byte after the
-end of the new character. In other words,
-
-    d = uvchr_to_utf8(d, uv);
-
-is the recommended wide native character-aware way of saying
-
-    *(d++) = uv;
-
-=cut
-*/
-
-/* On ASCII machines this is normally a macro but we want a
-   real function in case XS code wants it
-*/
-U8 *
-Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
-{
-    PERL_ARGS_ASSERT_UVCHR_TO_UTF8;
-
-    return Perl_uvoffuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
-}
-
-U8 *
-Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
-{
-    PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS;
-
-    return Perl_uvoffuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
-}
-
-/*
-=for apidoc utf8n_to_uvchr
-
-Returns the native character value of the first character in the string
-C<s>
-which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
-length, in bytes, of that character.
-
-C<length> and C<flags> are the same as L</utf8n_to_uvoffuni>().
-
-=cut
-*/
-/* On ASCII machines this is normally a macro but we want
-   a real function in case XS code wants it
-*/
-UV
-Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
-U32 flags)
-{
-    const UV uv = Perl_utf8n_to_uvoffuni(aTHX_ s, curlen, retlen, flags);
-
-    PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
-
-    return UNI_TO_NATIVE(uv);
-}
-
  bool
  Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
  {
@@ -4478,18 +3876,15 @@ L<http://www.unicode.org/unicode/reports/tr21/> (Case Mappings).
   *  0 for as-documented above
   *  FOLDEQ_UTF8_NOMIX_ASCII meaning that if a non-ASCII character folds to an
                             ASCII one, to not match
- *  FOLDEQ_UTF8_LOCALE     meaning that locale rules are to be used for code
- *                         points below 256; unicode rules for above 255; and
- *                         folds that cross those boundaries are disallowed,
- *                         like the NOMIX_ASCII option
- *  FOLDEQ_S1_ALREADY_FOLDED s1 has already been folded before calling this
- *                           routine.  This allows that step to be skipped.
- *  FOLDEQ_S2_ALREADY_FOLDED   Similarly.
+ *  FOLDEQ_LOCALE          is set iff the rules from the current underlying
+ *                         locale are to be used.
+ *  FOLDEQ_S1_ALREADY_FOLDED  s1 has already been folded before calling this
+ *                            routine.  This allows that step to be skipped.
+ *  FOLDEQ_S2_ALREADY_FOLDED  Similarly.
   */
  I32
  Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const char *s2, char **pe2, UV l2, bool u2, U32 flags)
  {
-    dVAR;
      const U8 *p1  = (const U8*)s1; /* Point to current char */
      const U8 *p2  = (const U8*)s2;
      const U8 *g1 = NULL;       /* goal for s1 */
@@ -4504,10 +3899,22 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c
  
      PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
  
-    /* The algorithm requires that input with the flags on the first line of
-     * the assert not be pre-folded. */
-    assert( ! ((flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_UTF8_LOCALE))
-       && (flags & (FOLDEQ_S1_ALREADY_FOLDED | FOLDEQ_S2_ALREADY_FOLDED))));
+    assert( ! ((flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE))
+           && (flags & (FOLDEQ_S1_ALREADY_FOLDED | FOLDEQ_S2_ALREADY_FOLDED))));
+    /* The algorithm is to trial the folds without regard to the flags on
+     * the first line of the above assert(), and then see if the result
+     * violates them.  This means that the inputs can't be pre-folded to a
+     * violating result, hence the assert.  This could be changed, with the
+     * addition of extra tests here for the already-folded case, which would
+     * slow it down.  That cost outweighs any possible gain when these flags
+     * are specified, as the flags indicate /il or /iaa matching, which
+     * is less common than /iu, and I (khw) also believe that real-world /il
+     * and /iaa matches are most likely to involve code points 0-255, and this
+     * function only under rare conditions gets called for 0-255. */
+
+    if (IN_UTF8_CTYPE_LOCALE) {
+        flags &= ~FOLDEQ_LOCALE;
+    }
  
      if (pe1) {
          e1 = *(U8**)pe1;
@@ -4569,7 +3976,7 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c
                 /* If in locale matching, we use two sets of rules, depending
                  * on if the code point is above or below 255.  Here, we test
                  * for and handle locale rules */
-               if ((flags & FOLDEQ_UTF8_LOCALE)
+               if ((flags & FOLDEQ_LOCALE)
                     && (! u1 || ! UTF8_IS_ABOVE_LATIN1(*p1)))
                 {
                     /* There is no mixing of code points above and below 255. */
@@ -4614,7 +4021,7 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c
                 n2 = UTF8SKIP(f2);
             }
             else {
-               if ((flags & FOLDEQ_UTF8_LOCALE)
+               if ((flags & FOLDEQ_LOCALE)
                     && (! u2 || ! UTF8_IS_ABOVE_LATIN1(*p2)))
                 {
                     /* Here, the next char in s2 is < 256.  We've already
@@ -4704,6 +4111,66 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c
      return 1;
  }
  
+/* XXX The next two functions should likely be moved to mathoms.c once all
+ * occurrences of them are removed from the core; some cpan-upstream modules
+ * still use them */
+
+U8 *
+Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
+{
+    PERL_ARGS_ASSERT_UVUNI_TO_UTF8;
+
+    return Perl_uvoffuni_to_utf8_flags(aTHX_ d, uv, 0);
+}
+
+/*
+=for apidoc utf8n_to_uvuni
+
+Instead use L</utf8_to_uvchr_buf>, or rarely, L</utf8n_to_uvchr>.
+
+This function was useful for code that wanted to handle both EBCDIC and
+ASCII platforms with Unicode properties, but starting in Perl v5.20, the
+distinctions between the platforms have mostly been made invisible to most
+code, so this function is quite unlikely to be what you want.  If you do need
+this precise functionality, use instead
+C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>>
+or C<L<NATIVE_TO_UNI(utf8n_to_uvchr(...))|/utf8n_to_uvchr>>.
+
+=cut
+*/
+
+UV
+Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
+{
+    PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
+
+    return NATIVE_TO_UNI(utf8n_to_uvchr(s, curlen, retlen, flags));
+}
+
+/*
+=for apidoc uvuni_to_utf8_flags
+
+Instead you almost certainly want to use L</uvchr_to_utf8> or
+L</uvchr_to_utf8_flags>.
+
+This function is a deprecated synonym for L</uvoffuni_to_utf8_flags>,
+which itself, while not deprecated, should be used only in isolated
+circumstances.  These functions were useful for code that wanted to handle
+both EBCDIC and ASCII platforms with Unicode properties, but starting in Perl
+v5.20, the distinctions between the platforms have mostly been made invisible
+to most code, so this function is quite unlikely to be what you want.
+
+=cut
+*/
+
+U8 *
+Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
+{
+    PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
+
+    return uvoffuni_to_utf8_flags(d, uv, flags);
+}
+
  /*
   * Local variables:
   * c-indentation-style: bsd