utf8.c: Combine 2 function calls into one

[perl5.git] / utf8.c
diff --git a/utf8.c b/utf8.c

index c9f2c9a..418f0d8 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -31,11 +31,13 @@
  #include "EXTERN.h"
  #define PERL_IN_UTF8_C
  #include "perl.h"
+#include "inline_invlist.c"
  
  #ifndef EBCDIC
  /* Separate prototypes needed because in ASCII systems these are
   * usually macros but they still are compiled as code, too. */
  PERL_CALLCONV UV       Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
+PERL_CALLCONV UV       Perl_valid_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen);
  PERL_CALLCONV U8*      Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
  #endif
  
@@ -57,14 +59,14 @@ within non-zero characters.
  /*
  =for apidoc is_ascii_string
  
-Returns true if the first C<len> bytes of the given string are the same whether
+Returns true if the first C<len> bytes of the string C<s> are the same whether
  or not the string is encoded in UTF-8 (or UTF-EBCDIC on EBCDIC machines).  That
  is, if they are invariant.  On ASCII-ish machines, only ASCII characters
  fit this definition, hence the function's name.
  
  If C<len> is 0, it will be calculated using C<strlen(s)>.  
  
-See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
+See also L</is_utf8_string>(), L</is_utf8_string_loclen>(), and L</is_utf8_string_loc>().
  
  =cut
  */
@@ -109,7 +111,8 @@ This is the recommended Unicode-aware way of saying
  
  This function will convert to UTF-8 (and not warn) even code points that aren't
  legal Unicode or are problematic, unless C<flags> contains one or more of the
-following flags.
+following flags:
+
  If C<uv> is a Unicode surrogate code point and UNICODE_WARN_SURROGATE is set,
  the function will raise a warning, provided UTF8 warnings are enabled.  If instead
  UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL.
@@ -136,7 +139,10 @@ Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  {
      PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
  
-    if (ckWARN_d(WARN_UTF8)) {
+    /* The first problematic code point is the first surrogate */
+    if (uv >= UNICODE_SURROGATE_FIRST
+        && ckWARN4_d(WARN_UTF8, WARN_SURROGATE, WARN_NON_UNICODE, WARN_NONCHAR))
+    {
         if (UNICODE_IS_SURROGATE(uv)) {
             if (flags & UNICODE_WARN_SURROGATE) {
                 Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE),
@@ -257,9 +263,9 @@ Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  
  /*
  
-Tests if some arbitrary number of bytes begins in a valid UTF-8
+Tests if the first C<len> bytes of string C<s> form a valid UTF-8
  character.  Note that an INVARIANT (i.e. ASCII) character is a valid
-UTF-8 character.  The actual number of bytes in the UTF-8 character
+UTF-8 character.  The number of bytes in the UTF-8 character
  will be returned if it is valid, otherwise 0.
  
  This is the "slow" version as opposed to the "fast" version which is
@@ -273,84 +279,93 @@ the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
  five bytes or more.
  
  =cut */
-STATIC STRLEN
+PERL_STATIC_INLINE STRLEN
  S_is_utf8_char_slow(const U8 *s, const STRLEN len)
  {
-    U8 u = *s;
-    STRLEN slen;
-    UV uv, ouv;
+    dTHX;   /* The function called below requires thread context */
+
+    STRLEN actual_len;
  
      PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
  
-    if (UTF8_IS_INVARIANT(u))
-       return 1;
+    utf8n_to_uvuni(s, len, &actual_len, UTF8_CHECK_ONLY);
  
-    if (!UTF8_IS_START(u))
-       return 0;
+    return (actual_len == (STRLEN) -1) ? 0 : actual_len;
+}
  
-    if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
-       return 0;
+/*
+=for apidoc is_utf8_char_buf
  
-    slen = len - 1;
-    s++;
-#ifdef EBCDIC
-    u = NATIVE_TO_UTF(u);
-#endif
-    u &= UTF_START_MASK(len);
-    uv  = u;
-    ouv = uv;
-    while (slen--) {
-       if (!UTF8_IS_CONTINUATION(*s))
-           return 0;
-       uv = UTF8_ACCUMULATE(uv, *s);
-       if (uv < ouv)
-           return 0;
-       ouv = uv;
-       s++;
-    }
+Returns the number of bytes that comprise the first UTF-8 encoded character in
+buffer C<buf>.  C<buf_end> should point to one position beyond the end of the
+buffer.  0 is returned if C<buf> does not point to a complete, valid UTF-8
+encoded character.
+
+Note that an INVARIANT character (i.e. ASCII on non-EBCDIC
+machines) is a valid UTF-8 character.
+
+=cut */
+
+STRLEN
+Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end)
+{
+
+    STRLEN len;
+
+    PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF;
  
-    if ((STRLEN)UNISKIP(uv) < len)
+    if (buf_end <= buf) {
         return 0;
+    }
  
-    return len;
+    len = buf_end - buf;
+    if (len > UTF8SKIP(buf)) {
+       len = UTF8SKIP(buf);
+    }
+
+#ifdef IS_UTF8_CHAR
+    if (IS_UTF8_CHAR_FAST(len))
+        return IS_UTF8_CHAR(buf, len) ? len : 0;
+#endif /* #ifdef IS_UTF8_CHAR */
+    return is_utf8_char_slow(buf, len);
  }
  
  /*
  =for apidoc is_utf8_char
  
+DEPRECATED!
+
  Tests if some arbitrary number of bytes begins in a valid UTF-8
  character.  Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
  character is a valid UTF-8 character.  The actual number of bytes in the UTF-8
  character will be returned if it is valid, otherwise 0.
  
-WARNING: use only if you *know* that C<s> has at least either UTF8_MAXBYTES or
-UTF8SKIP(s) bytes.
+This function is deprecated due to the possibility that malformed input could
+cause reading beyond the end of the input buffer.  Use L</is_utf8_char_buf>
+instead.
  
  =cut */
+
  STRLEN
  Perl_is_utf8_char(const U8 *s)
  {
-    const STRLEN len = UTF8SKIP(s);
-
      PERL_ARGS_ASSERT_IS_UTF8_CHAR;
-#ifdef IS_UTF8_CHAR
-    if (IS_UTF8_CHAR_FAST(len))
-        return IS_UTF8_CHAR(s, len) ? len : 0;
-#endif /* #ifdef IS_UTF8_CHAR */
-    return is_utf8_char_slow(s, len);
+
+    /* Assumes we have enough space, which is why this is deprecated */
+    return is_utf8_char_buf(s, s + UTF8SKIP(s));
  }
  
  
  /*
  =for apidoc is_utf8_string
  
-Returns true if first C<len> bytes of the given string form a valid
+Returns true if the first C<len> bytes of string C<s> form a valid
  UTF-8 string, false otherwise.  If C<len> is 0, it will be calculated
  using C<strlen(s)> (which means if you use this option, that C<s> has to have a
  terminating NUL byte).  Note that all characters being ASCII constitute 'a
  valid UTF-8 string'.
  
-See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
+See also L</is_ascii_string>(), L</is_utf8_string_loclen>(), and L</is_utf8_string_loc>().
  
  =cut
  */
@@ -368,8 +383,6 @@ Perl_is_utf8_string(const U8 *s, STRLEN len)
          if (UTF8_IS_INVARIANT(*x)) {
             x++;
          }
-        else if (!UTF8_IS_START(*x))
-            return FALSE;
          else {
               /* ... and call is_utf8_char() only if really needed. */
              const STRLEN c = UTF8SKIP(x);
@@ -398,20 +411,20 @@ Implemented as a macro in utf8.h
  
  =for apidoc is_utf8_string_loc
  
-Like is_utf8_string() but stores the location of the failure (in the
-case of "utf8ness failure") or the location s+len (in the case of
+Like L</is_utf8_string> but stores the location of the failure (in the
+case of "utf8ness failure") or the location C<s>+C<len> (in the case of
  "utf8ness success") in the C<ep>.
  
-See also is_utf8_string_loclen() and is_utf8_string().
+See also L</is_utf8_string_loclen>() and L</is_utf8_string>().
  
  =for apidoc is_utf8_string_loclen
  
-Like is_utf8_string() but stores the location of the failure (in the
-case of "utf8ness failure") or the location s+len (in the case of
+Like L</is_utf8_string>() but stores the location of the failure (in the
+case of "utf8ness failure") or the location C<s>+C<len> (in the case of
  "utf8ness success") in the C<ep>, and the number of UTF-8
  encoded characters in the C<el>.
  
-See also is_utf8_string_loc() and is_utf8_string().
+See also L</is_utf8_string_loc>() and L</is_utf8_string>().
  
  =cut
  */
@@ -432,8 +445,6 @@ Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
          /* Inline the easy bits of is_utf8_char() here for speed... */
          if (UTF8_IS_INVARIANT(*x))
              next_char_ptr = x + 1;
-        else if (!UTF8_IS_START(*x))
-            goto out;
          else {
              /* ... and call is_utf8_char() only if really needed. */
              c = UTF8SKIP(x);
@@ -467,16 +478,16 @@ Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
  =for apidoc utf8n_to_uvuni
  
  Bottom level UTF-8 decode routine.
-Returns the code point value of the first character in the string C<s>
-which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding and no longer than
-C<curlen> bytes; C<retlen> will be set to the length, in bytes, of that
-character.
+Returns the code point value of the first character in the string C<s>,
+which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
+C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
+the length, in bytes, of that character.
  
  The value of C<flags> determines the behavior when C<s> does not point to a
  well-formed UTF-8 character.  If C<flags> is 0, when a malformation is found,
-C<retlen> is set to the expected length of the UTF-8 character in bytes, zero
-is returned, and if UTF-8 warnings haven't been lexically disabled, a warning
-is raised.
+zero is returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>) is the
+next possible position in C<s> that could begin a non-malformed character.
+Also, if UTF-8 warnings haven't been lexically disabled, a warning is raised.
  
  Various ALLOW flags can be set in C<flags> to allow (and not warn on)
  individual types of malformations, such as the sequence being overlong (that
@@ -484,16 +495,24 @@ is, when there is a shorter sequence that can express the same code point;
  overlong sequences are expressly forbidden in the UTF-8 standard due to
  potential security issues).  Another malformation example is the first byte of
  a character not being a legal first byte.  See F<utf8.h> for the list of such
-flags.  Of course, the value returned by this function under such conditions is
-not reliable.
+flags.  For allowed 0 length strings, this function returns 0; for allowed
+overlong sequences, the computed code point is returned; for all other allowed
+malformations, the Unicode REPLACEMENT CHARACTER is returned, as these have no
+determinable reasonable value.
  
  The UTF8_CHECK_ONLY flag overrides the behavior when a non-allowed (by other
  flags) malformation is found.  If this flag is set, the routine assumes that
  the caller will raise a warning, and this function will silently just set
-C<retlen> to C<-1> and return zero.
+C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
+
+Note that this API requires disambiguation between successful decoding a NUL
+character, and an error return (unless the UTF8_CHECK_ONLY flag is set), as
+in both cases, 0 is returned.  To disambiguate, upon a zero return, see if the
+first byte of C<s> is 0 as well.  If so, the input was a NUL; if not, the input
+had an error.
  
  Certain code points are considered problematic.  These are Unicode surrogates,
-Unicode non-characters, and code points above the Unicode maximum of 0x10FFF.
+Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
  By default these are considered regular code points, but certain situations
  warrant special handling for them.  If C<flags> contains
  UTF8_DISALLOW_ILLEGAL_INTERCHANGE, all three classes are treated as
@@ -511,20 +530,24 @@ UTF8_CHECK_ONLY is also specified.)
  
  Very large code points (above 0x7FFF_FFFF) are considered more problematic than
  the others that are above the Unicode legal maximum.  There are several
-reasons, one of which is that the original UTF-8 specification never went above
-this number (the current 0x10FFF limit was imposed later).  The UTF-8 encoding
-on ASCII platforms for these large code points begins with a byte containing
-0xFE or 0xFF.  The UTF8_DISALLOW_FE_FF flag will cause them to be treated as
-malformations, while allowing smaller above-Unicode code points.  (Of course
-UTF8_DISALLOW_SUPER will treat all above-Unicode code points, including these,
-as malformations.) Similarly, UTF8_WARN_FE_FF acts just like the other WARN
-flags, but applies just to these code points.
+reasons: they requre at least 32 bits to represent them on ASCII platforms, are
+not representable at all on EBCDIC platforms, and the original UTF-8
+specification never went above this number (the current 0x10FFFF limit was
+imposed later).  (The smaller ones, those that fit into 32 bits, are
+representable by a UV on ASCII platforms, but not by an IV, which means that
+the number of operations that can be performed on them is quite restricted.)
+The UTF-8 encoding on ASCII platforms for these large code points begins with a
+byte containing 0xFE or 0xFF.  The UTF8_DISALLOW_FE_FF flag will cause them to
+be treated as malformations, while allowing smaller above-Unicode code points.
+(Of course UTF8_DISALLOW_SUPER will treat all above-Unicode code points,
+including these, as malformations.) Similarly, UTF8_WARN_FE_FF acts just like
+the other WARN flags, but applies just to these code points.
  
  All other code points corresponding to Unicode characters, including private
  use and those yet to be assigned, are never considered malformed and never
  warn.
  
-Most code should use utf8_to_uvchr() rather than call this directly.
+Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
  
  =cut
  */
@@ -534,253 +557,492 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
  {
      dVAR;
      const U8 * const s0 = s;
-    UV uv = *s, ouv = 0;
-    STRLEN len = 1;
-    bool dowarn = ckWARN_d(WARN_UTF8);
-    const UV startbyte = *s;
-    STRLEN expectlen = 0;
-    U32 warning = 0;
+    U8 overflow_byte = '\0';   /* Save byte in case of overflow */
+    U8 * send;
+    UV uv = *s;
+    STRLEN expectlen;
      SV* sv = NULL;
+    UV outlier_ret = 0;        /* return value when input is in error or problematic
+                        */
+    UV pack_warn = 0;  /* Save result of packWARN() for later */
+    bool unexpected_non_continuation = FALSE;
+    bool overflowed = FALSE;
+    bool do_overlong_test = TRUE;   /* May have to skip this test */
  
-    PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
+    const char* const malformed_text = "Malformed UTF-8 character";
  
-/* This list is a superset of the UTF8_ALLOW_XXX. */
+    PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
  
-#define UTF8_WARN_EMPTY                                 1
-#define UTF8_WARN_CONTINUATION                  2
-#define UTF8_WARN_NON_CONTINUATION              3
-#define UTF8_WARN_SHORT                                 4
-#define UTF8_WARN_OVERFLOW                      5
-#define UTF8_WARN_LONG                          6
+    /* The order of malformation tests here is important.  We should consume as
+     * few bytes as possible in order to not skip any valid character.  This is
+     * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
+     * http://unicode.org/reports/tr36 for more discussion as to why.  For
+     * example, once we've done a UTF8SKIP, we can tell the expected number of
+     * bytes, and could fail right off the bat if the input parameters indicate
+     * that there are too few available.  But it could be that just that first
+     * byte is garbled, and the intended character occupies fewer bytes.  If we
+     * blindly assumed that the first byte is correct, and skipped based on
+     * that number, we could skip over a valid input character.  So instead, we
+     * always examine the sequence byte-by-byte.
+     *
+     * We also should not consume too few bytes, otherwise someone could inject
+     * things.  For example, an input could be deliberately designed to
+     * overflow, and if this code bailed out immediately upon discovering that,
+     * returning to the caller *retlen pointing to the very next byte (one
+     * which is actually part of of the overflowing sequence), that could look
+     * legitimate to the caller, which could discard the initial partial
+     * sequence and process the rest, inappropriately */
+
+    /* Zero length strings, if allowed, of necessity are zero */
+    if (UNLIKELY(curlen == 0)) {
+       if (retlen) {
+           *retlen = 0;
+       }
  
-    if (curlen == 0 &&
-       !(flags & UTF8_ALLOW_EMPTY)) {
-       warning = UTF8_WARN_EMPTY;
+       if (flags & UTF8_ALLOW_EMPTY) {
+           return 0;
+       }
+       if (! (flags & UTF8_CHECK_ONLY)) {
+           sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (empty string)", malformed_text));
+       }
         goto malformed;
      }
  
+    expectlen = UTF8SKIP(s);
+
+    /* A well-formed UTF-8 character, as the vast majority of calls to this
+     * function will be for, has this expected length.  For efficiency, set
+     * things up here to return it.  It will be overriden only in those rare
+     * cases where a malformation is found */
+    if (retlen) {
+       *retlen = expectlen;
+    }
+
+    /* An invariant is trivially well-formed */
      if (UTF8_IS_INVARIANT(uv)) {
-       if (retlen)
-           *retlen = 1;
         return (UV) (NATIVE_TO_UTF(*s));
      }
  
-    if (UTF8_IS_CONTINUATION(uv) &&
-       !(flags & UTF8_ALLOW_CONTINUATION)) {
-       warning = UTF8_WARN_CONTINUATION;
-       goto malformed;
-    }
+    /* A continuation character can't start a valid sequence */
+    if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
+       if (flags & UTF8_ALLOW_CONTINUATION) {
+           if (retlen) {
+               *retlen = 1;
+           }
+           return UNICODE_REPLACEMENT;
+       }
  
-    if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
-       !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
-       warning = UTF8_WARN_NON_CONTINUATION;
+       if (! (flags & UTF8_CHECK_ONLY)) {
+           sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected continuation byte 0x%02x, with no preceding start byte)", malformed_text, *s0));
+       }
+       curlen = 1;
         goto malformed;
      }
  
  #ifdef EBCDIC
      uv = NATIVE_TO_UTF(uv);
-#else
-    if (uv == 0xfe || uv == 0xff) {
-       if (flags & (UTF8_WARN_SUPER|UTF8_WARN_FE_FF)) {
-           sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point beginning with byte 0x%02"UVXf" is not Unicode, and not portable", uv));
-           flags &= ~UTF8_WARN_SUPER;  /* Only warn once on this problem */
-       }
-       if (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_FE_FF)) {
-           goto malformed;
-       }
-    }
  #endif
  
-    if      (!(uv & 0x20))     { len =  2; uv &= 0x1f; }
-    else if (!(uv & 0x10))     { len =  3; uv &= 0x0f; }
-    else if (!(uv & 0x08))     { len =  4; uv &= 0x07; }
-    else if (!(uv & 0x04))     { len =  5; uv &= 0x03; }
-#ifdef EBCDIC
-    else if (!(uv & 0x02))     { len =  6; uv &= 0x01; }
-    else                       { len =  7; uv &= 0x01; }
-#else
-    else if (!(uv & 0x02))     { len =  6; uv &= 0x01; }
-    else if (!(uv & 0x01))     { len =  7; uv = 0; }
-    else                       { len = 13; uv = 0; } /* whoa! */
+    /* Here is not a continuation byte, nor an invariant.  The only thing left
+     * is a start byte (possibly for an overlong) */
+
+    /* Remove the leading bits that indicate the number of bytes in the
+     * character's whole UTF-8 sequence, leaving just the bits that are part of
+     * the value */
+    uv &= UTF_START_MASK(expectlen);
+
+    /* Now, loop through the remaining bytes in the character's sequence,
+     * accumulating each into the working value as we go.  Be sure to not look
+     * past the end of the input string */
+    send =  (U8*) s0 + ((expectlen <= curlen) ? expectlen : curlen);
+
+    for (s = s0 + 1; s < send; s++) {
+       if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
+#ifndef EBCDIC /* Can't overflow in EBCDIC */
+           if (uv & UTF_ACCUMULATION_OVERFLOW_MASK) {
+
+               /* The original implementors viewed this malformation as more
+                * serious than the others (though I, khw, don't understand
+                * why, since other malformations also give very very wrong
+                * results), so there is no way to turn off checking for it.
+                * Set a flag, but keep going in the loop, so that we absorb
+                * the rest of the bytes that comprise the character. */
+               overflowed = TRUE;
+               overflow_byte = *s; /* Save for warning message's use */
+           }
  #endif
+           uv = UTF8_ACCUMULATE(uv, *s);
+       }
+       else {
+           /* Here, found a non-continuation before processing all expected
+            * bytes.  This byte begins a new character, so quit, even if
+            * allowing this malformation. */
+           unexpected_non_continuation = TRUE;
+           break;
+       }
+    } /* End of loop through the character's bytes */
+
+    /* Save how many bytes were actually in the character */
+    curlen = s - s0;
+
+    /* The loop above finds two types of malformations: non-continuation and/or
+     * overflow.  The non-continuation malformation is really a too-short
+     * malformation, as it means that the current character ended before it was
+     * expected to (being terminated prematurely by the beginning of the next
+     * character, whereas in the too-short malformation there just are too few
+     * bytes available to hold the character.  In both cases, the check below
+     * that we have found the expected number of bytes would fail if executed.)
+     * Thus the non-continuation malformation is really unnecessary, being a
+     * subset of the too-short malformation.  But there may be existing
+     * applications that are expecting the non-continuation type, so we retain
+     * it, and return it in preference to the too-short malformation.  (If this
+     * code were being written from scratch, the two types might be collapsed
+     * into one.)  I, khw, am also giving priority to returning the
+     * non-continuation and too-short malformations over overflow when multiple
+     * ones are present.  I don't know of any real reason to prefer one over
+     * the other, except that it seems to me that multiple-byte errors trumps
+     * errors from a single byte */
+    if (UNLIKELY(unexpected_non_continuation)) {
+       if (!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
+           if (! (flags & UTF8_CHECK_ONLY)) {
+               if (curlen == 1) {
+                   sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, immediately after start byte 0x%02x)", malformed_text, *s, *s0));
+               }
+               else {
+                   sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, %d bytes after start byte 0x%02x, expected %d bytes)", malformed_text, *s, (int) curlen, *s0, (int)expectlen));
+               }
+           }
+           goto malformed;
+       }
+       uv = UNICODE_REPLACEMENT;
  
-    if (retlen)
-       *retlen = len;
+       /* Skip testing for overlongs, as the REPLACEMENT may not be the same
+        * as what the original expectations were. */
+       do_overlong_test = FALSE;
+       if (retlen) {
+           *retlen = curlen;
+       }
+    }
+    else if (UNLIKELY(curlen < expectlen)) {
+       if (! (flags & UTF8_ALLOW_SHORT)) {
+           if (! (flags & UTF8_CHECK_ONLY)) {
+               sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, *s0));
+           }
+           goto malformed;
+       }
+       uv = UNICODE_REPLACEMENT;
+       do_overlong_test = FALSE;
+       if (retlen) {
+           *retlen = curlen;
+       }
+    }
  
-    expectlen = len;
+#ifndef EBCDIC /* EBCDIC allows FE, FF, can't overflow */
+    if ((*s0 & 0xFE) == 0xFE   /* matches both FE, FF */
+       && (flags & (UTF8_WARN_FE_FF|UTF8_DISALLOW_FE_FF)))
+    {
+       /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
+        * generation of the sv, since no warnings are raised under CHECK */
+       if ((flags & (UTF8_WARN_FE_FF|UTF8_CHECK_ONLY)) == UTF8_WARN_FE_FF
+           && ckWARN_d(WARN_UTF8))
+       {
+           /* This message is deliberately not of the same syntax as the other
+            * messages for malformations, for backwards compatibility in the
+            * unlikely event that code is relying on its precise earlier text
+            */
+           sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s Code point beginning with byte 0x%02X is not Unicode, and not portable", malformed_text, *s0));
+           pack_warn = packWARN(WARN_UTF8);
+       }
+       if (flags & UTF8_DISALLOW_FE_FF) {
+           goto malformed;
+       }
+    }
+    if (UNLIKELY(overflowed)) {
  
-    if ((curlen < expectlen) &&
-       !(flags & UTF8_ALLOW_SHORT)) {
-       warning = UTF8_WARN_SHORT;
+       /* If the first byte is FF, it will overflow a 32-bit word.  If the
+        * first byte is FE, it will overflow a signed 32-bit word.  The
+        * above preserves backward compatibility, since its message was used
+        * in earlier versions of this code in preference to overflow */
+       sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (overflow at byte 0x%02x, after start byte 0x%02x)", malformed_text, overflow_byte, *s0));
         goto malformed;
      }
+#endif
  
-    len--;
-    s++;
-    ouv = uv;  /* ouv is the value from the previous iteration */
-
-    while (len--) {
-       if (!UTF8_IS_CONTINUATION(*s) &&
-           !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
-           s--;
-           warning = UTF8_WARN_NON_CONTINUATION;
-           goto malformed;
-       }
-       else
-           uv = UTF8_ACCUMULATE(uv, *s);
-       if (!(uv > ouv)) {  /* If the value didn't grow from the previous
-                              iteration, something is horribly wrong */
-           /* These cannot be allowed. */
-           if (uv == ouv) {
-               if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
-                   warning = UTF8_WARN_LONG;
-                   goto malformed;
-               }
-           }
-           else { /* uv < ouv */
-               /* This cannot be allowed. */
-               warning = UTF8_WARN_OVERFLOW;
-               goto malformed;
-           }
+    if (do_overlong_test
+       && expectlen > (STRLEN)UNISKIP(uv)
+       && ! (flags & UTF8_ALLOW_LONG))
+    {
+       /* The overlong malformation has lower precedence than the others.
+        * Note that if this malformation is allowed, we return the actual
+        * value, instead of the replacement character.  This is because this
+        * value is actually well-defined. */
+       if (! (flags & UTF8_CHECK_ONLY)) {
+           sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), *s0));
         }
-       s++;
-       ouv = uv;
+       goto malformed;
      }
  
-    if ((expectlen > (STRLEN)UNISKIP(uv)) && !(flags & UTF8_ALLOW_LONG)) {
-       warning = UTF8_WARN_LONG;
-       goto malformed;
-    } else if (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE|UTF8_WARN_ILLEGAL_INTERCHANGE)) {
+    /* Here, the input is considered to be well-formed , but could be a
+     * problematic code point that is not allowed by the input parameters. */
+    if (uv >= UNICODE_SURROGATE_FIRST /* isn't problematic if < this */
+       && (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+                    |UTF8_WARN_ILLEGAL_INTERCHANGE)))
+    {
         if (UNICODE_IS_SURROGATE(uv)) {
-           if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE) {
+           if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE
+               && ckWARN2_d(WARN_UTF8, WARN_SURROGATE))
+           {
                 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv));
+               pack_warn = packWARN2(WARN_UTF8, WARN_SURROGATE);
             }
             if (flags & UTF8_DISALLOW_SURROGATE) {
                 goto disallowed;
             }
         }
-       else if (UNICODE_IS_NONCHAR(uv)) {
-           if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR ) {
-               sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv));
+       else if ((uv > PERL_UNICODE_MAX)) {
+           if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER
+               && ckWARN2_d(WARN_UTF8, WARN_NON_UNICODE))
+           {
+               sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv));
+               pack_warn = packWARN2(WARN_UTF8, WARN_NON_UNICODE);
             }
-           if (flags & UTF8_DISALLOW_NONCHAR) {
+           if (flags & UTF8_DISALLOW_SUPER) {
                 goto disallowed;
             }
         }
-       else if ((uv > PERL_UNICODE_MAX)) {
-           if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER) {
-               sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv));
+       else if (UNICODE_IS_NONCHAR(uv)) {
+           if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR
+               && ckWARN2_d(WARN_UTF8, WARN_NONCHAR))
+           {
+               sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv));
+               pack_warn = packWARN2(WARN_UTF8, WARN_NONCHAR);
             }
-           if (flags & UTF8_DISALLOW_SUPER) {
+           if (flags & UTF8_DISALLOW_NONCHAR) {
                 goto disallowed;
             }
         }
  
+       if (sv) {
+           outlier_ret = uv;
+           goto do_warn;
+       }
+
         /* Here, this is not considered a malformed character, so drop through
          * to return it */
      }
  
      return uv;
  
-disallowed: /* Is disallowed, but otherwise not malformed.  'sv' will have been
-              set if there is to be a warning. */
-    if (!sv) {
-       dowarn = 0;
-    }
+    /* There are three cases which get to beyond this point.  In all 3 cases:
+     * <sv>        if not null points to a string to print as a warning.
+     * <curlen>            is what <*retlen> should be set to if UTF8_CHECK_ONLY isn't
+     *             set.
+     * <outlier_ret> is what return value to use if UTF8_CHECK_ONLY isn't set.
+     *             This is done by initializing it to 0, and changing it only
+     *             for case 1).
+     * The 3 cases are:
+     * 1)   The input is valid but problematic, and to be warned about.  The
+     *     return value is the resultant code point; <*retlen> is set to
+     *     <curlen>, the number of bytes that comprise the code point.
+     *     <pack_warn> contains the result of packWARN() for the warning
+     *     types.  The entry point for this case is the label <do_warn>;
+     * 2)   The input is a valid code point but disallowed by the parameters to
+     *     this function.  The return value is 0.  If UTF8_CHECK_ONLY is set,
+     *     <*relen> is -1; otherwise it is <curlen>, the number of bytes that
+     *     comprise the code point.  <pack_warn> contains the result of
+     *     packWARN() for the warning types.  The entry point for this case is
+     *     the label <disallowed>.
+     * 3)   The input is malformed.  The return value is 0.  If UTF8_CHECK_ONLY
+     *     is set, <*relen> is -1; otherwise it is <curlen>, the number of
+     *     bytes that comprise the malformation.  All such malformations are
+     *     assumed to be warning type <utf8>.  The entry point for this case
+     *     is the label <malformed>.
+     */
  
  malformed:
  
+    if (sv && ckWARN_d(WARN_UTF8)) {
+       pack_warn = packWARN(WARN_UTF8);
+    }
+
+disallowed:
+
      if (flags & UTF8_CHECK_ONLY) {
         if (retlen)
             *retlen = ((STRLEN) -1);
         return 0;
      }
  
-    if (dowarn) {
-       if (! sv) {
-           sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);
-       }
+do_warn:
  
-       switch (warning) {
-           case 0: /* Intentionally empty. */ break;
-           case UTF8_WARN_EMPTY:
-               sv_catpvs(sv, "(empty string)");
-               break;
-           case UTF8_WARN_CONTINUATION:
-               Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
-               break;
-           case UTF8_WARN_NON_CONTINUATION:
-               if (s == s0)
-                   Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
-                               (UV)s[1], startbyte);
-               else {
-                   const int len = (int)(s-s0);
-                   Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
-                               (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
-               }
+    if (pack_warn) {   /* <pack_warn> was initialized to 0, and changed only
+                          if warnings are to be raised. */
+       const char * const string = SvPVX_const(sv);
  
-               break;
-           case UTF8_WARN_SHORT:
-               Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
-                               (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
-               expectlen = curlen;             /* distance for caller to skip */
-               break;
-           case UTF8_WARN_OVERFLOW:
-               Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
-                               ouv, *s, startbyte);
-               break;
-           case UTF8_WARN_LONG:
-               Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
-                               (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
-               break;
-           default:
-               sv_catpvs(sv, "(unknown reason)");
-               break;
-       }
-       
-       if (sv) {
-           const char * const s = SvPVX_const(sv);
+       if (PL_op)
+           Perl_warner(aTHX_ pack_warn, "%s in %s", string,  OP_DESC(PL_op));
+       else
+           Perl_warner(aTHX_ pack_warn, "%s", string);
+    }
  
-           if (PL_op)
-               Perl_warner(aTHX_ packWARN(WARN_UTF8),
-                           "%s in %s", s,  OP_DESC(PL_op));
-           else
-               Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
-       }
+    if (retlen) {
+       *retlen = curlen;
      }
  
-    if (retlen)
-       *retlen = expectlen ? expectlen : len;
+    return outlier_ret;
+}
+
+/*
+=for apidoc utf8_to_uvchr_buf
+
+Returns the native code point of the first character in the string C<s> which
+is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
+C<*retlen> will be set to the length, in bytes, of that character.
+
+If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
+enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
+NULL) to -1.  If those warnings are off, the computed value if well-defined (or
+the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
+is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
+next possible position in C<s> that could begin a non-malformed character.
+See L</utf8n_to_uvuni> for details on when the REPLACEMENT CHARACTER is returned.
+
+=cut
+*/
+
+
+UV
+Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
+{
+    PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
+
+    assert(s < send);
+
+    return utf8n_to_uvchr(s, send - s, retlen,
+                         ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+}
+
+/* Like L</utf8_to_uvchr_buf>(), but should only be called when it is known that
+ * there are no malformations in the input UTF-8 string C<s>.  surrogates,
+ * non-character code points, and non-Unicode code points are allowed.  A macro
+ * in utf8.h is used to normally avoid this function wrapper */
  
-    return 0;
+UV
+Perl_valid_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
+{
+    const UV uv = valid_utf8_to_uvuni(s, retlen);
+
+    PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
+
+    return UNI_TO_NATIVE(uv);
  }
  
  /*
  =for apidoc utf8_to_uvchr
  
+DEPRECATED!
+
  Returns the native code point of the first character in the string C<s>
  which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
  length, in bytes, of that character.
  
-If C<s> does not point to a well-formed UTF-8 character, zero is
-returned and retlen is set, if possible, to -1.
+Some, but not all, UTF-8 malformations are detected, and in fact, some
+malformed input could cause reading beyond the end of the input buffer, which
+is why this function is deprecated.  Use L</utf8_to_uvchr_buf> instead.
+
+If C<s> points to one of the detected malformations, and UTF8 warnings are
+enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
+NULL) to -1.  If those warnings are off, the computed value if well-defined (or
+the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
+is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
+next possible position in C<s> that could begin a non-malformed character.
+See L</utf8n_to_uvuni> for details on when the REPLACEMENT CHARACTER is returned.
  
  =cut
  */
  
-
  UV
  Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
  {
      PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
  
-    return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
-                         ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+    return utf8_to_uvchr_buf(s, s + UTF8_MAXBYTES, retlen);
+}
+
+/*
+=for apidoc utf8_to_uvuni_buf
+
+Returns the Unicode code point of the first character in the string C<s> which
+is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
+C<retlen> will be set to the length, in bytes, of that character.
+
+This function should only be used when the returned UV is considered
+an index into the Unicode semantic tables (e.g. swashes).
+
+If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
+enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
+NULL) to -1.  If those warnings are off, the computed value if well-defined (or
+the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
+is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
+next possible position in C<s> that could begin a non-malformed character.
+See L</utf8n_to_uvuni> for details on when the REPLACEMENT CHARACTER is returned.
+
+=cut
+*/
+
+UV
+Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
+{
+    PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
+
+    assert(send > s);
+
+    /* Call the low level routine asking for checks */
+    return Perl_utf8n_to_uvuni(aTHX_ s, send -s, retlen,
+                              ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+}
+
+/* Like L</utf8_to_uvuni_buf>(), but should only be called when it is known that
+ * there are no malformations in the input UTF-8 string C<s>.  Surrogates,
+ * non-character code points, and non-Unicode code points are allowed */
+
+UV
+Perl_valid_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
+{
+    UV expectlen = UTF8SKIP(s);
+    const U8* send = s + expectlen;
+    UV uv = NATIVE_TO_UTF(*s);
+
+    PERL_ARGS_ASSERT_VALID_UTF8_TO_UVUNI;
+
+    if (retlen) {
+       *retlen = expectlen;
+    }
+
+    /* An invariant is trivially returned */
+    if (expectlen == 1) {
+       return uv;
+    }
+
+    /* Remove the leading bits that indicate the number of bytes, leaving just
+     * the bits that are part of the value */
+    uv &= UTF_START_MASK(expectlen);
+
+    /* Now, loop through the remaining bytes, accumulating each into the
+     * working total as we go.  (I khw tried unrolling the loop for up to 4
+     * bytes, but there was no performance improvement) */
+    for (++s; s < send; s++) {
+       uv = UTF8_ACCUMULATE(uv, *s);
+    }
+
+    return uv;
  }
  
  /*
  =for apidoc utf8_to_uvuni
  
+DEPRECATED!
+
  Returns the Unicode code point of the first character in the string C<s>
  which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
  length, in bytes, of that character.
@@ -788,8 +1050,17 @@ length, in bytes, of that character.
  This function should only be used when the returned UV is considered
  an index into the Unicode semantic tables (e.g. swashes).
  
-If C<s> does not point to a well-formed UTF-8 character, zero is
-returned and retlen is set, if possible, to -1.
+Some, but not all, UTF-8 malformations are detected, and in fact, some
+malformed input could cause reading beyond the end of the input buffer, which
+is why this function is deprecated.  Use L</utf8_to_uvuni_buf> instead.
+
+If C<s> points to one of the detected malformations, and UTF8 warnings are
+enabled, zero is returned and C<*retlen> is set (if C<retlen> doesn't point to
+NULL) to -1.  If those warnings are off, the computed value if well-defined (or
+the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
+is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
+next possible position in C<s> that could begin a non-malformed character.
+See L</utf8n_to_uvuni> for details on when the REPLACEMENT CHARACTER is returned.
  
  =cut
  */
@@ -799,9 +1070,7 @@ Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
  {
      PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
  
-    /* Call the low level routine asking for checks */
-    return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
-                              ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+    return valid_utf8_to_uvuni(s, retlen);
  }
  
  /*
@@ -829,10 +1098,7 @@ Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
      if (e < s)
         goto warn_and_return;
      while (s < e) {
-       if (!UTF8_IS_INVARIANT(*s))
-           s += UTF8SKIP(s);
-       else
-           s++;
+        s += UTF8SKIP(s);
         len++;
      }
  
@@ -909,8 +1175,8 @@ Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
  /*
  =for apidoc bytes_cmp_utf8
  
-Compares the sequence of characters (stored as octets) in b, blen with the
-sequence of characters (stored as UTF-8) in u, ulen. Returns 0 if they are
+Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
+sequence of characters (stored as UTF-8) in C<u>, C<ulen>. Returns 0 if they are
  equal, -1 or -2 if the first string is less than the second string, +1 or +2
  if the first string is greater than the second string.
  
@@ -978,11 +1244,11 @@ Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
  =for apidoc utf8_to_bytes
  
  Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
-Unlike C<bytes_to_utf8>, this over-writes the original string, and
-updates len to contain the new length.
+Unlike L</bytes_to_utf8>, this over-writes the original string, and
+updates C<len> to contain the new length.
  Returns zero on failure, setting C<len> to -1.
  
-If you need a copy of the string, see C<bytes_from_utf8>.
+If you need a copy of the string, see L</bytes_from_utf8>.
  
  =cut
  */
@@ -1011,7 +1277,7 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
      d = s = save;
      while (s < send) {
          STRLEN ulen;
-        *d++ = (U8)utf8_to_uvchr(s, &ulen);
+        *d++ = (U8)utf8_to_uvchr_buf(s, send, &ulen);
          s += ulen;
      }
      *d = '\0';
@@ -1023,7 +1289,7 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
  =for apidoc bytes_from_utf8
  
  Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
-Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
+Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, returns a pointer to
  the newly-created string, and updates C<len> to contain the new
  length.  Returns the original string if no conversion occurs, C<len>
  is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
@@ -1088,7 +1354,7 @@ A NUL character will be written after the end of the string.
  
  If you want to convert to UTF-8 from encodings other than
  the native (Latin1 or EBCDIC),
-see sv_recode_to_utf8().
+see L</sv_recode_to_utf8>().
  
  =cut
  */
@@ -1233,6 +1499,14 @@ Perl_is_uni_idfirst(pTHX_ UV c)
  }
  
  bool
+Perl__is_uni_perl_idstart(pTHX_ UV c)
+{
+    U8 tmpbuf[UTF8_MAXBYTES+1];
+    uvchr_to_utf8(tmpbuf, c);
+    return _is_utf8_perl_idstart(tmpbuf);
+}
+
+bool
  Perl_is_uni_alpha(pTHX_ UV c)
  {
      U8 tmpbuf[UTF8_MAXBYTES+1];
@@ -1247,11 +1521,15 @@ Perl_is_uni_ascii(pTHX_ UV c)
  }
  
  bool
+Perl_is_uni_blank(pTHX_ UV c)
+{
+    return isBLANK_uni(c);
+}
+
+bool
  Perl_is_uni_space(pTHX_ UV c)
  {
-    U8 tmpbuf[UTF8_MAXBYTES+1];
-    uvchr_to_utf8(tmpbuf, c);
-    return is_utf8_space(tmpbuf);
+    return isSPACE_uni(c);
  }
  
  bool
@@ -1311,9 +1589,7 @@ Perl_is_uni_punct(pTHX_ UV c)
  bool
  Perl_is_uni_xdigit(pTHX_ UV c)
  {
-    U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
-    uvchr_to_utf8(tmpbuf, c);
-    return is_utf8_xdigit(tmpbuf);
+    return isXDIGIT_uni(c);
  }
  
  UV
@@ -1356,7 +1632,7 @@ Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_
                 return 'S';
             default:
                 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
-               /* NOTREACHED */
+               assert(0); /* NOTREACHED */
         }
      }
  
@@ -1389,9 +1665,9 @@ Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
  {
      dVAR;
  
-    /* Convert the Unicode character whose ordinal is c to its uppercase
-     * version and store that in UTF-8 in p and its length in bytes in lenp.
-     * Note that the p needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
+    /* Convert the Unicode character whose ordinal is <c> to its uppercase
+     * version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
+     * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
       * the changed version may be longer than the original character.
       *
       * The ordinal of the first character of the changed version is returned
@@ -1427,7 +1703,7 @@ S_to_lower_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp)
  {
      /* We have the latin1-range values compiled into the core, so just use
       * those, converting the result to utf8.  Since the result is always just
-     * one character, we allow p to be NULL */
+     * one character, we allow <p> to be NULL */
  
      U8 converted = toLOWER_LATIN1(c);
  
@@ -1463,7 +1739,7 @@ Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
  UV
  Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const bool flags)
  {
-    /* Corresponds to to_lower_latin1(), flags is TRUE if to use full case
+    /* Corresponds to to_lower_latin1(), <flags> is TRUE if to use full case
       * folding */
  
      UV converted;
@@ -1498,23 +1774,49 @@ Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const bool flags)
  }
  
  UV
-Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, const bool flags)
+Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, const U8 flags)
  {
  
-    /* Not currently externally documented, and subject to change, <flags> is
-     * TRUE iff full folding is to be used */
+    /* Not currently externally documented, and subject to change
+     *  <flags> bits meanings:
+     *     FOLD_FLAGS_FULL  iff full folding is to be used;
+     *     FOLD_FLAGS_LOCALE iff in locale
+     *     FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
+     */
  
      PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
  
      if (c < 256) {
-       return _to_fold_latin1((U8) c, p, lenp, flags);
+       UV result = _to_fold_latin1((U8) c, p, lenp,
+                              cBOOL(((flags & FOLD_FLAGS_FULL)
+                                  /* If ASCII-safe, don't allow full folding,
+                                   * as that could include SHARP S => ss;
+                                   * otherwise there is no crossing of
+                                   * ascii/non-ascii in the latin1 range */
+                                  && ! (flags & FOLD_FLAGS_NOMIX_ASCII))));
+       /* It is illegal for the fold to cross the 255/256 boundary under
+        * locale; in this case return the original */
+       return (result > 256 && flags & FOLD_FLAGS_LOCALE)
+              ? c
+              : result;
      }
  
-    uvchr_to_utf8(p, c);
-    return CALL_FOLD_CASE(p, p, lenp, flags);
+    /* If no special needs, just use the macro */
+    if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
+       uvchr_to_utf8(p, c);
+       return CALL_FOLD_CASE(p, p, lenp, flags & FOLD_FLAGS_FULL);
+    }
+    else {  /* Otherwise, _to_utf8_fold_flags has the intelligence to deal with
+              the special flags. */
+       U8 utf8_c[UTF8_MAXBYTES + 1];
+       uvchr_to_utf8(utf8_c, c);
+       return _to_utf8_fold_flags(utf8_c, p, lenp, flags, NULL);
+    }
  }
  
-/* for now these all assume no locale info available for Unicode > 255 */
+/* for now these all assume no locale info available for Unicode > 255; and
+ * the corresponding macros in handy.h (like isALNUM_LC_uvchr) should have been
+ * called instead, so that these don't get called for < 255 */
  
  bool
  Perl_is_uni_alnum_lc(pTHX_ UV c)
@@ -1541,6 +1843,12 @@ Perl_is_uni_ascii_lc(pTHX_ UV c)
  }
  
  bool
+Perl_is_uni_blank_lc(pTHX_ UV c)
+{
+    return is_uni_blank(c);    /* XXX no locale support yet */
+}
+
+bool
  Perl_is_uni_space_lc(pTHX_ UV c)
  {
      return is_uni_space(c);    /* XXX no locale support yet */
@@ -1624,18 +1932,36 @@ Perl_to_uni_lower_lc(pTHX_ U32 c)
      return (U32)to_uni_lower(c, tmpbuf, &len);
  }
  
-static bool
+PERL_STATIC_INLINE bool
  S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
                  const char *const swashname)
  {
+    /* returns a boolean giving whether or not the UTF8-encoded character that
+     * starts at <p> is in the swash indicated by <swashname>.  <swash>
+     * contains a pointer to where the swash indicated by <swashname>
+     * is to be stored; which this routine will do, so that future calls will
+     * look at <*swash> and only generate a swash if it is not null
+     *
+     * Note that it is assumed that the buffer length of <p> is enough to
+     * contain all the bytes that comprise the character.  Thus, <*p> should
+     * have been checked before this call for mal-formedness enough to assure
+     * that. */
+
      dVAR;
  
      PERL_ARGS_ASSERT_IS_UTF8_COMMON;
  
-    if (!is_utf8_char(p))
+    /* The API should have included a length for the UTF-8 character in <p>,
+     * but it doesn't.  We therefor assume that p has been validated at least
+     * as far as there being enough bytes available in it to accommodate the
+     * character without reading beyond the end, and pass that number on to the
+     * validating routine */
+    if (!is_utf8_char_buf(p, p + UTF8SKIP(p)))
         return FALSE;
-    if (!*swash)
-       *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
+    if (!*swash) {
+        U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
+        *swash = _core_swash_init("utf8", swashname, &PL_sv_undef, 1, 0, NULL, &flags);
+    }
      return swash_fetch(*swash, p, TRUE) != 0;
  }
  
@@ -1679,11 +2005,11 @@ Perl_is_utf8_xidfirst(pTHX_ const U8 *p) /* The naming is historical. */
  }
  
  bool
-Perl__is_utf8__perl_idstart(pTHX_ const U8 *p)
+Perl__is_utf8_perl_idstart(pTHX_ const U8 *p)
  {
      dVAR;
  
-    PERL_ARGS_ASSERT__IS_UTF8__PERL_IDSTART;
+    PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
  
      return is_utf8_common(p, &PL_utf8_perl_idstart, "_Perl_IDStart");
  }
@@ -1731,13 +2057,23 @@ Perl_is_utf8_ascii(pTHX_ const U8 *p)
  }
  
  bool
+Perl_is_utf8_blank(pTHX_ const U8 *p)
+{
+    dVAR;
+
+    PERL_ARGS_ASSERT_IS_UTF8_BLANK;
+
+    return isBLANK_utf8(p);
+}
+
+bool
  Perl_is_utf8_space(pTHX_ const U8 *p)
  {
      dVAR;
  
      PERL_ARGS_ASSERT_IS_UTF8_SPACE;
  
-    return is_utf8_common(p, &PL_utf8_space, "IsXPerlSpace");
+    return isSPACE_utf8(p);
  }
  
  bool
@@ -1813,15 +2149,7 @@ Perl_is_utf8_cntrl(pTHX_ const U8 *p)
  
      PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
  
-    if (isASCII(*p)) {
-       return isCNTRL_A(*p);
-    }
-
-    /* All controls are in Latin1 */
-    if (! UTF8_IS_DOWNGRADEABLE_START(*p)) {
-       return 0;
-    }
-    return isCNTRL_L1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)));
+    return isCNTRL_utf8(p);
  }
  
  bool
@@ -1861,7 +2189,7 @@ Perl_is_utf8_xdigit(pTHX_ const U8 *p)
  
      PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
  
-    return is_utf8_common(p, &PL_utf8_xdigit, "IsXDigit");
+    return is_XDIGIT_utf8(p);
  }
  
  bool
@@ -1875,13 +2203,13 @@ Perl_is_utf8_mark(pTHX_ const U8 *p)
  }
  
  bool
-Perl_is_utf8_X_begin(pTHX_ const U8 *p)
+Perl_is_utf8_X_regular_begin(pTHX_ const U8 *p)
  {
      dVAR;
  
-    PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN;
+    PERL_ARGS_ASSERT_IS_UTF8_X_REGULAR_BEGIN;
  
-    return is_utf8_common(p, &PL_utf8_X_begin, "_X_Begin");
+    return is_utf8_common(p, &PL_utf8_X_regular_begin, "_X_Regular_Begin");
  }
  
  bool
@@ -1894,107 +2222,28 @@ Perl_is_utf8_X_extend(pTHX_ const U8 *p)
      return is_utf8_common(p, &PL_utf8_X_extend, "_X_Extend");
  }
  
-bool
-Perl_is_utf8_X_prepend(pTHX_ const U8 *p)
-{
-    dVAR;
-
-    PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND;
-
-    return is_utf8_common(p, &PL_utf8_X_prepend, "GCB=Prepend");
-}
-
-bool
-Perl_is_utf8_X_non_hangul(pTHX_ const U8 *p)
-{
-    dVAR;
-
-    PERL_ARGS_ASSERT_IS_UTF8_X_NON_HANGUL;
-
-    return is_utf8_common(p, &PL_utf8_X_non_hangul, "HST=Not_Applicable");
-}
-
-bool
-Perl_is_utf8_X_L(pTHX_ const U8 *p)
-{
-    dVAR;
-
-    PERL_ARGS_ASSERT_IS_UTF8_X_L;
-
-    return is_utf8_common(p, &PL_utf8_X_L, "GCB=L");
-}
-
-bool
-Perl_is_utf8_X_LV(pTHX_ const U8 *p)
-{
-    dVAR;
-
-    PERL_ARGS_ASSERT_IS_UTF8_X_LV;
-
-    return is_utf8_common(p, &PL_utf8_X_LV, "GCB=LV");
-}
-
-bool
-Perl_is_utf8_X_LVT(pTHX_ const U8 *p)
-{
-    dVAR;
-
-    PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
-
-    return is_utf8_common(p, &PL_utf8_X_LVT, "GCB=LVT");
-}
-
-bool
-Perl_is_utf8_X_T(pTHX_ const U8 *p)
-{
-    dVAR;
-
-    PERL_ARGS_ASSERT_IS_UTF8_X_T;
-
-    return is_utf8_common(p, &PL_utf8_X_T, "GCB=T");
-}
-
-bool
-Perl_is_utf8_X_V(pTHX_ const U8 *p)
-{
-    dVAR;
-
-    PERL_ARGS_ASSERT_IS_UTF8_X_V;
-
-    return is_utf8_common(p, &PL_utf8_X_V, "GCB=V");
-}
-
-bool
-Perl_is_utf8_X_LV_LVT_V(pTHX_ const U8 *p)
-{
-    dVAR;
-
-    PERL_ARGS_ASSERT_IS_UTF8_X_LV_LVT_V;
-
-    return is_utf8_common(p, &PL_utf8_X_LV_LVT_V, "_X_LV_LVT_V");
-}
-
  /*
  =for apidoc to_utf8_case
  
-The "p" contains the pointer to the UTF-8 string encoding
-the character that is being converted.
+The C<p> contains the pointer to the UTF-8 string encoding
+the character that is being converted.  This routine assumes that the character
+at C<p> is well-formed.
  
-The "ustrp" is a pointer to the character buffer to put the
-conversion result to.  The "lenp" is a pointer to the length
+The C<ustrp> is a pointer to the character buffer to put the
+conversion result to.  The C<lenp> is a pointer to the length
  of the result.
  
-The "swashp" is a pointer to the swash to use.
+The C<swashp> is a pointer to the swash to use.
  
-Both the special and normal mappings are stored in lib/unicore/To/Foo.pl,
-and loaded by SWASHNEW, using lib/utf8_heavy.pl.  The special (usually,
+Both the special and normal mappings are stored in F<lib/unicore/To/Foo.pl>,
+and loaded by SWASHNEW, using F<lib/utf8_heavy.pl>.  The C<special> (usually,
  but not always, a multicharacter mapping), is tried first.
  
-The "special" is a string like "utf8::ToSpecLower", which means the
+The C<special> is a string like "utf8::ToSpecLower", which means the
  hash %utf8::ToSpecLower.  The access to the hash is through
  Perl_to_utf8_case().
  
-The "normal" is a string like "ToLower" which means the swash
+The C<normal> is a string like "ToLower" which means the swash
  %utf8::ToLower.
  
  =cut */
@@ -2006,7 +2255,7 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
      dVAR;
      U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
      STRLEN len = 0;
-    const UV uv0 = utf8_to_uvchr(p, NULL);
+    const UV uv0 = valid_utf8_to_uvchr(p, NULL);
      /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
       * are necessary in EBCDIC, they are redundant no-ops
       * in ASCII-ish platforms, and hopefully optimized away. */
@@ -2039,7 +2288,7 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
      uvuni_to_utf8(tmpbuf, uv1);
  
      if (!*swashp) /* load on-demand */
-         *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
+         *swashp = _core_swash_init("utf8", normal, &PL_sv_undef, 4, 0, NULL, NULL);
  
      if (special) {
           /* It might be "special" (sometimes, but not always,
@@ -2067,7 +2316,7 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
                         STRLEN tlen = 0;
                         
                         while (t < tend) {
-                            const UV c = utf8_to_uvchr(t, &tlen);
+                            const UV c = utf8_to_uvchr_buf(t, tend, &tlen);
                              if (tlen > 0) {
                                   d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
                                   t += tlen;
@@ -2092,7 +2341,7 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
      }
  
      if (!len && *swashp) {
-       const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
+       const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE /* => is utf8 */);
  
          if (uv2) {
               /* It was "normal" (a single character mapping). */
@@ -2101,14 +2350,25 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
          }
      }
  
-    if (!len) /* Neither: just copy.  In other words, there was no mapping
-                defined, which means that the code point maps to itself */
-        len = uvchr_to_utf8(ustrp, uv0) - ustrp;
+    if (len) {
+        if (lenp) {
+            *lenp = len;
+        }
+        return valid_utf8_to_uvchr(ustrp, 0);
+    }
+
+    /* Here, there was no mapping defined, which means that the code point maps
+     * to itself.  Return the inputs */
+    len = UTF8SKIP(p);
+    if (p != ustrp) {   /* Don't copy onto itself */
+        Copy(p, ustrp, len, U8);
+    }
  
      if (lenp)
          *lenp = len;
  
-    return len ? utf8_to_uvchr(ustrp, 0) : 0;
+    return uv0;
+
  }
  
  STATIC UV
@@ -2119,7 +2379,8 @@ S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* c
       * contains a character that crosses the 255/256 boundary, disallow the
       * change, and return the original code point.  See L<perlfunc/lc> for why;
       *
-     * p       points to the original string whose case was changed
+     * p       points to the original string whose case was changed; assumed
+     *          by this routine to be well-formed
       * result  the code point of the first character in the changed-case string
       * ustrp   points to the changed-case string (<result> represents its first char)
       * lenp    points to the length of <ustrp> */
@@ -2153,7 +2414,7 @@ S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* c
  bad_crossing:
  
      /* Failed, have to return the original */
-    original = utf8_to_uvchr(p, lenp);
+    original = valid_utf8_to_uvchr(p, lenp);
      Copy(p, ustrp, *lenp, char);
      return original;
  }
@@ -2161,14 +2422,16 @@ bad_crossing:
  /*
  =for apidoc to_utf8_upper
  
-Convert the UTF-8 encoded character at p to its uppercase version and
-store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
+Convert the UTF-8 encoded character at C<p> to its uppercase version and
+store that in UTF-8 in C<ustrp> and its length in bytes in C<lenp>.  Note
  that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
  the uppercase version may be longer than the original character.
  
  The first character of the uppercased version is returned
  (but note, as explained above, that there may be more.)
  
+The character at C<p> is assumed by this routine to be well-formed.
+
  =cut */
  
  /* Not currently externally documented, and subject to change:
@@ -2231,14 +2494,16 @@ Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool
  /*
  =for apidoc to_utf8_title
  
-Convert the UTF-8 encoded character at p to its titlecase version and
-store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
-that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
+Convert the UTF-8 encoded character at C<p> to its titlecase version and
+store that in UTF-8 in C<ustrp> and its length in bytes in C<lenp>.  Note
+that the C<ustrp> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
  titlecase version may be longer than the original character.
  
  The first character of the titlecased version is returned
  (but note, as explained above, that there may be more.)
  
+The character at C<p> is assumed by this routine to be well-formed.
+
  =cut */
  
  /* Not currently externally documented, and subject to change:
@@ -2303,14 +2568,16 @@ Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool
  /*
  =for apidoc to_utf8_lower
  
-Convert the UTF-8 encoded character at p to its lowercase version and
-store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
-that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
+Convert the UTF-8 encoded character at C<p> to its lowercase version and
+store that in UTF-8 in ustrp and its length in bytes in C<lenp>.  Note
+that the C<ustrp> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
  lowercase version may be longer than the original character.
  
  The first character of the lowercased version is returned
  (but note, as explained above, that there may be more.)
  
+The character at C<p> is assumed by this routine to be well-formed.
+
  =cut */
  
  /* Not currently externally documented, and subject to change:
@@ -2374,15 +2641,17 @@ Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool
  /*
  =for apidoc to_utf8_fold
  
-Convert the UTF-8 encoded character at p to its foldcase version and
-store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
-that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
+Convert the UTF-8 encoded character at C<p> to its foldcase version and
+store that in UTF-8 in C<ustrp> and its length in bytes in C<lenp>.  Note
+that the C<ustrp> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
  foldcase version may be longer than the original character (up to
  three characters).
  
  The first character of the foldcased version is returned
  (but note, as explained above, that there may be more.)
  
+The character at C<p> is assumed by this routine to be well-formed.
+
  =cut */
  
  /* Not currently externally documented, and subject to change,
@@ -2392,6 +2661,8 @@ The first character of the foldcased version is returned
   *                           POSIX, lowercase is used instead
   *      bit FOLD_FLAGS_FULL   is set iff full case folds are to be used;
   *                           otherwise simple folds
+ *      bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
+ *                           prohibited
   * <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
   *              were used in the calculation; otherwise unchanged. */
  
@@ -2404,6 +2675,11 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b
  
      PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
  
+    /* These are mutually exclusive */
+    assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
+
+    assert(p != ustrp); /* Otherwise overwrites */
+
      if (UTF8_IS_INVARIANT(*p)) {
         if (flags & FOLD_FLAGS_LOCALE) {
             result = toLOWER_LC(*p);
@@ -2419,17 +2695,49 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b
         }
         else {
             return _to_fold_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)),
-                                  ustrp, lenp, cBOOL(flags & FOLD_FLAGS_FULL));
+                                  ustrp, lenp,
+                                  cBOOL((flags & FOLD_FLAGS_FULL
+                                      /* If ASCII safe, don't allow full
+                                       * folding, as that could include SHARP
+                                       * S => ss; otherwise there is no
+                                       * crossing of ascii/non-ascii in the
+                                       * latin1 range */
+                                      && ! (flags & FOLD_FLAGS_NOMIX_ASCII))));
         }
      }
      else {  /* utf8, ord above 255 */
-       result = CALL_FOLD_CASE(p, ustrp, lenp, flags);
+       result = CALL_FOLD_CASE(p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
  
         if ((flags & FOLD_FLAGS_LOCALE)) {
-           result = check_locale_boundary_crossing(p, result, ustrp, lenp);
+           return check_locale_boundary_crossing(p, result, ustrp, lenp);
+       }
+       else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
+           return result;
         }
+       else {
+           /* This is called when changing the case of a utf8-encoded
+            * character above the Latin1 range, and the result should not
+            * contain an ASCII character. */
+
+           UV original;    /* To store the first code point of <p> */
+
+           /* Look at every character in the result; if any cross the
+           * boundary, the whole thing is disallowed */
+           U8* s = ustrp;
+           U8* e = ustrp + *lenp;
+           while (s < e) {
+               if (isASCII(*s)) {
+                   /* Crossed, have to return the original */
+                   original = valid_utf8_to_uvchr(p, lenp);
+                   Copy(p, ustrp, *lenp, char);
+                   return original;
+               }
+               s += UTF8SKIP(s);
+           }
  
-       return result;
+           /* Here, no characters crossed, result is ok as-is */
+           return result;
+       }
      }
  
      /* Here, used locale rules.  Convert back to utf8 */
@@ -2464,14 +2772,18 @@ Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits
       * public interface, and returning a copy prevents others from doing
       * mischief on the original */
  
-    return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none, FALSE, NULL, FALSE));
+    return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none, NULL, NULL));
  }
  
  SV*
-Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none, bool return_if_undef, SV* invlist, bool passed_in_invlist_has_user_defined_property)
+Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none, SV* invlist, U8* const flags_p)
  {
      /* Initialize and return a swash, creating it if necessary.  It does this
-     * by calling utf8_heavy.pl in the general case.
+     * by calling utf8_heavy.pl in the general case.  The returned value may be
+     * the swash's inversion list instead if the input parameters allow it.
+     * Which is returned should be immaterial to callers, as the only
+     * operations permitted on a swash, swash_fetch(), _get_swash_invlist(),
+     * and swash_to_invlist() handle both these transparently.
       *
       * This interface should only be used by functions that won't destroy or
       * adversely change the swash, as doing so affects all other uses of the
@@ -2487,11 +2799,19 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m
       * minbits is the number of bits required to represent each data element.
       *     It is '1' for binary properties.
       * none I (khw) do not understand this one, but it is used only in tr///.
-     * return_if_undef is TRUE if the routine shouldn't croak if it can't find
-     *     the requested property
       * invlist is an inversion list to initialize the swash with (or NULL)
-     * has_user_defined_property is TRUE if <invlist> has some component that
-     *      came from a user-defined property
+     * flags_p if non-NULL is the address of various input and output flag bits
+     *      to the routine, as follows:  ('I' means is input to the routine;
+     *      'O' means output from the routine.  Only flags marked O are
+     *      meaningful on return.)
+     *  _CORE_SWASH_INIT_USER_DEFINED_PROPERTY indicates if the swash
+     *      came from a user-defined property.  (I O)
+     *  _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking
+     *      when the swash cannot be located, to simply return NULL. (I)
+     *  _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a
+     *      return of an inversion list instead of a swash hash if this routine
+     *      thinks that would result in faster execution of swash_fetch() later
+     *      on. (I)
       *
       * Thus there are three possible inputs to find the swash: <name>,
       * <listsv>, and <invlist>.  At least one must be specified.  The result
@@ -2502,6 +2822,12 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m
  
      dVAR;
      SV* retval = &PL_sv_undef;
+    HV* swash_hv = NULL;
+    const int invlist_swash_boundary =
+        (flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST)
+        ? 512    /* Based on some benchmarking, but not extensive, see commit
+                    message */
+        : -1;   /* Never return just an inversion list */
  
      assert(listsv != &PL_sv_undef || strNE(name, "") || invlist);
      assert(! invlist || minbits == 1);
@@ -2522,25 +2848,38 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m
         ENTER;
         SAVEHINTS();
         save_re_context();
+       /* We might get here via a subroutine signature which uses a utf8
+        * parameter name, at which point PL_subname will have been set
+        * but not yet used. */
+       save_item(PL_subname);
         if (PL_parser && PL_parser->error_count)
             SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
         method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
         if (!method) {  /* demand load utf8 */
             ENTER;
-           errsv_save = newSVsv(ERRSV);
+           if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
+           GvSV(PL_errgv) = NULL;
             /* It is assumed that callers of this routine are not passing in
              * any user derived data.  */
             /* Need to do this after save_re_context() as it will set
              * PL_tainted to 1 while saving $1 etc (see the code after getrx:
              * in Perl_magic_get).  Even line to create errsv_save can turn on
              * PL_tainted.  */
-           SAVEBOOL(PL_tainted);
-           PL_tainted = 0;
+#ifndef NO_TAINT_SUPPORT
+           SAVEBOOL(TAINT_get);
+           TAINT_NOT;
+#endif
             Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
                              NULL);
-           if (!SvTRUE(ERRSV))
-               sv_setsv(ERRSV, errsv_save);
-           SvREFCNT_dec(errsv_save);
+           {
+               /* Not ERRSV, as there is no need to vivify a scalar we are
+                  about to discard. */
+               SV * const errsv = GvSV(PL_errgv);
+               if (!SvTRUE(errsv)) {
+                   GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
+                   SvREFCNT_dec(errsv);
+               }
+           }
             LEAVE;
         }
         SPAGAIN;
@@ -2552,18 +2891,25 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m
         mPUSHi(minbits);
         mPUSHi(none);
         PUTBACK;
-       errsv_save = newSVsv(ERRSV);
+       if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
+       GvSV(PL_errgv) = NULL;
         /* If we already have a pointer to the method, no need to use
          * call_method() to repeat the lookup.  */
-       if (method ? call_sv(MUTABLE_SV(method), G_SCALAR)
+       if (method
+            ? call_sv(MUTABLE_SV(method), G_SCALAR)
             : call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR | G_METHOD))
         {
             retval = *PL_stack_sp--;
             SvREFCNT_inc(retval);
         }
-       if (!SvTRUE(ERRSV))
-           sv_setsv(ERRSV, errsv_save);
-       SvREFCNT_dec(errsv_save);
+       {
+           /* Not ERRSV.  See above. */
+           SV * const errsv = GvSV(PL_errgv);
+           if (!SvTRUE(errsv)) {
+               GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
+               SvREFCNT_dec(errsv);
+           }
+       }
         LEAVE;
         POPSTACK;
         if (IN_PERL_COMPILETIME) {
@@ -2573,7 +2919,7 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m
             if (SvPOK(retval))
  
                 /* If caller wants to handle missing properties, let them */
-               if (return_if_undef) {
+               if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) {
                     return NULL;
                 }
                 Perl_croak(aTHX_
@@ -2583,25 +2929,45 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m
         }
      } /* End of calling the module to find the swash */
  
+    /* If this operation fetched a swash, and we will need it later, get it */
+    if (retval != &PL_sv_undef
+        && (minbits == 1 || (flags_p
+                            && ! (*flags_p
+                                  & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY))))
+    {
+        swash_hv = MUTABLE_HV(SvRV(retval));
+
+        /* If we don't already know that there is a user-defined component to
+         * this swash, and the user has indicated they wish to know if there is
+         * one (by passing <flags_p>), find out */
+        if (flags_p && ! (*flags_p & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)) {
+            SV** user_defined = hv_fetchs(swash_hv, "USER_DEFINED", FALSE);
+            if (user_defined && SvUV(*user_defined)) {
+                *flags_p |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
+            }
+        }
+    }
+
      /* Make sure there is an inversion list for binary properties */
      if (minbits == 1) {
         SV** swash_invlistsvp = NULL;
         SV* swash_invlist = NULL;
         bool invlist_in_swash_is_valid = FALSE;
-       HV* swash_hv = NULL;
+       bool swash_invlist_unclaimed = FALSE; /* whether swash_invlist has
+                                           an unclaimed reference count */
  
          /* If this operation fetched a swash, get its already existing
-         * inversion list or create one for it */
-       if (retval != &PL_sv_undef) {
-           swash_hv = MUTABLE_HV(SvRV(retval));
+         * inversion list, or create one for it */
  
-           swash_invlistsvp = hv_fetchs(swash_hv, "INVLIST", FALSE);
+        if (swash_hv) {
+           swash_invlistsvp = hv_fetchs(swash_hv, "V", FALSE);
             if (swash_invlistsvp) {
                 swash_invlist = *swash_invlistsvp;
                 invlist_in_swash_is_valid = TRUE;
             }
             else {
                 swash_invlist = _swash_to_invlist(retval);
+               swash_invlist_unclaimed = TRUE;
             }
         }
  
@@ -2620,28 +2986,38 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m
             }
             else {
  
-               /* Here, there is no swash already.  Set up a minimal one */
-               swash_hv = newHV();
-               retval = newRV_inc(MUTABLE_SV(swash_hv));
+                /* Here, there is no swash already.  Set up a minimal one, if
+                 * we are going to return a swash */
+                if ((int) _invlist_len(invlist) > invlist_swash_boundary) {
+                    swash_hv = newHV();
+                    retval = newRV_noinc(MUTABLE_SV(swash_hv));
+                }
                 swash_invlist = invlist;
             }
-
-            if (passed_in_invlist_has_user_defined_property) {
-                if (! hv_stores(swash_hv, "USER_DEFINED", newSVuv(1))) {
-                    Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
-                }
-            }
         }
  
          /* Here, we have computed the union of all the passed-in data.  It may
           * be that there was an inversion list in the swash which didn't get
           * touched; otherwise save the one computed one */
-       if (! invlist_in_swash_is_valid) {
-           if (! hv_stores(MUTABLE_HV(SvRV(retval)), "INVLIST", swash_invlist))
+       if (! invlist_in_swash_is_valid
+            && (int) _invlist_len(swash_invlist) > invlist_swash_boundary)
+        {
+           if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist))
              {
                 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
             }
+           /* We just stole a reference count. */
+           if (swash_invlist_unclaimed) swash_invlist_unclaimed = FALSE;
+           else SvREFCNT_inc_simple_void_NN(swash_invlist);
         }
+
+        /* Use the inversion list stand-alone if small enough */
+        if ((int) _invlist_len(swash_invlist) <= invlist_swash_boundary) {
+           SvREFCNT_dec(retval);
+           if (!swash_invlist_unclaimed)
+               SvREFCNT_inc_simple_void_NN(swash_invlist);
+            retval = newRV_noinc(swash_invlist);
+        }
      }
  
      return retval;
@@ -2707,6 +3083,15 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
  
      PERL_ARGS_ASSERT_SWASH_FETCH;
  
+    /* If it really isn't a hash, it isn't really swash; must be an inversion
+     * list */
+    if (SvTYPE(hv) != SVt_PVHV) {
+        return _invlist_contains_cp((SV*)hv,
+                                    (do_utf8)
+                                     ? valid_utf8_to_uvchr(ptr, NULL)
+                                     : c);
+    }
+
      /* Convert to utf8 if not already */
      if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
         tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
@@ -2790,24 +3175,6 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
             Copy(ptr, PL_last_swash_key, klen, U8);
      }
  
-    if (UTF8_IS_SUPER(ptr) && ckWARN_d(WARN_NON_UNICODE)) {
-       SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
-
-       /* This outputs warnings for binary properties only, assuming that
-        * to_utf8_case() will output any for non-binary.  Also, surrogates
-        * aren't checked for, as that would warn on things like /\p{Gc=Cs}/ */
-
-       if (! bitssvp || SvUV(*bitssvp) == 1) {
-           /* User-defined properties can silently match above-Unicode */
-           SV** const user_defined_svp = hv_fetchs(hv, "USER_DEFINED", FALSE);
-           if (! user_defined_svp || ! SvUV(*user_defined_svp)) {
-               const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0, 0);
-               Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
-                   "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", code_point);
-           }
-       }
-    }
-
      switch ((int)((slen << 3) / needents)) {
      case 1:
         bit = 1 << (off & 7);
@@ -2895,19 +3262,18 @@ S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
                     char *after_strtol = (char *) lend;
                     *val = Strtol((char *)l, &after_strtol, 10);
                     l = (U8 *) after_strtol;
-                   *val += *min;
                 }
                 else { /* Other tables are in hex, and are the correct result
                           without tweaking */
-               flags = PERL_SCAN_SILENT_ILLDIGIT
-                     | PERL_SCAN_DISALLOW_PREFIX
-                     | PERL_SCAN_SILENT_NON_PORTABLE;
-               numlen = lend - l;
-               *val = grok_hex((char *)l, &numlen, &flags, NULL);
-               if (numlen)
-                   l += numlen;
-               else
-                   *val = 0;
+                   flags = PERL_SCAN_SILENT_ILLDIGIT
+                       | PERL_SCAN_DISALLOW_PREFIX
+                       | PERL_SCAN_SILENT_NON_PORTABLE;
+                   numlen = lend - l;
+                   *val = grok_hex((char *)l, &numlen, &flags, NULL);
+                   if (numlen)
+                       l += numlen;
+                   else
+                       *val = 0;
                 }
             }
             else {
@@ -2958,7 +3324,7 @@ S_swatch_get(pTHX_ SV* swash, UV start, UV span)
      U8 *l, *lend, *x, *xend, *s, *send;
      STRLEN lcur, xcur, scur;
      HV *const hv = MUTABLE_HV(SvRV(swash));
-    SV** const invlistsvp = hv_fetchs(hv, "INVLIST", FALSE);
+    SV** const invlistsvp = hv_fetchs(hv, "V", FALSE);
  
      SV** listsvp = NULL; /* The string containing the main body of the table */
      SV** extssvp = NULL;
@@ -3263,7 +3629,7 @@ HV*
  Perl__swash_inversion_hash(pTHX_ SV* const swash)
  {
  
-   /* Subject to change or removal.  For use only in one place in regcomp.c.
+   /* Subject to change or removal.  For use only in regcomp.c and regexec.c
      * Can't be used on a property that is subject to user override, as it
      * relies on the value of SPECIALS in the swash which would be set by
      * utf8_heavy.pl to the hash in the non-overriden file, and hence is not set
@@ -3304,7 +3670,8 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash)
      STRLEN lcur;
      HV *const hv = MUTABLE_HV(SvRV(swash));
  
-    /* The string containing the main body of the table */
+    /* The string containing the main body of the table.  This will have its
+     * assertion fail if the swash has been converted to its inversion list */
      SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
  
      SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
@@ -3352,7 +3719,7 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash)
                            "unexpectedly is not a string, flags=%lu",
                            (unsigned long)SvFLAGS(sv_to));
             }
-           /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %"UVXf", First char of to is %"UVXf"\n", utf8_to_uvchr((U8*) char_from, 0), utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/
+           /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %"UVXf", First char of to is %"UVXf"\n", valid_utf8_to_uvchr((U8*) char_from, 0), valid_utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/
  
             /* Each key in the inverse list is a mapped-to value, and the key's
              * hash value is a list of the strings (each in utf8) that map to
@@ -3419,7 +3786,7 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash)
                         Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
                     }
  
-                   /* For debugging: UV u = utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/
+                   /* For debugging: UV u = valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/
                     for (j = 0; j <= av_len(from_list); j++) {
                         entryp = av_fetch(from_list, j, FALSE);
                         if (entryp == NULL) {
@@ -3427,9 +3794,11 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash)
                         }
  
                         /* When i==j this adds itself to the list */
-                       av_push(i_list, newSVuv(utf8_to_uvchr(
-                                               (U8*) SvPVX(*entryp), 0)));
-                       /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Adding %"UVXf" to list for %"UVXf"\n", utf8_to_uvchr((U8*) SvPVX(*entryp), 0), u));*/
+                       av_push(i_list, newSVuv(utf8_to_uvchr_buf(
+                                       (U8*) SvPVX(*entryp),
+                                       (U8*) SvPVX(*entryp) + SvCUR(*entryp),
+                                       0)));
+                       /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0), u));*/
                     }
                 }
             }
@@ -3502,14 +3871,14 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash)
             /* Make sure there is a mapping to itself on the list */
             if (! found_key) {
                 av_push(list, newSVuv(val));
-               /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Adding %"UVXf" to list for %"UVXf"\n", val, val));*/
+               /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, val, val));*/
             }
  
  
             /* Simply add the value to the list */
             if (! found_inverse) {
                 av_push(list, newSVuv(inverse));
-               /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Adding %"UVXf" to list for %"UVXf"\n", inverse, val));*/
+               /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, inverse, val));*/
             }
  
             /* swatch_get() increments the value of val for each element in the
@@ -3541,17 +3910,15 @@ Perl__swash_to_invlist(pTHX_ SV* const swash)
      HV *const hv = MUTABLE_HV(SvRV(swash));
      UV elements = 0;    /* Number of elements in the inversion list */
      U8 empty[] = "";
+    SV** listsvp;
+    SV** typesvp;
+    SV** bitssvp;
+    SV** extssvp;
+    SV** invert_it_svp;
  
-    /* The string containing the main body of the table */
-    SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
-    SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
-    SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
-    SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
-    SV** const invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
-
-    const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
-    const STRLEN bits  = SvUV(*bitssvp);
-    const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
+    U8* typestr;
+    STRLEN bits;
+    STRLEN octets; /* if bits == 1, then octets == 0 */
      U8 *x, *xend;
      STRLEN xcur;
  
@@ -3559,6 +3926,22 @@ Perl__swash_to_invlist(pTHX_ SV* const swash)
  
      PERL_ARGS_ASSERT__SWASH_TO_INVLIST;
  
+    /* If not a hash, it must be the swash's inversion list instead */
+    if (SvTYPE(hv) != SVt_PVHV) {
+        return (SV*) hv;
+    }
+
+    /* The string containing the main body of the table */
+    listsvp = hv_fetchs(hv, "LIST", FALSE);
+    typesvp = hv_fetchs(hv, "TYPE", FALSE);
+    bitssvp = hv_fetchs(hv, "BITS", FALSE);
+    extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
+    invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
+
+    typestr = (U8*)SvPV_nolen(*typesvp);
+    bits  = SvUV(*bitssvp);
+    octets = bits >> 3; /* if bits == 1, then octets == 0 */
+
      /* read $swash->{LIST} */
      if (SvPOK(*listsvp)) {
         l = (U8*)SvPV(*listsvp, lcur);
@@ -3604,7 +3987,7 @@ Perl__swash_to_invlist(pTHX_ SV* const swash)
             break;
         }
  
-       _append_range_to_invlist(invlist, start, end);
+       invlist = _add_range_to_invlist(invlist, start, end);
      }
  
      /* Invert if the data says it should be */
@@ -3672,8 +4055,7 @@ Perl__swash_to_invlist(pTHX_ SV* const swash)
             _invlist_union(invlist, other, &invlist);
             break;
         case '!':
-           _invlist_invert(other);
-           _invlist_union(invlist, other, &invlist);
+            _invlist_union_maybe_complement_2nd(invlist, other, TRUE, &invlist);
             break;
         case '-':
             _invlist_subtract(invlist, other, &invlist);
@@ -3690,6 +4072,31 @@ Perl__swash_to_invlist(pTHX_ SV* const swash)
      return invlist;
  }
  
+SV*
+Perl__get_swash_invlist(pTHX_ SV* const swash)
+{
+    SV** ptr;
+
+    PERL_ARGS_ASSERT__GET_SWASH_INVLIST;
+
+    if (! SvROK(swash)) {
+        return NULL;
+    }
+
+    /* If it really isn't a hash, it isn't really swash; must be an inversion
+     * list */
+    if (SvTYPE(SvRV(swash)) != SVt_PVHV) {
+        return SvRV(swash);
+    }
+
+    ptr = hv_fetchs(MUTABLE_HV(SvRV(swash)), "V", FALSE);
+    if (! ptr) {
+        return NULL;
+    }
+
+    return *ptr;
+}
+
  /*
  =for apidoc uvchr_to_utf8
  
@@ -3734,7 +4141,7 @@ C<s>
  which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
  length, in bytes, of that character.
  
-length and flags are the same as utf8n_to_uvuni().
+C<length> and C<flags> are the same as L</utf8n_to_uvuni>().
  
  =cut
  */
@@ -3753,7 +4160,7 @@ U32 flags)
  }
  
  bool
-Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len)
+Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
  {
      /* May change: warns if surrogates, non-character code points, or
       * non-Unicode code points are in s which has length len bytes.  Returns
@@ -3775,7 +4182,7 @@ Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len)
             STRLEN char_len;
             if (UTF8_IS_SUPER(s)) {
                 if (ckWARN_d(WARN_NON_UNICODE)) {
-                   UV uv = utf8_to_uvchr(s, &char_len);
+                   UV uv = utf8_to_uvchr_buf(s, e, &char_len);
                     Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
                         "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
                     ok = FALSE;
@@ -3783,7 +4190,7 @@ Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len)
             }
             else if (UTF8_IS_SURROGATE(s)) {
                 if (ckWARN_d(WARN_SURROGATE)) {
-                   UV uv = utf8_to_uvchr(s, &char_len);
+                   UV uv = utf8_to_uvchr_buf(s, e, &char_len);
                     Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
                         "Unicode surrogate U+%04"UVXf" is illegal in UTF-8", uv);
                     ok = FALSE;
@@ -3793,7 +4200,7 @@ Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len)
                 ((UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s))
                  && (ckWARN_d(WARN_NONCHAR)))
             {
-               UV uv = utf8_to_uvchr(s, &char_len);
+               UV uv = utf8_to_uvchr_buf(s, e, &char_len);
                 Perl_warner(aTHX_ packWARN(WARN_NONCHAR),
                     "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv);
                 ok = FALSE;
@@ -3808,18 +4215,18 @@ Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len)
  /*
  =for apidoc pv_uni_display
  
-Build to the scalar dsv a displayable version of the string spv,
-length len, the displayable version being at most pvlim bytes long
+Build to the scalar C<dsv> a displayable version of the string C<spv>,
+length C<len>, the displayable version being at most C<pvlim> bytes long
  (if longer, the rest is truncated and "..." will be appended).
  
-The flags argument can have UNI_DISPLAY_ISPRINT set to display
+The C<flags> argument can have UNI_DISPLAY_ISPRINT set to display
  isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
  to display the \\[nrfta\\] as the backslashed versions (like '\n')
  (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
  UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
  UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
  
-The pointer to the PV of the dsv is returned.
+The pointer to the PV of the C<dsv> is returned.
  
  =cut */
  char *
@@ -3843,7 +4250,7 @@ Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV f
               truncated++;
               break;
          }
-        u = utf8_to_uvchr((U8*)s, 0);
+        u = utf8_to_uvchr_buf((U8*)s, (U8*)e, 0);
          if (u < 256) {
              const unsigned char c = (unsigned char)u & 0xFF;
              if (flags & UNI_DISPLAY_BACKSLASH) {
@@ -3887,13 +4294,13 @@ Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV f
  /*
  =for apidoc sv_uni_display
  
-Build to the scalar dsv a displayable version of the scalar sv,
-the displayable version being at most pvlim bytes long
+Build to the scalar C<dsv> a displayable version of the scalar C<sv>,
+the displayable version being at most C<pvlim> bytes long
  (if longer, the rest is truncated and "..." will be appended).
  
-The flags argument is as in pv_uni_display().
+The C<flags> argument is as in L</pv_uni_display>().
  
-The pointer to the PV of the dsv is returned.
+The pointer to the PV of the C<dsv> is returned.
  
  =cut
  */
@@ -3909,40 +4316,44 @@ Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
  /*
  =for apidoc foldEQ_utf8
  
-Returns true if the leading portions of the strings s1 and s2 (either or both
+Returns true if the leading portions of the strings C<s1> and C<s2> (either or both
  of which may be in UTF-8) are the same case-insensitively; false otherwise.
  How far into the strings to compare is determined by other input parameters.
  
-If u1 is true, the string s1 is assumed to be in UTF-8-encoded Unicode;
-otherwise it is assumed to be in native 8-bit encoding.  Correspondingly for u2
-with respect to s2.
+If C<u1> is true, the string C<s1> is assumed to be in UTF-8-encoded Unicode;
+otherwise it is assumed to be in native 8-bit encoding.  Correspondingly for C<u2>
+with respect to C<s2>.
  
-If the byte length l1 is non-zero, it says how far into s1 to check for fold
-equality.  In other words, s1+l1 will be used as a goal to reach.  The
+If the byte length C<l1> is non-zero, it says how far into C<s1> to check for fold
+equality.  In other words, C<s1>+C<l1> will be used as a goal to reach.  The
  scan will not be considered to be a match unless the goal is reached, and
-scanning won't continue past that goal.  Correspondingly for l2 with respect to
-s2.
-
-If pe1 is non-NULL and the pointer it points to is not NULL, that pointer is
-considered an end pointer beyond which scanning of s1 will not continue under
-any circumstances.  This means that if both l1 and pe1 are specified, and pe1
-is less than s1+l1, the match will never be successful because it can never
+scanning won't continue past that goal.  Correspondingly for C<l2> with respect to
+C<s2>.
+
+If C<pe1> is non-NULL and the pointer it points to is not NULL, that pointer is
+considered an end pointer to the position 1 byte past the maximum point
+in C<s1> beyond which scanning will not continue under any circumstances.
+(This routine assumes that UTF-8 encoded input strings are not malformed;
+malformed input can cause it to read past C<pe1>).
+This means that if both C<l1> and C<pe1> are specified, and C<pe1>
+is less than C<s1>+C<l1>, the match will never be successful because it can
+never
  get as far as its goal (and in fact is asserted against).  Correspondingly for
-pe2 with respect to s2.
+C<pe2> with respect to C<s2>.
  
-At least one of s1 and s2 must have a goal (at least one of l1 and l2 must be
-non-zero), and if both do, both have to be
+At least one of C<s1> and C<s2> must have a goal (at least one of C<l1> and
+C<l2> must be non-zero), and if both do, both have to be
  reached for a successful match.   Also, if the fold of a character is multiple
  characters, all of them must be matched (see tr21 reference below for
  'folding').
  
-Upon a successful match, if pe1 is non-NULL,
-it will be set to point to the beginning of the I<next> character of s1 beyond
-what was matched.  Correspondingly for pe2 and s2.
+Upon a successful match, if C<pe1> is non-NULL,
+it will be set to point to the beginning of the I<next> character of C<s1>
+beyond what was matched.  Correspondingly for C<pe2> and C<s2>.
  
  For case-insensitiveness, the "casefolding" of Unicode is used
  instead of upper/lowercasing both the characters, see
-http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
+L<http://www.unicode.org/unicode/reports/tr21/> (Case Mappings).
  
  =cut */
  
@@ -3960,17 +4371,17 @@ http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
   *  FOLDEQ_S2_ALREADY_FOLDED   Similarly.
   */
  I32
-Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2, U32 flags)
+Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const char *s2, char **pe2, UV l2, bool u2, U32 flags)
  {
      dVAR;
-    register const U8 *p1  = (const U8*)s1; /* Point to current char */
-    register const U8 *p2  = (const U8*)s2;
-    register const U8 *g1 = NULL;       /* goal for s1 */
-    register const U8 *g2 = NULL;
-    register const U8 *e1 = NULL;       /* Don't scan s1 past this */
-    register U8 *f1 = NULL;             /* Point to current folded */
-    register const U8 *e2 = NULL;
-    register U8 *f2 = NULL;
+    const U8 *p1  = (const U8*)s1; /* Point to current char */
+    const U8 *p2  = (const U8*)s2;
+    const U8 *g1 = NULL;       /* goal for s1 */
+    const U8 *g2 = NULL;
+    const U8 *e1 = NULL;       /* Don't scan s1 past this */
+    U8 *f1 = NULL;             /* Point to current folded */
+    const U8 *e2 = NULL;
+    U8 *f2 = NULL;
      STRLEN n1 = 0, n2 = 0;              /* Number of bytes in current char */
      U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
      U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
@@ -4038,7 +4449,6 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, register UV l1, bool u1
                 f1 = (U8 *) p1;
                 n1 = UTF8SKIP(f1);
             }
-
             else {
                 /* If in locale matching, we use two sets of rules, depending
                  * on if the code point is above or below 255.  Here, we test
@@ -4188,8 +4598,8 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, register UV l1, bool u1
   * Local variables:
   * c-indentation-style: bsd
   * c-basic-offset: 4
- * indent-tabs-mode: t
+ * indent-tabs-mode: nil
   * End:
   *
- * ex: set ts=8 sts=4 sw=4 noet:
+ * ex: set ts=8 sts=4 sw=4 et:
   */