utf8.c: Add comments

[perl5.git] / utf8.c
diff --git a/utf8.c b/utf8.c

index a68af53..c8bdc7a 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -13,12 +13,12 @@
   *  heard of that we don't want to see any closer; and that's the one place
   *  we're trying to get to!  And that's just where we can't get, nohow.'
   *
   *  heard of that we don't want to see any closer; and that's the one place
   *  we're trying to get to!  And that's just where we can't get, nohow.'
   *
- *     [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
+ *     [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
   *
   * 'Well do I understand your speech,' he answered in the same language;
   * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
   *  as is the custom in the West, if you wish to be answered?'
   *
   * 'Well do I understand your speech,' he answered in the same language;
   * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
   *  as is the custom in the West, if you wish to be answered?'
- *                           --Gandalf, addressing Théoden's door wardens
+ *                           --Gandalf, addressing Théoden's door wardens
   *
   *     [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
   *
   *
   *     [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
   *
@@ -33,7 +33,7 @@
  #include "perl.h"
  
  #ifndef EBCDIC
  #include "perl.h"
  
  #ifndef EBCDIC
-/* Separate prototypes needed because in ASCII systems these
+/* Separate prototypes needed because in ASCII systems these are
   * usually macros but they still are compiled as code, too. */
  PERL_CALLCONV UV       Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
  PERL_CALLCONV U8*      Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
   * usually macros but they still are compiled as code, too. */
  PERL_CALLCONV UV       Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
  PERL_CALLCONV U8*      Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
@@ -42,7 +42,7 @@ PERL_CALLCONV U8*     Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
  static const char unees[] =
      "Malformed UTF-8 character (unexpected end of string)";
  
  static const char unees[] =
      "Malformed UTF-8 character (unexpected end of string)";
  
-/* 
+/*
  =head1 Unicode Support
  
  This file contains various utility functions for manipulating UTF8-encoded
  =head1 Unicode Support
  
  This file contains various utility functions for manipulating UTF8-encoded
@@ -57,8 +57,12 @@ within non-zero characters.
  /*
  =for apidoc is_ascii_string
  
  /*
  =for apidoc is_ascii_string
  
-Returns true if first C<len> bytes of the given string are ASCII (i.e. none
-of them even raise the question of UTF-8-ness).
+Returns true if the first C<len> bytes of the given string are the same whether
+or not the string is encoded in UTF-8 (or UTF-EBCDIC on EBCDIC machines).  That
+is, if they are invariant.  On ASCII-ish machines, only ASCII characters
+fit this definition, hence the function's name.
+
+If C<len> is 0, it will be calculated using C<strlen(s)>.  
  
  See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
  
  
  See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
  
@@ -84,8 +88,8 @@ Perl_is_ascii_string(const U8 *s, STRLEN len)
  /*
  =for apidoc uvuni_to_utf8_flags
  
  /*
  =for apidoc uvuni_to_utf8_flags
  
-Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
-of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
+Adds the UTF-8 representation of the code point C<uv> to the end
+of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
  bytes available. The return value is the pointer to the byte after the
  end of the new character. In other words,
  
  bytes available. The return value is the pointer to the byte after the
  end of the new character. In other words,
  
@@ -99,10 +103,31 @@ or, in most cases,
  
      d = uvuni_to_utf8_flags(d, uv, 0);
  
  
      d = uvuni_to_utf8_flags(d, uv, 0);
  
-is the recommended Unicode-aware way of saying
+This is the recommended Unicode-aware way of saying
  
      *(d++) = uv;
  
  
      *(d++) = uv;
  
+This function will convert to UTF-8 (and not warn) even code points that aren't
+legal Unicode or are problematic, unless C<flags> contains one or more of the
+following flags.
+If C<uv> is a Unicode surrogate code point and UNICODE_WARN_SURROGATE is set,
+the function will raise a warning, provided UTF8 warnings are enabled.  If instead
+UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL.
+If both flags are set, the function will both warn and return NULL.
+
+The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags correspondingly
+affect how the function handles a Unicode non-character.  And, likewise for the
+UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags, and code points that are
+above the Unicode maximum of 0x10FFFF.  Code points above 0x7FFF_FFFF (which are
+even less portable) can be warned about and/or disallowed, even if other
+above-Unicode code points are accepted, by using the UNICODE_WARN_FE_FF and
+UNICODE_DISALLOW_FE_FF flags.
+
+And finally, the flag UNICODE_WARN_ILLEGAL_INTERCHANGE selects all four of the
+above WARN flags; and UNICODE_DISALLOW_ILLEGAL_INTERCHANGE selects all four
+DISALLOW flags.
+
+
  =cut
  */
  
  =cut
  */
  
@@ -111,23 +136,39 @@ Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  {
      PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
  
  {
      PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
  
-    if (ckWARN(WARN_UTF8)) {
-        if (UNICODE_IS_SURROGATE(uv) &&
-            !(flags & UNICODE_ALLOW_SURROGATE))
-             Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
-        else if (
-                 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
-                   !(flags & UNICODE_ALLOW_FDD0))
-                  ||
-                  ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
-                   !(flags & UNICODE_ALLOW_FFFF))) &&
-                 /* UNICODE_ALLOW_SUPER includes
-                  * FFFEs and FFFFs beyond 0x10FFFF. */
-                 ((uv <= PERL_UNICODE_MAX) ||
-                  !(flags & UNICODE_ALLOW_SUPER))
-                 )
-             Perl_warner(aTHX_ packWARN(WARN_UTF8),
-                        "Unicode character 0x%04"UVxf" is illegal", uv);
+    if (ckWARN_d(WARN_UTF8)) {
+       if (UNICODE_IS_SURROGATE(uv)) {
+           if (flags & UNICODE_WARN_SURROGATE) {
+               Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE),
+                                           "UTF-16 surrogate U+%04"UVXf, uv);
+           }
+           if (flags & UNICODE_DISALLOW_SURROGATE) {
+               return NULL;
+           }
+       }
+       else if (UNICODE_IS_SUPER(uv)) {
+           if (flags & UNICODE_WARN_SUPER
+               || (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_WARN_FE_FF)))
+           {
+               Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
+                         "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
+           }
+           if (flags & UNICODE_DISALLOW_SUPER
+               || (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_DISALLOW_FE_FF)))
+           {
+               return NULL;
+           }
+       }
+       else if (UNICODE_IS_NONCHAR(uv)) {
+           if (flags & UNICODE_WARN_NONCHAR) {
+               Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR),
+                "Unicode non-character U+%04"UVXf" is illegal for open interchange",
+                uv);
+           }
+           if (flags & UNICODE_DISALLOW_NONCHAR) {
+               return NULL;
+           }
+       }
      }
      if (UNI_IS_INVARIANT(uv)) {
         *d++ = (U8)UTF_TO_NATIVE(uv);
      }
      if (UNI_IS_INVARIANT(uv)) {
         *d++ = (U8)UTF_TO_NATIVE(uv);
@@ -262,7 +303,7 @@ S_is_utf8_char_slow(const U8 *s, const STRLEN len)
         if (!UTF8_IS_CONTINUATION(*s))
             return 0;
         uv = UTF8_ACCUMULATE(uv, *s);
         if (!UTF8_IS_CONTINUATION(*s))
             return 0;
         uv = UTF8_ACCUMULATE(uv, *s);
-       if (uv < ouv) 
+       if (uv < ouv)
             return 0;
         ouv = uv;
         s++;
             return 0;
         ouv = uv;
         s++;
@@ -301,9 +342,10 @@ Perl_is_utf8_char(const U8 *s)
  =for apidoc is_utf8_string
  
  Returns true if first C<len> bytes of the given string form a valid
  =for apidoc is_utf8_string
  
  Returns true if first C<len> bytes of the given string form a valid
-UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
-not mean 'a string that contains code points above 0x7F encoded in UTF-8'
-because a valid ASCII string is a valid UTF-8 string.
+UTF-8 string, false otherwise.  If C<len> is 0, it will be calculated
+using C<strlen(s)>.  Note that 'a valid UTF-8 string' does not mean 'a
+string that contains code points above 0x7F encoded in UTF-8' because a
+valid ASCII string is a valid UTF-8 string.
  
  See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
  
  
  See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
  
@@ -423,20 +465,62 @@ Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
  =for apidoc utf8n_to_uvuni
  
  Bottom level UTF-8 decode routine.
  =for apidoc utf8n_to_uvuni
  
  Bottom level UTF-8 decode routine.
-Returns the Unicode code point value of the first character in the string C<s>
-which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
-C<retlen> will be set to the length, in bytes, of that character.
-
-If C<s> does not point to a well-formed UTF-8 character, the behaviour
-is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
-it is assumed that the caller will raise a warning, and this function
-will silently just set C<retlen> to C<-1> and return zero.  If the
-C<flags> does not contain UTF8_CHECK_ONLY, warnings about
-malformations will be given, C<retlen> will be set to the expected
-length of the UTF-8 character in bytes, and zero will be returned.
-
-The C<flags> can also contain various flags to allow deviations from
-the strict UTF-8 encoding (see F<utf8.h>).
+Returns the code point value of the first character in the string C<s>
+which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding and no longer than
+C<curlen> bytes; C<retlen> will be set to the length, in bytes, of that
+character.
+
+The value of C<flags> determines the behavior when C<s> does not point to a
+well-formed UTF-8 character.  If C<flags> is 0, when a malformation is found,
+C<retlen> is set to the expected length of the UTF-8 character in bytes, zero
+is returned, and if UTF-8 warnings haven't been lexically disabled, a warning
+is raised.
+
+Various ALLOW flags can be set in C<flags> to allow (and not warn on)
+individual types of malformations, such as the sequence being overlong (that
+is, when there is a shorter sequence that can express the same code point;
+overlong sequences are expressly forbidden in the UTF-8 standard due to
+potential security issues).  Another malformation example is the first byte of
+a character not being a legal first byte.  See F<utf8.h> for the list of such
+flags.  Of course, the value returned by this function under such conditions is
+not reliable.
+
+The UTF8_CHECK_ONLY flag overrides the behavior when a non-allowed (by other
+flags) malformation is found.  If this flag is set, the routine assumes that
+the caller will raise a warning, and this function will silently just set
+C<retlen> to C<-1> and return zero.
+
+Certain code points are considered problematic.  These are Unicode surrogates,
+Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
+By default these are considered regular code points, but certain situations
+warrant special handling for them.  If C<flags> contains
+UTF8_DISALLOW_ILLEGAL_INTERCHANGE, all three classes are treated as
+malformations and handled as such.  The flags UTF8_DISALLOW_SURROGATE,
+UTF8_DISALLOW_NONCHAR, and UTF8_DISALLOW_SUPER (meaning above the legal Unicode
+maximum) can be set to disallow these categories individually.
+
+The flags UTF8_WARN_ILLEGAL_INTERCHANGE, UTF8_WARN_SURROGATE,
+UTF8_WARN_NONCHAR, and UTF8_WARN_SUPER will cause warning messages to be raised
+for their respective categories, but otherwise the code points are considered
+valid (not malformations).  To get a category to both be treated as a
+malformation and raise a warning, specify both the WARN and DISALLOW flags.
+(But note that warnings are not raised if lexically disabled nor if
+UTF8_CHECK_ONLY is also specified.)
+
+Very large code points (above 0x7FFF_FFFF) are considered more problematic than
+the others that are above the Unicode legal maximum.  There are several
+reasons, one of which is that the original UTF-8 specification never went above
+this number (the current 0x10FFFF limit was imposed later).  The UTF-8 encoding
+on ASCII platforms for these large code points begins with a byte containing
+0xFE or 0xFF.  The UTF8_DISALLOW_FE_FF flag will cause them to be treated as
+malformations, while allowing smaller above-Unicode code points.  (Of course
+UTF8_DISALLOW_SUPER will treat all above-Unicode code points, including these,
+as malformations.) Similarly, UTF8_WARN_FE_FF acts just like the other WARN
+flags, but applies just to these code points.
+
+All other code points corresponding to Unicode characters, including private
+use and those yet to be assigned, are never considered malformed and never
+warn.
  
  Most code should use utf8_to_uvchr() rather than call this directly.
  
  
  Most code should use utf8_to_uvchr() rather than call this directly.
  
@@ -450,10 +534,11 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
      const U8 * const s0 = s;
      UV uv = *s, ouv = 0;
      STRLEN len = 1;
      const U8 * const s0 = s;
      UV uv = *s, ouv = 0;
      STRLEN len = 1;
-    const bool dowarn = ckWARN_d(WARN_UTF8);
+    bool dowarn = ckWARN_d(WARN_UTF8);
      const UV startbyte = *s;
      STRLEN expectlen = 0;
      U32 warning = 0;
      const UV startbyte = *s;
      STRLEN expectlen = 0;
      U32 warning = 0;
+    SV* sv = NULL;
  
      PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
  
  
      PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
  
@@ -462,12 +547,9 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
  #define UTF8_WARN_EMPTY                                 1
  #define UTF8_WARN_CONTINUATION                  2
  #define UTF8_WARN_NON_CONTINUATION              3
  #define UTF8_WARN_EMPTY                                 1
  #define UTF8_WARN_CONTINUATION                  2
  #define UTF8_WARN_NON_CONTINUATION              3
-#define UTF8_WARN_FE_FF                                 4
-#define UTF8_WARN_SHORT                                 5
-#define UTF8_WARN_OVERFLOW                      6
-#define UTF8_WARN_SURROGATE                     7
-#define UTF8_WARN_LONG                          8
-#define UTF8_WARN_FFFF                          9 /* Also FFFE. */
+#define UTF8_WARN_SHORT                                 4
+#define UTF8_WARN_OVERFLOW                      5
+#define UTF8_WARN_LONG                          6
  
      if (curlen == 0 &&
         !(flags & UTF8_ALLOW_EMPTY)) {
  
      if (curlen == 0 &&
         !(flags & UTF8_ALLOW_EMPTY)) {
@@ -496,10 +578,14 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
  #ifdef EBCDIC
      uv = NATIVE_TO_UTF(uv);
  #else
  #ifdef EBCDIC
      uv = NATIVE_TO_UTF(uv);
  #else
-    if ((uv == 0xfe || uv == 0xff) &&
-       !(flags & UTF8_ALLOW_FE_FF)) {
-       warning = UTF8_WARN_FE_FF;
-       goto malformed;
+    if (uv == 0xfe || uv == 0xff) {
+       if (flags & (UTF8_WARN_SUPER|UTF8_WARN_FE_FF)) {
+           sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point beginning with byte 0x%02"UVXf" is not Unicode, and not portable", uv));
+           flags &= ~UTF8_WARN_SUPER;  /* Only warn once on this problem */
+       }
+       if (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_FE_FF)) {
+           goto malformed;
+       }
      }
  #endif
  
      }
  #endif
  
@@ -529,7 +615,7 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
  
      len--;
      s++;
  
      len--;
      s++;
-    ouv = uv;
+    ouv = uv;  /* ouv is the value from the previous iteration */
  
      while (len--) {
         if (!UTF8_IS_CONTINUATION(*s) &&
  
      while (len--) {
         if (!UTF8_IS_CONTINUATION(*s) &&
@@ -540,7 +626,8 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
         }
         else
             uv = UTF8_ACCUMULATE(uv, *s);
         }
         else
             uv = UTF8_ACCUMULATE(uv, *s);
-       if (!(uv > ouv)) {
+       if (!(uv > ouv)) {  /* If the value didn't grow from the previous
+                              iteration, something is horribly wrong */
             /* These cannot be allowed. */
             if (uv == ouv) {
                 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
             /* These cannot be allowed. */
             if (uv == ouv) {
                 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
@@ -558,22 +645,47 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
         ouv = uv;
      }
  
         ouv = uv;
      }
  
-    if (UNICODE_IS_SURROGATE(uv) &&
-       !(flags & UTF8_ALLOW_SURROGATE)) {
-       warning = UTF8_WARN_SURROGATE;
-       goto malformed;
-    } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
-              !(flags & UTF8_ALLOW_LONG)) {
+    if ((expectlen > (STRLEN)UNISKIP(uv)) && !(flags & UTF8_ALLOW_LONG)) {
         warning = UTF8_WARN_LONG;
         goto malformed;
         warning = UTF8_WARN_LONG;
         goto malformed;
-    } else if (UNICODE_IS_ILLEGAL(uv) &&
-              !(flags & UTF8_ALLOW_FFFF)) {
-       warning = UTF8_WARN_FFFF;
-       goto malformed;
+    } else if (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE|UTF8_WARN_ILLEGAL_INTERCHANGE)) {
+       if (UNICODE_IS_SURROGATE(uv)) {
+           if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE) {
+               sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv));
+           }
+           if (flags & UTF8_DISALLOW_SURROGATE) {
+               goto disallowed;
+           }
+       }
+       else if (UNICODE_IS_NONCHAR(uv)) {
+           if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR ) {
+               sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv));
+           }
+           if (flags & UTF8_DISALLOW_NONCHAR) {
+               goto disallowed;
+           }
+       }
+       else if ((uv > PERL_UNICODE_MAX)) {
+           if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER) {
+               sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv));
+           }
+           if (flags & UTF8_DISALLOW_SUPER) {
+               goto disallowed;
+           }
+       }
+
+       /* Here, this is not considered a malformed character, so drop through
+        * to return it */
      }
  
      return uv;
  
      }
  
      return uv;
  
+disallowed: /* Is disallowed, but otherwise not malformed.  'sv' will have been
+              set if there is to be a warning. */
+    if (!sv) {
+       dowarn = 0;
+    }
+
  malformed:
  
      if (flags & UTF8_CHECK_ONLY) {
  malformed:
  
      if (flags & UTF8_CHECK_ONLY) {
@@ -583,55 +695,48 @@ malformed:
      }
  
      if (dowarn) {
      }
  
      if (dowarn) {
-       SV* const sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);
+       if (! sv) {
+           sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);
+       }
  
         switch (warning) {
  
         switch (warning) {
-       case 0: /* Intentionally empty. */ break;
-       case UTF8_WARN_EMPTY:
-           sv_catpvs(sv, "(empty string)");
-           break;
-       case UTF8_WARN_CONTINUATION:
-           Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
-           break;
-       case UTF8_WARN_NON_CONTINUATION:
-           if (s == s0)
-               Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
-                           (UV)s[1], startbyte);
-           else {
-               const int len = (int)(s-s0);
-               Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
-                           (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
-           }
+           case 0: /* Intentionally empty. */ break;
+           case UTF8_WARN_EMPTY:
+               sv_catpvs(sv, "(empty string)");
+               break;
+           case UTF8_WARN_CONTINUATION:
+               Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
+               break;
+           case UTF8_WARN_NON_CONTINUATION:
+               if (s == s0)
+                   Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
+                               (UV)s[1], startbyte);
+               else {
+                   const int len = (int)(s-s0);
+                   Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
+                               (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
+               }
  
  
-           break;
-       case UTF8_WARN_FE_FF:
-           Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
-           break;
-       case UTF8_WARN_SHORT:
-           Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
-                           (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
-           expectlen = curlen;         /* distance for caller to skip */
-           break;
-       case UTF8_WARN_OVERFLOW:
-           Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
-                           ouv, *s, startbyte);
-           break;
-       case UTF8_WARN_SURROGATE:
-           Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
-           break;
-       case UTF8_WARN_LONG:
-           Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
-                          (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
-           break;
-       case UTF8_WARN_FFFF:
-           Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
-           break;
-       default:
-           sv_catpvs(sv, "(unknown reason)");
-           break;
+               break;
+           case UTF8_WARN_SHORT:
+               Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
+                               (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
+               expectlen = curlen;             /* distance for caller to skip */
+               break;
+           case UTF8_WARN_OVERFLOW:
+               Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
+                               ouv, *s, startbyte);
+               break;
+           case UTF8_WARN_LONG:
+               Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
+                               (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
+               break;
+           default:
+               sv_catpvs(sv, "(unknown reason)");
+               break;
         }
         
         }
         
-       if (warning) {
+       if (sv) {
             const char * const s = SvPVX_const(sv);
  
             if (PL_op)
             const char * const s = SvPVX_const(sv);
  
             if (PL_op)
@@ -651,7 +756,7 @@ malformed:
  /*
  =for apidoc utf8_to_uvchr
  
  /*
  =for apidoc utf8_to_uvchr
  
-Returns the native character value of the first character in the string C<s>
+Returns the native code point of the first character in the string C<s>
  which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
  length, in bytes, of that character.
  
  which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
  length, in bytes, of that character.
  
@@ -661,13 +766,14 @@ returned and retlen is set, if possible, to -1.
  =cut
  */
  
  =cut
  */
  
+
  UV
  Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
  {
      PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
  
      return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
  UV
  Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
  {
      PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
  
      return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
-                         ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+                         ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
  }
  
  /*
  }
  
  /*
@@ -693,7 +799,7 @@ Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
  
      /* Call the low level routine asking for checks */
      return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
  
      /* Call the low level routine asking for checks */
      return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
-                              ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+                              ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
  }
  
  /*
  }
  
  /*
@@ -731,13 +837,11 @@ Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
      if (e != s) {
         len--;
          warn_and_return:
      if (e != s) {
         len--;
          warn_and_return:
-       if (ckWARN_d(WARN_UTF8)) {
-           if (PL_op)
-               Perl_warner(aTHX_ packWARN(WARN_UTF8),
-                           "%s in %s", unees, OP_DESC(PL_op));
-           else
-               Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
-       }
+       if (PL_op)
+           Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
+                            "%s in %s", unees, OP_DESC(PL_op));
+       else
+           Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
      }
  
      return len;
      }
  
      return len;
@@ -801,6 +905,74 @@ Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
  }
  
  /*
  }
  
  /*
+=for apidoc bytes_cmp_utf8
+
+Compares the sequence of characters (stored as octets) in b, blen with the
+sequence of characters (stored as UTF-8) in u, ulen. Returns 0 if they are
+equal, -1 or -2 if the first string is less than the second string, +1 or +2
+if the first string is greater than the second string.
+
+-1 or +1 is returned if the shorter string was identical to the start of the
+longer string. -2 or +2 is returned if there was a difference between characters
+within the strings.
+
+=cut
+*/
+
+int
+Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
+{
+    const U8 *const bend = b + blen;
+    const U8 *const uend = u + ulen;
+
+    PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
+
+    PERL_UNUSED_CONTEXT;
+
+    while (b < bend && u < uend) {
+        U8 c = *u++;
+       if (!UTF8_IS_INVARIANT(c)) {
+           if (UTF8_IS_DOWNGRADEABLE_START(c)) {
+               if (u < uend) {
+                   U8 c1 = *u++;
+                   if (UTF8_IS_CONTINUATION(c1)) {
+                       c = UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(c, c1));
+                   } else {
+                       Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
+                                        "Malformed UTF-8 character "
+                                        "(unexpected non-continuation byte 0x%02x"
+                                        ", immediately after start byte 0x%02x)"
+                                        /* Dear diag.t, it's in the pod.  */
+                                        "%s%s", c1, c,
+                                        PL_op ? " in " : "",
+                                        PL_op ? OP_DESC(PL_op) : "");
+                       return -2;
+                   }
+               } else {
+                   if (PL_op)
+                       Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
+                                        "%s in %s", unees, OP_DESC(PL_op));
+                   else
+                       Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
+                   return -2; /* Really want to return undef :-)  */
+               }
+           } else {
+               return -2;
+           }
+       }
+       if (*b != c) {
+           return *b < c ? -2 : +2;
+       }
+       ++b;
+    }
+
+    if (b == bend && u == uend)
+       return 0;
+
+    return b < bend ? +1 : -1;
+}
+
+/*
  =for apidoc utf8_to_bytes
  
  Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
  =for apidoc utf8_to_bytes
  
  Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
@@ -893,8 +1065,7 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
         U8 c = *s++;
         if (!UTF8_IS_INVARIANT(c)) {
             /* Then it is two-byte encoded */
         U8 c = *s++;
         if (!UTF8_IS_INVARIANT(c)) {
             /* Then it is two-byte encoded */
-           c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
-           c = ASCII_TO_NATIVE(c);
+           c = UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(c, *s++));
         }
         *d++ = c;
      }
         }
         *d++ = c;
      }
@@ -906,9 +1077,10 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
  /*
  =for apidoc bytes_to_utf8
  
  /*
  =for apidoc bytes_to_utf8
  
-Converts a string C<s> of length C<len> from the native encoding into UTF-8.
+Converts a string C<s> of length C<len> bytes from the native encoding into
+UTF-8.
  Returns a pointer to the newly-created string, and sets C<len> to
  Returns a pointer to the newly-created string, and sets C<len> to
-reflect the new length.
+reflect the new length in bytes.
  
  A NUL character will be written after the end of the string.
  
  
  A NUL character will be written after the end of the string.
  
@@ -919,6 +1091,9 @@ see sv_recode_to_utf8().
  =cut
  */
  
  =cut
  */
  
+/* This logic is duplicated in sv_catpvn_flags, so any bug fixes will
+   likewise need duplication. */
+
  U8*
  Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
  {
  U8*
  Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
  {
@@ -960,12 +1135,6 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
  
      PERL_ARGS_ASSERT_UTF16_TO_UTF8;
  
  
      PERL_ARGS_ASSERT_UTF16_TO_UTF8;
  
-    if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
-        d[0] = 0;
-        *newlen = 1;
-        return d;
-    }
-
      if (bytelen & 1)
         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
  
      if (bytelen & 1)
         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
  
@@ -987,12 +1156,18 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
             *d++ = (U8)(( uv        & 0x3f) | 0x80);
             continue;
         }
             *d++ = (U8)(( uv        & 0x3f) | 0x80);
             continue;
         }
-       if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
-           UV low = (p[0] << 8) + p[1];
-           p += 2;
-           if (low < 0xdc00 || low >= 0xdfff)
+       if (uv >= 0xd800 && uv <= 0xdbff) {     /* surrogates */
+           if (p >= pend) {
                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
-           uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
+           } else {
+               UV low = (p[0] << 8) + p[1];
+               p += 2;
+               if (low < 0xdc00 || low > 0xdfff)
+                   Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
+               uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
+           }
+       } else if (uv >= 0xdc00 && uv <= 0xdfff) {
+           Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
         }
         if (uv < 0x10000) {
             *d++ = (U8)(( uv >> 12)         | 0xe0);
         }
         if (uv < 0x10000) {
             *d++ = (U8)(( uv >> 12)         | 0xe0);
@@ -1022,6 +1197,10 @@ Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
  
      PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
  
  
      PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
  
+    if (bytelen & 1)
+       Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf,
+                  (UV)bytelen);
+
      while (s < send) {
         const U8 tmp = s[0];
         s[0] = s[1];
      while (s < send) {
         const U8 tmp = s[0];
         s[0] = s[1];
@@ -1031,7 +1210,9 @@ Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
      return utf16_to_utf8(p, d, bytelen, newlen);
  }
  
      return utf16_to_utf8(p, d, bytelen, newlen);
  }
  
-/* for now these are all defined (inefficiently) in terms of the utf8 versions */
+/* for now these are all defined (inefficiently) in terms of the utf8 versions.
+ * Note that the macros in handy.h that call these short-circuit calling them
+ * for Latin-1 range inputs */
  
  bool
  Perl_is_uni_alnum(pTHX_ UV c)
  
  bool
  Perl_is_uni_alnum(pTHX_ UV c)
@@ -1060,9 +1241,7 @@ Perl_is_uni_alpha(pTHX_ UV c)
  bool
  Perl_is_uni_ascii(pTHX_ UV c)
  {
  bool
  Perl_is_uni_ascii(pTHX_ UV c)
  {
-    U8 tmpbuf[UTF8_MAXBYTES+1];
-    uvchr_to_utf8(tmpbuf, c);
-    return is_utf8_ascii(tmpbuf);
+    return isASCII(c);
  }
  
  bool
  }
  
  bool
@@ -1100,9 +1279,7 @@ Perl_is_uni_lower(pTHX_ UV c)
  bool
  Perl_is_uni_cntrl(pTHX_ UV c)
  {
  bool
  Perl_is_uni_cntrl(pTHX_ UV c)
  {
-    U8 tmpbuf[UTF8_MAXBYTES+1];
-    uvchr_to_utf8(tmpbuf, c);
-    return is_utf8_cntrl(tmpbuf);
+    return isCNTRL_L1(c);
  }
  
  bool
  }
  
  bool
@@ -1137,9 +1314,18 @@ Perl_is_uni_xdigit(pTHX_ UV c)
      return is_utf8_xdigit(tmpbuf);
  }
  
      return is_utf8_xdigit(tmpbuf);
  }
  
+
  UV
  Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
  {
  UV
  Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
  {
+    /* Convert the Unicode character whose ordinal is c to its uppercase
+     * version and store that in UTF-8 in p and its length in bytes in lenp.
+     * Note that the p needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
+     * the changed version may be longer than the original character.
+     *
+     * The ordinal of the first character of the changed version is returned
+     * (but note, as explained above, that there may be more.) */
+
      PERL_ARGS_ASSERT_TO_UNI_UPPER;
  
      uvchr_to_utf8(p, c);
      PERL_ARGS_ASSERT_TO_UNI_UPPER;
  
      uvchr_to_utf8(p, c);
@@ -1165,12 +1351,12 @@ Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
  }
  
  UV
  }
  
  UV
-Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
+Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
  {
  {
-    PERL_ARGS_ASSERT_TO_UNI_FOLD;
+    PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
  
      uvchr_to_utf8(p, c);
  
      uvchr_to_utf8(p, c);
-    return to_utf8_fold(p, p, lenp);
+    return _to_utf8_fold_flags(p, p, lenp, flags);
  }
  
  /* for now these all assume no locale info available for Unicode > 255 */
  }
  
  /* for now these all assume no locale info available for Unicode > 255 */
@@ -1325,18 +1511,49 @@ Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
  }
  
  bool
  }
  
  bool
-Perl_is_utf8_idcont(pTHX_ const U8 *p)
+Perl_is_utf8_xidfirst(pTHX_ const U8 *p) /* The naming is historical. */
  {
      dVAR;
  
  {
      dVAR;
  
-    PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
+    PERL_ARGS_ASSERT_IS_UTF8_XIDFIRST;
  
      if (*p == '_')
         return TRUE;
  
      if (*p == '_')
         return TRUE;
+    /* is_utf8_idstart would be more logical. */
+    return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart");
+}
+
+bool
+Perl__is_utf8__perl_idstart(pTHX_ const U8 *p)
+{
+    dVAR;
+
+    PERL_ARGS_ASSERT__IS_UTF8__PERL_IDSTART;
+
+    return is_utf8_common(p, &PL_utf8_perl_idstart, "_Perl_IDStart");
+}
+
+bool
+Perl_is_utf8_idcont(pTHX_ const U8 *p)
+{
+    dVAR;
+
+    PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
+
      return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
  }
  
  bool
      return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
  }
  
  bool
+Perl_is_utf8_xidcont(pTHX_ const U8 *p)
+{
+    dVAR;
+
+    PERL_ARGS_ASSERT_IS_UTF8_XIDCONT;
+
+    return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue");
+}
+
+bool
  Perl_is_utf8_alpha(pTHX_ const U8 *p)
  {
      dVAR;
  Perl_is_utf8_alpha(pTHX_ const U8 *p)
  {
      dVAR;
@@ -1353,7 +1570,9 @@ Perl_is_utf8_ascii(pTHX_ const U8 *p)
  
      PERL_ARGS_ASSERT_IS_UTF8_ASCII;
  
  
      PERL_ARGS_ASSERT_IS_UTF8_ASCII;
  
-    return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
+    /* ASCII characters are the same whether in utf8 or not.  So the macro
+     * works on both utf8 and non-utf8 representations. */
+    return isASCII(*p);
  }
  
  bool
  }
  
  bool
@@ -1363,7 +1582,31 @@ Perl_is_utf8_space(pTHX_ const U8 *p)
  
      PERL_ARGS_ASSERT_IS_UTF8_SPACE;
  
  
      PERL_ARGS_ASSERT_IS_UTF8_SPACE;
  
-    return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
+    return is_utf8_common(p, &PL_utf8_space, "IsXPerlSpace");
+}
+
+bool
+Perl_is_utf8_perl_space(pTHX_ const U8 *p)
+{
+    dVAR;
+
+    PERL_ARGS_ASSERT_IS_UTF8_PERL_SPACE;
+
+    /* Only true if is an ASCII space-like character, and ASCII is invariant
+     * under utf8, so can just use the macro */
+    return isSPACE_A(*p);
+}
+
+bool
+Perl_is_utf8_perl_word(pTHX_ const U8 *p)
+{
+    dVAR;
+
+    PERL_ARGS_ASSERT_IS_UTF8_PERL_WORD;
+
+    /* Only true if is an ASCII word character, and ASCII is invariant
+     * under utf8, so can just use the macro */
+    return isWORDCHAR_A(*p);
  }
  
  bool
  }
  
  bool
@@ -1377,6 +1620,18 @@ Perl_is_utf8_digit(pTHX_ const U8 *p)
  }
  
  bool
  }
  
  bool
+Perl_is_utf8_posix_digit(pTHX_ const U8 *p)
+{
+    dVAR;
+
+    PERL_ARGS_ASSERT_IS_UTF8_POSIX_DIGIT;
+
+    /* Only true if is an ASCII digit character, and ASCII is invariant
+     * under utf8, so can just use the macro */
+    return isDIGIT_A(*p);
+}
+
+bool
  Perl_is_utf8_upper(pTHX_ const U8 *p)
  {
      dVAR;
  Perl_is_utf8_upper(pTHX_ const U8 *p)
  {
      dVAR;
@@ -1403,7 +1658,15 @@ Perl_is_utf8_cntrl(pTHX_ const U8 *p)
  
      PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
  
  
      PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
  
-    return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
+    if (isASCII(*p)) {
+       return isCNTRL_A(*p);
+    }
+
+    /* All controls are in Latin1 */
+    if (! UTF8_IS_DOWNGRADEABLE_START(*p)) {
+       return 0;
+    }
+    return isCNTRL_L1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)));
  }
  
  bool
  }
  
  bool
@@ -1443,7 +1706,7 @@ Perl_is_utf8_xdigit(pTHX_ const U8 *p)
  
      PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
  
  
      PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
  
-    return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit");
+    return is_utf8_common(p, &PL_utf8_xdigit, "IsXDigit");
  }
  
  bool
  }
  
  bool
@@ -1456,6 +1719,106 @@ Perl_is_utf8_mark(pTHX_ const U8 *p)
      return is_utf8_common(p, &PL_utf8_mark, "IsM");
  }
  
      return is_utf8_common(p, &PL_utf8_mark, "IsM");
  }
  
+bool
+Perl_is_utf8_X_begin(pTHX_ const U8 *p)
+{
+    dVAR;
+
+    PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN;
+
+    return is_utf8_common(p, &PL_utf8_X_begin, "_X_Begin");
+}
+
+bool
+Perl_is_utf8_X_extend(pTHX_ const U8 *p)
+{
+    dVAR;
+
+    PERL_ARGS_ASSERT_IS_UTF8_X_EXTEND;
+
+    return is_utf8_common(p, &PL_utf8_X_extend, "_X_Extend");
+}
+
+bool
+Perl_is_utf8_X_prepend(pTHX_ const U8 *p)
+{
+    dVAR;
+
+    PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND;
+
+    return is_utf8_common(p, &PL_utf8_X_prepend, "GCB=Prepend");
+}
+
+bool
+Perl_is_utf8_X_non_hangul(pTHX_ const U8 *p)
+{
+    dVAR;
+
+    PERL_ARGS_ASSERT_IS_UTF8_X_NON_HANGUL;
+
+    return is_utf8_common(p, &PL_utf8_X_non_hangul, "HST=Not_Applicable");
+}
+
+bool
+Perl_is_utf8_X_L(pTHX_ const U8 *p)
+{
+    dVAR;
+
+    PERL_ARGS_ASSERT_IS_UTF8_X_L;
+
+    return is_utf8_common(p, &PL_utf8_X_L, "GCB=L");
+}
+
+bool
+Perl_is_utf8_X_LV(pTHX_ const U8 *p)
+{
+    dVAR;
+
+    PERL_ARGS_ASSERT_IS_UTF8_X_LV;
+
+    return is_utf8_common(p, &PL_utf8_X_LV, "GCB=LV");
+}
+
+bool
+Perl_is_utf8_X_LVT(pTHX_ const U8 *p)
+{
+    dVAR;
+
+    PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
+
+    return is_utf8_common(p, &PL_utf8_X_LVT, "GCB=LVT");
+}
+
+bool
+Perl_is_utf8_X_T(pTHX_ const U8 *p)
+{
+    dVAR;
+
+    PERL_ARGS_ASSERT_IS_UTF8_X_T;
+
+    return is_utf8_common(p, &PL_utf8_X_T, "GCB=T");
+}
+
+bool
+Perl_is_utf8_X_V(pTHX_ const U8 *p)
+{
+    dVAR;
+
+    PERL_ARGS_ASSERT_IS_UTF8_X_V;
+
+    return is_utf8_common(p, &PL_utf8_X_V, "GCB=V");
+}
+
+bool
+Perl_is_utf8_X_LV_LVT_V(pTHX_ const U8 *p)
+{
+    dVAR;
+
+    PERL_ARGS_ASSERT_IS_UTF8_X_LV_LVT_V;
+
+    return is_utf8_common(p, &PL_utf8_X_LV_LVT_V, "_X_LV_LVT_V");
+}
+
  /*
  =for apidoc to_utf8_case
  
  /*
  =for apidoc to_utf8_case
  
@@ -1468,7 +1831,7 @@ of the result.
  
  The "swashp" is a pointer to the swash to use.
  
  
  The "swashp" is a pointer to the swash to use.
  
-Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
+Both the special and normal mappings are stored in lib/unicore/To/Foo.pl,
  and loaded by SWASHNEW, using lib/utf8_heavy.pl.  The special (usually,
  but not always, a multicharacter mapping), is tried first.
  
  and loaded by SWASHNEW, using lib/utf8_heavy.pl.  The special (usually,
  but not always, a multicharacter mapping), is tried first.
  
@@ -1496,13 +1859,34 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
  
      PERL_ARGS_ASSERT_TO_UTF8_CASE;
  
  
      PERL_ARGS_ASSERT_TO_UTF8_CASE;
  
+    /* Note that swash_fetch() doesn't output warnings for these because it
+     * assumes we will */
+    if (uv1 >= UNICODE_SURROGATE_FIRST) {
+       if (uv1 <= UNICODE_SURROGATE_LAST) {
+           if (ckWARN_d(WARN_SURROGATE)) {
+               const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
+               Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
+                   "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1);
+           }
+       }
+       else if (UNICODE_IS_SUPER(uv1)) {
+           if (ckWARN_d(WARN_NON_UNICODE)) {
+               const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
+               Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
+                   "Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1);
+           }
+       }
+
+       /* Note that non-characters are perfectly legal, so no warning should
+        * be given */
+    }
+
      uvuni_to_utf8(tmpbuf, uv1);
  
      if (!*swashp) /* load on-demand */
           *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
  
      uvuni_to_utf8(tmpbuf, uv1);
  
      if (!*swashp) /* load on-demand */
           *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
  
-    /* The 0xDF is the only special casing Unicode code point below 0x100. */
-    if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
+    if (special) {
           /* It might be "special" (sometimes, but not always,
           * a multicharacter mapping) */
          HV * const hv = get_hv(special, 0);
           /* It might be "special" (sometimes, but not always,
           * a multicharacter mapping) */
          HV * const hv = get_hv(special, 0);
@@ -1562,7 +1946,8 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
          }
      }
  
          }
      }
  
-    if (!len) /* Neither: just copy. */
+    if (!len) /* Neither: just copy.  In other words, there was no mapping
+                defined, which means that the code point maps to itself */
          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
  
      if (lenp)
          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
  
      if (lenp)
@@ -1657,15 +2042,20 @@ The first character of the foldcased version is returned
  
  =cut */
  
  
  =cut */
  
+/* Not currently externally documented is 'flags', which currently is non-zero
+ * if full case folds are to be used; otherwise simple folds */
+
  UV
  UV
-Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
+Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
  {
  {
+    const char *specials = (flags) ? "utf8::ToSpecFold" : NULL;
+
      dVAR;
  
      dVAR;
  
-    PERL_ARGS_ASSERT_TO_UTF8_FOLD;
+    PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
  
      return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
  
      return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
-                             &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
+                             &PL_utf8_tofold, "ToFold", specials);
  }
  
  /* Note:
  }
  
  /* Note:
@@ -1684,15 +2074,18 @@ Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits
      const size_t name_len = strlen(name);
      HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
      SV* errsv_save;
      const size_t name_len = strlen(name);
      HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
      SV* errsv_save;
+    GV *method;
  
      PERL_ARGS_ASSERT_SWASH_INIT;
  
      PUSHSTACKi(PERLSI_MAGIC);
      ENTER;
  
      PERL_ARGS_ASSERT_SWASH_INIT;
  
      PUSHSTACKi(PERLSI_MAGIC);
      ENTER;
-    SAVEI32(PL_hints);
-    PL_hints = 0;
+    SAVEHINTS();
      save_re_context();
      save_re_context();
-    if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {     /* demand load utf8 */
+    if (PL_parser && PL_parser->error_count)
+       SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
+    method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
+    if (!method) {     /* demand load utf8 */
         ENTER;
         errsv_save = newSVsv(ERRSV);
         /* It is assumed that callers of this routine are not passing in any
         ENTER;
         errsv_save = newSVsv(ERRSV);
         /* It is assumed that callers of this routine are not passing in any
@@ -1719,7 +2112,10 @@ Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits
      mPUSHi(none);
      PUTBACK;
      errsv_save = newSVsv(ERRSV);
      mPUSHi(none);
      PUTBACK;
      errsv_save = newSVsv(ERRSV);
-    if (call_method("SWASHNEW", G_SCALAR))
+    /* If we already have a pointer to the method, no need to use call_method()
+       to repeat the lookup.  */
+    if (method ? call_sv(MUTABLE_SV(method), G_SCALAR)
+       : call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR | G_METHOD))
         retval = newSVsv(*PL_stack_sp--);
      else
         retval = &PL_sv_undef;
         retval = newSVsv(*PL_stack_sp--);
      else
         retval = &PL_sv_undef;
@@ -1745,7 +2141,8 @@ Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits
   * return several Unicode characters for a single Unicode character
   * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
   * the lower-level routine, and it is similarly broken for returning
   * return several Unicode characters for a single Unicode character
   * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
   * the lower-level routine, and it is similarly broken for returning
- * multiple values.  --jhi */
+ * multiple values.  --jhi
+ * For those, you should use to_utf8_case() instead */
  /* Now SWASHGET is recasted into S_swash_get in this file. */
  
  /* Note:
  /* Now SWASHGET is recasted into S_swash_get in this file. */
  
  /* Note:
@@ -1777,7 +2174,7 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
         ptr = tmputf8;
      }
      /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
         ptr = tmputf8;
      }
      /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
-     * then the "swatch" is a vec() for al the chars which start
+     * then the "swatch" is a vec() for all the chars which start
       * with 0xAA..0xYY
       * So the key in the hash (klen) is length of encoded char -1
       */
       * with 0xAA..0xYY
       * So the key in the hash (klen) is length of encoded char -1
       */
@@ -1785,7 +2182,7 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
      off  = ptr[klen];
  
      if (klen == 0) {
      off  = ptr[klen];
  
      if (klen == 0) {
-      /* If char in invariant then swatch is for all the invariant chars
+      /* If char is invariant then swatch is for all the invariant chars
         * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
         */
         needents = UTF_CONTINUATION_MARK;
         * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
         */
         needents = UTF_CONTINUATION_MARK;
@@ -1795,6 +2192,19 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
        /* If char is encoded then swatch is for the prefix */
         needents = (1 << UTF_ACCUMULATION_SHIFT);
         off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
        /* If char is encoded then swatch is for the prefix */
         needents = (1 << UTF_ACCUMULATION_SHIFT);
         off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
+       if (UTF8_IS_SUPER(ptr) && ckWARN_d(WARN_NON_UNICODE)) {
+           const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0, 0);
+
+           /* This outputs warnings for binary properties only, assuming that
+            * to_utf8_case() will output any for non-binary.  Also, surrogates
+            * aren't checked for, as that would warn on things like
+            * /\p{Gc=Cs}/ */
+           SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
+           if (SvUV(*bitssvp) == 1) {
+               Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
+                   "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", code_point);
+           }
+       }
      }
  
      /*
      }
  
      /*
@@ -1869,27 +2279,134 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
      NORETURN_FUNCTION_END;
  }
  
      NORETURN_FUNCTION_END;
  }
  
-/* Note:
- * Returns a swatch (a bit vector string) for a code point sequence
- * that starts from the value C<start> and comprises the number C<span>.
- * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
- * Should be used via swash_fetch, which will cache the swatch in C<swash>.
+/* Read a single line of the main body of the swash input text.  These are of
+ * the form:
+ * 0053        0056    0073
+ * where each number is hex.  The first two numbers form the minimum and
+ * maximum of a range, and the third is the value associated with the range.
+ * Not all swashes should have a third number
+ *
+ * On input: l   points to the beginning of the line to be examined; it points
+ *               to somewhere in the string of the whole input text, and is
+ *               terminated by a \n or the null string terminator.
+ *          lend   points to the null terminator of that string
+ *          wants_value    is non-zero if the swash expects a third number
+ *          typestr is the name of the swash's mapping, like 'ToLower'
+ * On output: *min, *max, and *val are set to the values read from the line.
+ *           returns a pointer just beyond the line examined.  If there was no
+ *           valid min number on the line and no \n follows, returns lend+1
   */
   */
-STATIC SV*
-S_swash_get(pTHX_ SV* swash, UV start, UV span)
+
+STATIC U8*
+S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
+                            const bool wants_value, const U8* const typestr)
  {
  {
-    SV *swatch;
-    U8 *l, *lend, *x, *xend, *s;
-    STRLEN lcur, xcur, scur;
-    HV *const hv = MUTABLE_HV(SvRV(swash));
-    SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
-    SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
-    SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
-    SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
-    SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
-    const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
      const int  typeto  = typestr[0] == 'T' && typestr[1] == 'o';
      const int  typeto  = typestr[0] == 'T' && typestr[1] == 'o';
-    const STRLEN bits  = SvUV(*bitssvp);
+    STRLEN numlen;         /* Length of the number */
+    I32 flags = PERL_SCAN_SILENT_ILLDIGIT
+               | PERL_SCAN_DISALLOW_PREFIX
+               | PERL_SCAN_SILENT_NON_PORTABLE;
+
+    /* nl points to the next \n in the scan */
+    U8* const nl = (U8*)memchr(l, '\n', lend - l);
+
+    /* Get the first number on the line: the range minimum */
+    numlen = lend - l;
+    *min = grok_hex((char *)l, &numlen, &flags, NULL);
+    if (numlen)            /* If found a hex number, position past it */
+       l += numlen;
+    else if (nl) {         /* Else, go handle next line, if any */
+       return nl + 1;  /* 1 is length of "\n" */
+    }
+    else {             /* Else, no next line */
+       return lend + 1;        /* to LIST's end at which \n is not found */
+    }
+
+    /* The max range value follows, separated by a BLANK */
+    if (isBLANK(*l)) {
+       ++l;
+       flags = PERL_SCAN_SILENT_ILLDIGIT
+               | PERL_SCAN_DISALLOW_PREFIX
+               | PERL_SCAN_SILENT_NON_PORTABLE;
+       numlen = lend - l;
+       *max = grok_hex((char *)l, &numlen, &flags, NULL);
+       if (numlen)
+           l += numlen;
+       else    /* If no value here, it is a single element range */
+           *max = *min;
+
+       /* Non-binary tables have a third entry: what the first element of the
+        * range maps to */
+       if (wants_value) {
+           if (isBLANK(*l)) {
+               ++l;
+               flags = PERL_SCAN_SILENT_ILLDIGIT
+                     | PERL_SCAN_DISALLOW_PREFIX
+                     | PERL_SCAN_SILENT_NON_PORTABLE;
+               numlen = lend - l;
+               *val = grok_hex((char *)l, &numlen, &flags, NULL);
+               if (numlen)
+                   l += numlen;
+               else
+                   *val = 0;
+           }
+           else {
+               *val = 0;
+               if (typeto) {
+                   Perl_croak(aTHX_ "%s: illegal mapping '%s'",
+                                    typestr, l);
+               }
+           }
+       }
+       else
+           *val = 0; /* bits == 1, then any val should be ignored */
+    }
+    else { /* Nothing following range min, should be single element with no
+             mapping expected */
+       *max = *min;
+       if (wants_value) {
+           *val = 0;
+           if (typeto) {
+               Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
+           }
+       }
+       else
+           *val = 0; /* bits == 1, then val should be ignored */
+    }
+
+    /* Position to next line if any, or EOF */
+    if (nl)
+       l = nl + 1;
+    else
+       l = lend;
+
+    return l;
+}
+
+/* Note:
+ * Returns a swatch (a bit vector string) for a code point sequence
+ * that starts from the value C<start> and comprises the number C<span>.
+ * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
+ * Should be used via swash_fetch, which will cache the swatch in C<swash>.
+ */
+STATIC SV*
+S_swash_get(pTHX_ SV* swash, UV start, UV span)
+{
+    SV *swatch;
+    U8 *l, *lend, *x, *xend, *s, *send;
+    STRLEN lcur, xcur, scur;
+    HV *const hv = MUTABLE_HV(SvRV(swash));
+
+    /* The string containing the main body of the table */
+    SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
+
+    SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
+    SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
+    SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
+    SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
+    SV** const invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
+    const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
+    const STRLEN bits  = SvUV(*bitssvp);
      const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
      const UV     none  = SvUV(*nonesvp);
      const UV     end   = start + span;
      const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
      const UV     none  = SvUV(*nonesvp);
      const UV     end   = start + span;
@@ -1935,74 +2452,13 @@ S_swash_get(pTHX_ SV* swash, UV start, UV span)
      lend = l + lcur;
      while (l < lend) {
         UV min, max, val;
      lend = l + lcur;
      while (l < lend) {
         UV min, max, val;
-       STRLEN numlen;
-       I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
-
-       U8* const nl = (U8*)memchr(l, '\n', lend - l);
-
-       numlen = lend - l;
-       min = grok_hex((char *)l, &numlen, &flags, NULL);
-       if (numlen)
-           l += numlen;
-       else if (nl) {
-           l = nl + 1; /* 1 is length of "\n" */
-           continue;
-       }
-       else {
-           l = lend; /* to LIST's end at which \n is not found */
+       l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
+                                        cBOOL(octets), typestr);
+       if (l > lend) {
             break;
         }
  
             break;
         }
  
-       if (isBLANK(*l)) {
-           ++l;
-           flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
-           numlen = lend - l;
-           max = grok_hex((char *)l, &numlen, &flags, NULL);
-           if (numlen)
-               l += numlen;
-           else
-               max = min;
-
-           if (octets) {
-               if (isBLANK(*l)) {
-                   ++l;
-                   flags = PERL_SCAN_SILENT_ILLDIGIT |
-                           PERL_SCAN_DISALLOW_PREFIX;
-                   numlen = lend - l;
-                   val = grok_hex((char *)l, &numlen, &flags, NULL);
-                   if (numlen)
-                       l += numlen;
-                   else
-                       val = 0;
-               }
-               else {
-                   val = 0;
-                   if (typeto) {
-                       Perl_croak(aTHX_ "%s: illegal mapping '%s'",
-                                        typestr, l);
-                   }
-               }
-           }
-           else
-               val = 0; /* bits == 1, then val should be ignored */
-       }
-       else {
-           max = min;
-           if (octets) {
-               val = 0;
-               if (typeto) {
-                   Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
-               }
-           }
-           else
-               val = 0; /* bits == 1, then val should be ignored */
-       }
-
-       if (nl)
-           l = nl + 1;
-       else
-           l = lend;
-
+       /* If looking for something beyond this range, go try the next one */
         if (max < start)
             continue;
  
         if (max < start)
             continue;
  
@@ -2051,7 +2507,31 @@ S_swash_get(pTHX_ SV* swash, UV start, UV span)
      } /* while */
    go_out_list:
  
      } /* while */
    go_out_list:
  
-    /* read $swash->{EXTRAS} */
+    /* Invert if the data says it should be.  Assumes that bits == 1 */
+    if (invert_it_svp && SvUV(*invert_it_svp)) {
+
+       /* Unicode properties should come with all bits above PERL_UNICODE_MAX
+        * being 0, and their inversion should also be 0, as we don't succeed any
+        * Unicode property matches for non-Unicode code points */
+       if (start <= PERL_UNICODE_MAX) {
+
+           /* The code below assumes that we never cross the
+            * Unicode/above-Unicode boundary in a range, as otherwise we would
+            * have to figure out where to stop flipping the bits.  Since this
+            * boundary is divisible by a large power of 2, and swatches come
+            * in small powers of 2, this should be a valid assumption */
+           assert(start + span - 1 <= PERL_UNICODE_MAX);
+
+           send = s + scur;
+           while (s < send) {
+               *s = ~(*s);
+               s++;
+           }
+       }
+    }
+
+    /* read $swash->{EXTRAS}
+     * This code also copied to swash_to_invlist() below */
      x = (U8*)SvPV(*extssvp, xcur);
      xend = x + xcur;
      while (x < xend) {
      x = (U8*)SvPV(*extssvp, xcur);
      xend = x + xcur;
      while (x < xend) {
@@ -2184,10 +2664,435 @@ S_swash_get(pTHX_ SV* swash, UV start, UV span)
      return swatch;
  }
  
      return swatch;
  }
  
+HV*
+Perl__swash_inversion_hash(pTHX_ SV* const swash)
+{
+
+   /* Build and return a new hash that maps each mapped-to code point of the
+    * swash 'swash' to the list of code points that map to it.
+    *
+    * Subject to change or removal.  For use only in one place in regcomp.c.
+    * Can't be used on a property that is subject to user override, as it
+    * relies on the value of SPECIALS in the swash which would be set by
+    * utf8_heavy.pl to the hash in the non-overridden file, and hence is not
+    * set for overridden properties
+    *
+    * Returns a hash which is the inversion and closure of a swash mapping.
+    * For example, consider the input lines:
+    * 004B             006B
+    * 004C             006C
+    * 212A             006B
+    *
+    * The returned hash would have two keys, the utf8 for 006B and the utf8 for
+    * 006C.  The value for each key is an array whose elements are code points
+    * (each stored as a UV, not as utf8).  For 006C, the array would have two
+    * elements, 006C itself and 004C.  For 006B, there would be three elements
+    * in its array: 006B, 004B and 212A.
+    *
+    * Essentially, for any code point, it gives all the code points that map to
+    * it, or the list of 'froms' for that point.
+    *
+    * Currently it ignores any additions or deletions from other swashes,
+    * looking at just the main body of the swash, and if there are SPECIALS
+    * in the swash, at that hash
+    *
+    * The specials hash can be extra code points, and most likely consists of
+    * maps from single code points to multiple ones (each expressed as a string
+    * of utf8 characters).   This function currently returns only 1-1 mappings.
+    * However consider this possible input in the specials hash:
+    * "\xEF\xAC\x85" => "\x{0073}\x{0074}",         # U+FB05 => 0073 0074
+    * "\xEF\xAC\x86" => "\x{0073}\x{0074}",         # U+FB06 => 0073 0074
+    *
+    * Both FB05 and FB06 map to the same multi-char sequence, which we don't
+    * currently handle.  But it also means that FB05 and FB06 are equivalent in
+    * a 1-1 mapping which we should handle, and this relationship may not be in
+    * the main table.  Therefore this function examines all the multi-char
+    * sequences and adds the 1-1 mappings that come out of that.
+    *
+    * Croaks (panics) if BITS in the swash is not 8, 16 or 32. */
+
+    U8 *l, *lend;
+    STRLEN lcur;
+    HV *const hv = MUTABLE_HV(SvRV(swash));
+
+    /* The string containing the main body of the table */
+    SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
+
+    SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
+    SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
+    SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
+    /*SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);*/
+    const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
+    const STRLEN bits  = SvUV(*bitssvp);
+    const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
+    const UV     none  = SvUV(*nonesvp);
+
+    /* Optional; NULL if the swash has no SPECIALS entry */
+    SV **specials_p = hv_fetchs(hv, "SPECIALS", 0);
+
+    /* The hash that gets built up and returned */
+    HV* ret = newHV();
+
+    PERL_ARGS_ASSERT__SWASH_INVERSION_HASH;
+
+    /* Must have at least 8 bits to get the mappings */
+    if (bits != 8 && bits != 16 && bits != 32) {
+       Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %"UVuf,
+                                                (UV)bits);
+    }
+
+    if (specials_p) { /* It might be "special" (sometimes, but not always, a
+                       mapping to more than one character) */
+
+       /* Construct an inverse mapping hash for the specials */
+       HV * const specials_hv = MUTABLE_HV(SvRV(*specials_p));
+       HV * specials_inverse = newHV();
+       char *char_from; /* the lhs of the map */
+       I32 from_len;   /* its byte length */
+       char *char_to;  /* the rhs of the map */
+       I32 to_len;     /* its byte length */
+       SV *sv_to;      /* and in a sv */
+       AV* from_list;  /* list of things that map to each 'to' */
+
+       hv_iterinit(specials_hv);
+
+       /* The keys are the characters (in utf8) that map to the corresponding
+        * utf8 string value.  Iterate through the list creating the inverse
+        * list. */
+       while ((sv_to = hv_iternextsv(specials_hv, &char_from, &from_len))) {
+           SV** listp;
+           if (! SvPOK(sv_to)) {
+               Perl_croak(aTHX_ "panic: value returned from hv_iternextsv() unexpectedly is not a string");
+           }
+           /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %"UVXf", First char of to is %"UVXf"\n", utf8_to_uvchr((U8*) char_from, 0), utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/
+
+           /* Each key in the inverse list is a mapped-to value, and the key's
+            * hash value is a list of the strings (each in utf8) that map to
+            * it.  Those strings are all one character long */
+           if ((listp = hv_fetch(specials_inverse,
+                                   SvPVX(sv_to),
+                                   SvCUR(sv_to), 0)))
+           {
+               from_list = (AV*) *listp;
+           }
+           else { /* No entry yet for it: create one */
+               from_list = newAV();
+               if (! hv_store(specials_inverse,
+                               SvPVX(sv_to),
+                               SvCUR(sv_to),
+                               (SV*) from_list, 0))
+               {
+                   Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
+               }
+           }
+
+           /* Here have the list associated with this 'to' (perhaps newly
+            * created and empty).  Just add to it.  Note that we ASSUME that
+            * the input is guaranteed to not have duplications, so we don't
+            * check for that.  Duplications just slow down execution time. */
+           av_push(from_list, newSVpvn_utf8(char_from, from_len, TRUE));
+       }
+
+       /* Here, 'specials_inverse' contains the inverse mapping.  Go through
+        * it looking for cases like the FB05/FB06 examples above.  There would
+        * be an entry in the hash like
+        *       'st' => [ FB05, FB06 ]
+        * In this example we will create two lists that get stored in the
+        * returned hash, 'ret':
+        *       FB05 => [ FB05, FB06 ]
+        *       FB06 => [ FB05, FB06 ]
+        *
+        * Note that there is nothing to do if the array only has one element.
+        * (In the normal 1-1 case handled below, we don't have to worry about
+        * two lists, as everything gets tied to the single list that is
+        * generated for the single character 'to'.  But here, we are omitting
+        * that list, ('st' in the example), so must have multiple lists.)
+        *
+        * NOTE(review): unlike the loop above, hv_iterinit() is not called on
+        * 'specials_inverse' before iterating; presumably this relies on a
+        * freshly created hash starting at its first entry -- confirm */
+       while ((from_list = (AV *) hv_iternextsv(specials_inverse,
+                                                &char_to, &to_len)))
+       {
+           /* av_len() is the highest index, so > 0 means 2 or more elements */
+           if (av_len(from_list) > 0) {
+               int i;
+
+               /* We iterate over all combinations of i,j to place each code
+                * point on each list */
+               for (i = 0; i <= av_len(from_list); i++) {
+                   int j;
+                   AV* i_list = newAV();
+                   SV** entryp = av_fetch(from_list, i, FALSE);
+                   if (entryp == NULL) {
+                       Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
+                   }
+
+                   /* Each 'from' is expected to appear only once, so 'ret'
+                    * must not already have a list keyed by it */
+                   if (hv_fetch(ret, SvPVX(*entryp), SvCUR(*entryp), FALSE)) {
+                       Perl_croak(aTHX_ "panic: unexpected entry for %s", SvPVX(*entryp));
+                   }
+                   if (! hv_store(ret, SvPVX(*entryp), SvCUR(*entryp),
+                                  (SV*) i_list, FALSE))
+                   {
+                       Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
+                   }
+
+                   /* For debugging: UV u = utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/
+                   for (j = 0; j <= av_len(from_list); j++) {
+                       entryp = av_fetch(from_list, j, FALSE);
+                       if (entryp == NULL) {
+                           Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
+                       }
+
+                       /* When i==j this adds itself to the list.  The
+                        * element is stored as a code point (UV), converted
+                        * from the utf8 held in 'from_list' */
+                       av_push(i_list, newSVuv(utf8_to_uvchr(
+                                               (U8*) SvPVX(*entryp), 0)));
+                       /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Adding %"UVXf" to list for %"UVXf"\n", utf8_to_uvchr((U8*) SvPVX(*entryp), 0), u));*/
+                   }
+               }
+           }
+       }
+       SvREFCNT_dec(specials_inverse); /* done with it */
+    } /* End of specials */
+
+    /* read $swash->{LIST} */
+    l = (U8*)SvPV(*listsvp, lcur);
+    lend = l + lcur;
+
+    /* Go through each input line */
+    while (l < lend) {
+       UV min, max, val;
+       UV inverse;
+       l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
+                                        cBOOL(octets), typestr);
+
+       /* NOTE(review): presumably S_swash_scan_list_line() returns a pointer
+        * beyond 'lend' when there is no further usable line -- confirm */
+       if (l > lend) {
+           break;
+       }
+
+       /* Each element in the range is to be inverted */
+       for (inverse = min; inverse <= max; inverse++) {
+           AV* list;
+           SV** listp;
+           IV i;
+           bool found_key = FALSE;
+           bool found_inverse = FALSE;
+
+           /* The key is the inverse mapping */
+           char key[UTF8_MAXBYTES+1];
+           char* key_end = (char *) uvuni_to_utf8((U8*) key, val);
+           STRLEN key_len = key_end - key;
+
+           /* Get the list for the map */
+           if ((listp = hv_fetch(ret, key, key_len, FALSE))) {
+               list = (AV*) *listp;
+           }
+           else { /* No entry yet for it: create one */
+               list = newAV();
+               if (! hv_store(ret, key, key_len, (SV*) list, FALSE)) {
+                   Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
+               }
+           }
+
+           /* Look through list to see if this inverse mapping already is
+            * listed, or if there is a mapping to itself already */
+           for (i = 0; i <= av_len(list); i++) {
+               SV** entryp = av_fetch(list, i, FALSE);
+               SV* entry;
+               if (entryp == NULL) {
+                   Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
+               }
+               entry = *entryp;
+               /*DEBUG_U(PerlIO_printf(Perl_debug_log, "list for %"UVXf" contains %"UVXf"\n", val, SvUV(entry)));*/
+               if (SvUV(entry) == val) {
+                   found_key = TRUE;
+               }
+               if (SvUV(entry) == inverse) {
+                   found_inverse = TRUE;
+               }
+
+               /* No need to continue searching if found everything we are
+                * looking for */
+               if (found_key && found_inverse) {
+                   break;
+               }
+           }
+
+           /* Make sure there is a mapping to itself on the list */
+           if (! found_key) {
+               av_push(list, newSVuv(val));
+               /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Adding %"UVXf" to list for %"UVXf"\n", val, val));*/
+           }
+
+
+           /* Simply add the value to the list */
+           if (! found_inverse) {
+               av_push(list, newSVuv(inverse));
+               /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Adding %"UVXf" to list for %"UVXf"\n", inverse, val));*/
+           }
+
+           /* swash_get() increments the value of val for each element in the
+            * range.  That makes more compact tables possible.  You can
+            * express the capitalization, for example, of all consecutive
+            * letters with a single line: 0061\t007A\t0041 This maps 0061 to
+            * 0041, 0062 to 0042, etc.  I (khw) have never understood 'none',
+            * and it's not documented; it appears to be used only in
+            * implementing tr//; I copied the semantics from swash_get(), just
+            * in case */
+           if (!none || val < none) {
+               ++val;
+           }
+       }
+    }
+
+    return ret;
+}
+
+SV*
+Perl__swash_to_invlist(pTHX_ SV* const swash)
+{
+
+   /* Subject to change or removal.  For use only in one place in regcomp.c
+    *
+    * Returns a new inversion list built from the swash 'swash':
+    *  1) each line of the swash's LIST is appended as a range;
+    *  2) the result is inverted if the swash has a true INVERT_IT entry;
+    *  3) each line of EXTRAS starting with '+', '!', '-' or '&' names another
+    *     swash stored in this one; it is converted by a recursive call, then
+    *     combined into the result and freed.
+    *
+    * Croaks (panics) if an EXTRAS line is processed and BITS of either swash
+    * is not 1, i.e. only boolean properties are handled. */
+
+    U8 *l, *lend;
+    char *loc;
+    STRLEN lcur;
+    HV *const hv = MUTABLE_HV(SvRV(swash));
+    UV elements = 0;    /* Number of elements in the inversion list */
+    U8 empty[] = "";
+
+    /* The string containing the main body of the table */
+    SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
+    SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
+    SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
+    SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
+    SV** const invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
+
+    const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
+    const STRLEN bits  = SvUV(*bitssvp);
+    const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
+    U8 *x, *xend;
+    STRLEN xcur;
+
+    SV* invlist;
+
+    PERL_ARGS_ASSERT__SWASH_TO_INVLIST;
+
+    /* read $swash->{LIST} */
+    if (SvPOK(*listsvp)) {
+       l = (U8*)SvPV(*listsvp, lcur);
+    }
+    else {
+       /* LIST legitimately doesn't contain a string during compilation phases
+        * of Perl itself, before the Unicode tables are generated.  In this
+        * case, just fake things up by creating an empty list */
+       l = empty;
+       lcur = 0;
+    }
+    loc = (char *) l;
+    lend = l + lcur;
+
+    /* Scan the input to count the number of lines to preallocate array size
+     * based on worst possible case, which is each line in the input creates 2
+     * elements in the inversion list: 1) the beginning of a range in the list;
+     * 2) the beginning of a range not in the list.  */
+    while ((loc = (strchr(loc, '\n'))) != NULL) {
+       elements += 2;
+       loc++;
+    }
+
+    /* If the ending is somehow corrupt and isn't a new line, add another
+     * element for the final range that isn't in the inversion list.  An empty
+     * list is handled explicitly: there is no byte before 'lend' to look at
+     * (reading *(lend - 1) would be outside the buffer), and it trivially
+     * doesn't end in a new line */
+    if (lcur == 0
+       || ! (*lend == '\n' || (*lend == '\0' && *(lend - 1) == '\n')))
+    {
+       elements++;
+    }
+
+    invlist = _new_invlist(elements);
+
+    /* Now go through the input again, adding each range to the list */
+    while (l < lend) {
+       UV start, end;
+       UV val;         /* Not used by this function */
+
+       l = S_swash_scan_list_line(aTHX_ l, lend, &start, &end, &val,
+                                        cBOOL(octets), typestr);
+
+       /* NOTE(review): presumably S_swash_scan_list_line() returns a pointer
+        * beyond 'lend' when there is no further usable line -- confirm */
+       if (l > lend) {
+           break;
+       }
+
+       _append_range_to_invlist(invlist, start, end);
+    }
+
+    /* Invert if the data says it should be */
+    if (invert_it_svp && SvUV(*invert_it_svp)) {
+       _invlist_invert_prop(invlist);
+    }
+
+    /* This code is copied from swash_get()
+     * read $swash->{EXTRAS} */
+    x = (U8*)SvPV(*extssvp, xcur);
+    xend = x + xcur;
+    while (x < xend) {
+       STRLEN namelen;
+       U8 *namestr;
+       SV** othersvp;
+       HV* otherhv;
+       STRLEN otherbits;
+       SV **otherbitssvp, *other;
+       U8 *nl;
+
+       /* The first byte of each line is the operation to perform */
+       const U8 opc = *x++;
+       if (opc == '\n')
+           continue;
+
+       nl = (U8*)memchr(x, '\n', xend - x);
+
+       /* Skip lines that don't start with a known operation */
+       if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
+           if (nl) {
+               x = nl + 1; /* 1 is length of "\n" */
+               continue;
+           }
+           else {
+               x = xend; /* to EXTRAS' end at which \n is not found */
+               break;
+           }
+       }
+
+       /* The rest of the line is the key in 'hv' of the other swash */
+       namestr = x;
+       if (nl) {
+           namelen = nl - namestr;
+           x = nl + 1;
+       }
+       else {
+           namelen = xend - namestr;
+           x = xend;
+       }
+
+       /* NOTE(review): the results of hv_fetch()/hv_fetchs() are dereferenced
+        * without a NULL check, as in swash_get(); presumably the named swash
+        * and its BITS are guaranteed to exist -- confirm */
+       othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
+       otherhv = MUTABLE_HV(SvRV(*othersvp));
+       otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
+       otherbits = (STRLEN)SvUV(*otherbitssvp);
+
+       if (bits != otherbits || bits != 1) {
+           Perl_croak(aTHX_ "panic: _swash_to_invlist only operates on boolean properties");
+       }
+
+       /* The "other" swatch must be destroyed after. */
+       other = _swash_to_invlist((SV *)*othersvp);
+
+       /* End of code copied from swash_get() */
+       switch (opc) {
+       case '+':
+           _invlist_union(invlist, other, &invlist);
+           break;
+       case '!':
+           _invlist_invert(other);
+           _invlist_union(invlist, other, &invlist);
+           break;
+       case '-':
+           _invlist_subtract(invlist, other, &invlist);
+           break;
+       case '&':
+           _invlist_intersection(invlist, other, &invlist);
+           break;
+       default:
+           break;
+       }
+       sv_free(other); /* through with it! */
+    }
+
+    return invlist;
+}
+
  /*
  =for apidoc uvchr_to_utf8
  
  /*
  =for apidoc uvchr_to_utf8
  
-Adds the UTF-8 representation of the Native codepoint C<uv> to the end
+Adds the UTF-8 representation of the Native code point C<uv> to the end
  of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
  bytes available. The return value is the pointer to the byte after the
  end of the new character. In other words,
  of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
  bytes available. The return value is the pointer to the byte after the
  end of the new character. In other words,
@@ -2222,14 +3127,13 @@ Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  
  /*
  =for apidoc utf8n_to_uvchr
  
  /*
  =for apidoc utf8n_to_uvchr
-flags
  
  
-Returns the native character value of the first character in the string 
+Returns the native character value of the first character in the string
  C<s>
  which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
  length, in bytes, of that character.
  
  C<s>
  which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
  length, in bytes, of that character.
  
-Allows length and flags to be passed to low level routine.
+length and flags are the same as utf8n_to_uvuni().
  
  =cut
  */
  
  =cut
  */
@@ -2237,7 +3141,7 @@ Allows length and flags to be passed to low level routine.
     a real function in case XS code wants it
  */
  UV
     a real function in case XS code wants it
  */
  UV
-Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, 
+Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
  U32 flags)
  {
      const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
  U32 flags)
  {
      const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
@@ -2247,6 +3151,59 @@ U32 flags)
      return UNI_TO_NATIVE(uv);
  }
  
      return UNI_TO_NATIVE(uv);
  }
  
+bool
+Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len)
+{
+    /* May change: warns if surrogates, non-character code points, or
+     * non-Unicode code points are in s, which has length len bytes.
+     *
+     * Returns FALSE if the string ends in the middle of a character, or if
+     * one of the above warnings was actually raised (that is, its warning
+     * category is enabled); TRUE otherwise.  The only other validity check is
+     * to make sure that no character exceeds the string's length */
+
+    const U8* const e = s + len;
+    bool ok = TRUE;
+
+    PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
+
+    while (s < e) {
+
+       /* Compare against the bytes remaining from s, not the total length:
+        * otherwise a character truncated at the end of a string with earlier
+        * characters would go undetected and be read beyond e */
+       if (UTF8SKIP(s) > (STRLEN)(e - s)) {
+           Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
+                          "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
+           return FALSE;
+       }
+
+       /* Only characters whose first byte is at least this can be any of the
+        * problematic kinds, so most input skips the tests below */
+       if (*s >= UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE) {
+           STRLEN char_len;
+           if (UTF8_IS_SUPER(s)) {
+               if (ckWARN_d(WARN_NON_UNICODE)) {
+                   UV uv = utf8_to_uvchr(s, &char_len);
+                   Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
+                       "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
+                   ok = FALSE;
+               }
+           }
+           else if (UTF8_IS_SURROGATE(s)) {
+               if (ckWARN_d(WARN_SURROGATE)) {
+                   UV uv = utf8_to_uvchr(s, &char_len);
+                   Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
+                       "Unicode surrogate U+%04"UVXf" is illegal in UTF-8", uv);
+                   ok = FALSE;
+               }
+           }
+           else if
+               ((UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s))
+                && (ckWARN_d(WARN_NONCHAR)))
+           {
+               UV uv = utf8_to_uvchr(s, &char_len);
+               Perl_warner(aTHX_ packWARN(WARN_NONCHAR),
+                   "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv);
+               ok = FALSE;
+           }
+       }
+       s += UTF8SKIP(s);
+    }
+
+    return ok;
+}
+
  /*
  =for apidoc pv_uni_display
  
  /*
  =for apidoc pv_uni_display
  
@@ -2322,7 +3279,7 @@ Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV f
      }
      if (truncated)
          sv_catpvs(dsv, "...");
      }
      if (truncated)
          sv_catpvs(dsv, "...");
-    
+
      return SvPVX(dsv);
  }
  
      return SvPVX(dsv);
  }
  
@@ -2349,124 +3306,259 @@ Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
  }
  
  /*
  }
  
  /*
-=for apidoc ibcmp_utf8
-
-Return true if the strings s1 and s2 differ case-insensitively, false
-if not (if they are equal case-insensitively).  If u1 is true, the
-string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
-the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
-are false, the respective string is assumed to be in native 8-bit
-encoding.
-
-If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
-in there (they will point at the beginning of the I<next> character).
-If the pointers behind pe1 or pe2 are non-NULL, they are the end
-pointers beyond which scanning will not continue under any
-circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
-s2+l2 will be used as goal end pointers that will also stop the scan,
-and which qualify towards defining a successful match: all the scans
-that define an explicit length must reach their goal pointers for
-a match to succeed).
+=for apidoc foldEQ_utf8
+
+Returns true if the leading portions of the strings s1 and s2 (either or both
+of which may be in UTF-8) are the same case-insensitively; false otherwise.
+How far into the strings to compare is determined by other input parameters.
+
+If u1 is true, the string s1 is assumed to be in UTF-8-encoded Unicode;
+otherwise it is assumed to be in native 8-bit encoding.  Correspondingly for u2
+with respect to s2.
+
+If the byte length l1 is non-zero, it says how far into s1 to check for fold
+equality.  In other words, s1+l1 will be used as a goal to reach.  The
+scan will not be considered to be a match unless the goal is reached, and
+scanning won't continue past that goal.  Correspondingly for l2 with respect to
+s2.
+
+If pe1 is non-NULL and the pointer it points to is not NULL, that pointer is
+considered an end pointer beyond which scanning of s1 will not continue under
+any circumstances.  This means that if both l1 and pe1 are specified, and pe1
+is less than s1+l1, the match will never be successful because it can never
+get as far as its goal (and in fact is asserted against).  Correspondingly for
+pe2 with respect to s2.
+
+At least one of s1 and s2 must have a goal (at least one of l1 and l2 must be
+non-zero), and if both do, both have to be
+reached for a successful match.   Also, if the fold of a character is multiple
+characters, all of them must be matched (see tr21 reference below for
+'folding').
+
+Upon a successful match, if pe1 is non-NULL,
+it will be set to point to the beginning of the I<next> character of s1 beyond
+what was matched.  Correspondingly for pe2 and s2.
  
  For case-insensitiveness, the "casefolding" of Unicode is used
  instead of upper/lowercasing both the characters, see
  http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
  
  =cut */
  
  For case-insensitiveness, the "casefolding" of Unicode is used
  instead of upper/lowercasing both the characters, see
  http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
  
  =cut */
+
+/* A flags parameter has been added which may change, and hence isn't
+ * externally documented.  Currently it is:
+ *  0 for as-documented above
+ *  FOLDEQ_UTF8_NOMIX_ASCII meaning that if a non-ASCII character folds to an
+ *                         ASCII one, the two are not considered to match
+ *  FOLDEQ_UTF8_LOCALE     meaning that locale rules are to be used for code
+ *                         points below 256; unicode rules for above 255; and
+ *                         folds that cross those boundaries are disallowed,
+ *                         like the NOMIX_ASCII option
+ */
  I32
  I32
-Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
-{
-     dVAR;
-     register const U8 *p1  = (const U8*)s1;
-     register const U8 *p2  = (const U8*)s2;
-     register const U8 *f1 = NULL;
-     register const U8 *f2 = NULL;
-     register U8 *e1 = NULL;
-     register U8 *q1 = NULL;
-     register U8 *e2 = NULL;
-     register U8 *q2 = NULL;
-     STRLEN n1 = 0, n2 = 0;
-     U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
-     U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
-     U8 natbuf[1+1];
-     STRLEN foldlen1, foldlen2;
-     bool match;
-
-     PERL_ARGS_ASSERT_IBCMP_UTF8;
-     
-     if (pe1)
-         e1 = *(U8**)pe1;
-     /* assert(e1 || l1); */
-     if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
-         f1 = (const U8*)s1 + l1;
-     if (pe2)
-         e2 = *(U8**)pe2;
-     /* assert(e2 || l2); */
-     if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
-         f2 = (const U8*)s2 + l2;
-
-     /* This shouldn't happen. However, putting an assert() there makes some
-      * tests fail. */
-     /* assert((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)); */
-     if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
-         return 1; /* mismatch; possible infinite loop or false positive */
-
-     if (!u1 || !u2)
-         natbuf[1] = 0; /* Need to terminate the buffer. */
-
-     while ((e1 == 0 || p1 < e1) &&
-           (f1 == 0 || p1 < f1) &&
-           (e2 == 0 || p2 < e2) &&
-           (f2 == 0 || p2 < f2)) {
-         if (n1 == 0) {
-              if (u1)
-                   to_utf8_fold(p1, foldbuf1, &foldlen1);
-              else {
-                   uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
-                   to_utf8_fold(natbuf, foldbuf1, &foldlen1);
-              }
-              q1 = foldbuf1;
-              n1 = foldlen1;
-         }
-         if (n2 == 0) {
-              if (u2)
-                   to_utf8_fold(p2, foldbuf2, &foldlen2);
-              else {
-                   uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
-                   to_utf8_fold(natbuf, foldbuf2, &foldlen2);
-              }
-              q2 = foldbuf2;
-              n2 = foldlen2;
-         }
-         while (n1 && n2) {
-              if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
-                  (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
-                   memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
-                  return 1; /* mismatch */
-              n1 -= UTF8SKIP(q1);
-              q1 += UTF8SKIP(q1);
-              n2 -= UTF8SKIP(q2);
-              q2 += UTF8SKIP(q2);
-         }
-         if (n1 == 0)
-              p1 += u1 ? UTF8SKIP(p1) : 1;
-         if (n2 == 0)
-              p2 += u2 ? UTF8SKIP(p2) : 1;
-
-     }
-
-     /* A match is defined by all the scans that specified
-      * an explicit length reaching their final goals. */
-     match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
-
-     if (match) {
-         if (pe1)
-              *pe1 = (char*)p1;
-         if (pe2)
-              *pe2 = (char*)p2;
-     }
-
-     return match ? 0 : 1; /* 0 match, 1 mismatch */
+Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2, U32 flags)
+{
+    dVAR;
+    register const U8 *p1  = (const U8*)s1; /* Point to current char */
+    register const U8 *p2  = (const U8*)s2;
+    register const U8 *g1 = NULL;       /* goal for s1 */
+    register const U8 *g2 = NULL;
+    register const U8 *e1 = NULL;       /* Don't scan s1 past this */
+    register U8 *f1 = NULL;             /* Point to current folded */
+    register const U8 *e2 = NULL;
+    register U8 *f2 = NULL;
+    STRLEN n1 = 0, n2 = 0;              /* Number of bytes in current char */
+    U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
+    U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
+    U8 natbuf[2];               /* Holds native 8-bit char converted to utf8;
+                                   these always fit in 2 bytes */
+
+    PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
+
+    if (pe1) {
+        e1 = *(U8**)pe1;
+    }
+
+    if (l1) {
+        g1 = (const U8*)s1 + l1;
+    }
+
+    if (pe2) {
+        e2 = *(U8**)pe2;
+    }
+
+    if (l2) {
+        g2 = (const U8*)s2 + l2;
+    }
+
+    /* Must have at least one goal */
+    assert(g1 || g2);
+
+    if (g1) {
+
+        /* Will never match if goal is out-of-bounds */
+        assert(! e1  || e1 >= g1);
+
+        /* Here, there isn't an end pointer, or it is beyond the goal.  We
+        * only go as far as the goal */
+        e1 = g1;
+    }
+    else {
+       assert(e1);    /* Must have an end for looking at s1 */
+    }
+
+    /* Same for goal for s2 */
+    if (g2) {
+        assert(! e2  || e2 >= g2);
+        e2 = g2;
+    }
+    else {
+       assert(e2);
+    }
+
+    /* Look through both strings, a character at a time */
+    while (p1 < e1 && p2 < e2) {
+
+        /* If at the beginning of a new character in s1, get its fold to use
+        * and the length of the fold.  (exception: locale rules just get the
+        * character to a single byte) */
+        if (n1 == 0) {
+
+           /* If in locale matching, we use two sets of rules, depending on if
+            * the code point is above or below 255.  Here, we test for and
+            * handle locale rules */
+           if ((flags & FOLDEQ_UTF8_LOCALE)
+               && (! u1 || UTF8_IS_INVARIANT(*p1) || UTF8_IS_DOWNGRADEABLE_START(*p1)))
+           {
+               /* There is no mixing of code points above and below 255. */
+               if (u2 && (! UTF8_IS_INVARIANT(*p2)
+                   && ! UTF8_IS_DOWNGRADEABLE_START(*p2)))
+               {
+                   return 0;
+               }
+
+               /* We handle locale rules by converting, if necessary, the code
+                * point to a single byte. */
+               if (! u1 || UTF8_IS_INVARIANT(*p1)) {
+                   *foldbuf1 = *p1;
+               }
+               else {
+                   *foldbuf1 = TWO_BYTE_UTF8_TO_UNI(*p1, *(p1 + 1));
+               }
+               n1 = 1;
+           }
+           else if (isASCII(*p1)) {    /* Note, that here won't be both ASCII
+                                          and using locale rules */
+
+               /* If trying to mix non- with ASCII, and not supposed to, fail */
+               if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) {
+                   return 0;
+               }
+               n1 = 1;
+               *foldbuf1 = toLOWER(*p1);   /* Folds in the ASCII range are
+                                              just lowercased */
+           }
+           else if (u1) {
+                to_utf8_fold(p1, foldbuf1, &n1);
+            }
+            else {  /* Not utf8, convert to it first and then get fold */
+                uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
+                to_utf8_fold(natbuf, foldbuf1, &n1);
+            }
+            f1 = foldbuf1;
+        }
+
+        if (n2 == 0) {    /* Same for s2 */
+           if ((flags & FOLDEQ_UTF8_LOCALE)
+               && (! u2 || UTF8_IS_INVARIANT(*p2) || UTF8_IS_DOWNGRADEABLE_START(*p2)))
+           {
+               /* Here, the next char in s2 is < 256.  We've already worked on
+                * s1, and if it isn't also < 256, can't match */
+               if (u1 && (! UTF8_IS_INVARIANT(*p1)
+                   && ! UTF8_IS_DOWNGRADEABLE_START(*p1)))
+               {
+                   return 0;
+               }
+               if (! u2 || UTF8_IS_INVARIANT(*p2)) {
+                   *foldbuf2 = *p2;
+               }
+               else {
+                   *foldbuf2 = TWO_BYTE_UTF8_TO_UNI(*p2, *(p2 + 1));
+               }
+
+               /* Use another function to handle locale rules.  We've made
+                * sure that both characters to compare are single bytes */
+               if (! foldEQ_locale((char *) f1, (char *) foldbuf2, 1)) {
+                   return 0;
+               }
+               n1 = n2 = 0;
+           }
+           else if (isASCII(*p2)) {
+               if (flags && ! isASCII(*p1)) {
+                   return 0;
+               }
+               n2 = 1;
+               *foldbuf2 = toLOWER(*p2);
+           }
+           else if (u2) {
+                to_utf8_fold(p2, foldbuf2, &n2);
+            }
+            else {
+                uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
+                to_utf8_fold(natbuf, foldbuf2, &n2);
+            }
+            f2 = foldbuf2;
+        }
+
+       /* Here f1 and f2 point to the beginning of the strings to compare.
+        * These strings are the folds of the input characters, stored in utf8.
+        */
+
+        /* While there is more to look for in both folds, see if they
+        * continue to match */
+        while (n1 && n2) {
+            U8 fold_length = UTF8SKIP(f1);
+            if (fold_length != UTF8SKIP(f2)
+                || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE
+                                                       function call for single
+                                                       character */
+                || memNE((char*)f1, (char*)f2, fold_length))
+            {
+                return 0; /* mismatch */
+            }
+
+            /* Here, they matched, advance past them */
+            n1 -= fold_length;
+            f1 += fold_length;
+            n2 -= fold_length;
+            f2 += fold_length;
+        }
+
+        /* On reaching the end of a fold, advance its input past it */
+        if (n1 == 0) {
+            p1 += u1 ? UTF8SKIP(p1) : 1;
+        }
+        if (n2 == 0) {
+            p2 += u2 ? UTF8SKIP(p2) : 1;
+        }
+    } /* End of loop through both strings */
+
+    /* A match is defined by each scan that specified an explicit length
+    * reaching its final goal, and the other not having matched a partial
+    * character (which can happen when the fold of a character is more than one
+    * character). */
+    if (! ((g1 == 0 || p1 == g1) && (g2 == 0 || p2 == g2)) || n1 || n2) {
+        return 0;
+    }
+
+    /* Successful match.  Set output pointers */
+    if (pe1) {
+        *pe1 = (char*)p1;
+    }
+    if (pe2) {
+        *pe2 = (char*)p2;
+    }
+    return 1;
  }
  
  /*
  }
  
  /*