[perl #78494] Pipes cause threads to hang on join()

[perl5.git] / utf8.c
diff --git a/utf8.c b/utf8.c

index 8fd5db9..b5d8531 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -42,7 +42,7 @@ PERL_CALLCONV U8*     Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
  static const char unees[] =
      "Malformed UTF-8 character (unexpected end of string)";
  
-/* 
+/*
  =head1 Unicode Support
  
  This file contains various utility functions for manipulating UTF8-encoded
@@ -62,6 +62,8 @@ or not the string is encoded in UTF-8 (or UTF-EBCDIC on EBCDIC machines).  That
  is, if they are invariant.  On ASCII-ish machines, only ASCII characters
  fit this definition, hence the function's name.
  
+If C<len> is 0, it will be calculated using C<strlen(s)>.  
+
  See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
  
  =cut
@@ -86,8 +88,8 @@ Perl_is_ascii_string(const U8 *s, STRLEN len)
  /*
  =for apidoc uvuni_to_utf8_flags
  
-Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
-of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
+Adds the UTF-8 representation of the code point C<uv> to the end
+of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
  bytes available. The return value is the pointer to the byte after the
  end of the new character. In other words,
  
@@ -101,10 +103,31 @@ or, in most cases,
  
      d = uvuni_to_utf8_flags(d, uv, 0);
  
-is the recommended Unicode-aware way of saying
+This is the recommended Unicode-aware way of saying
  
      *(d++) = uv;
  
+This function will convert to UTF-8 (and not warn) even code points that aren't
+legal Unicode or are problematic, unless C<flags> contains one or more of the
+following flags.
+If C<uv> is a Unicode surrogate code point and UNICODE_WARN_SURROGATE is set,
+the function will raise a warning, provided UTF8 warnings are enabled.  If instead
+UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL.
+If both flags are set, the function will both warn and return NULL.
+
+The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags correspondingly
+affect how the function handles a Unicode non-character.  And, likewise for the
+UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags, and code points that are
+above the Unicode maximum of 0x10FFFF.  Code points above 0x7FFF_FFFF (which are
+even less portable) can be warned and/or disallowed even if other above-Unicode
+code points are accepted, by the UNICODE_WARN_FE_FF and UNICODE_DISALLOW_FE_FF
+flags.
+
+And finally, the flag UNICODE_WARN_ILLEGAL_INTERCHANGE selects all four of the
+above WARN flags; and UNICODE_DISALLOW_ILLEGAL_INTERCHANGE selects all four
+DISALLOW flags.
+
+
  =cut
  */
  
@@ -113,23 +136,39 @@ Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  {
      PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
  
-    if (ckWARN(WARN_UTF8)) {
-        if (UNICODE_IS_SURROGATE(uv) &&
-            !(flags & UNICODE_ALLOW_SURROGATE))
-             Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
-        else if (
-                 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
-                   !(flags & UNICODE_ALLOW_FDD0))
-                  ||
-                  ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
-                   !(flags & UNICODE_ALLOW_FFFF))) &&
-                 /* UNICODE_ALLOW_SUPER includes
-                  * FFFEs and FFFFs beyond 0x10FFFF. */
-                 ((uv <= PERL_UNICODE_MAX) ||
-                  !(flags & UNICODE_ALLOW_SUPER))
-                 )
-             Perl_warner(aTHX_ packWARN(WARN_UTF8),
-                     "Unicode non-character 0x%04"UVxf" is illegal for interchange", uv);
+    if (ckWARN_d(WARN_UTF8)) {
+       if (UNICODE_IS_SURROGATE(uv)) {
+           if (flags & UNICODE_WARN_SURROGATE) {
+               Perl_warner(aTHX_ packWARN(WARN_UTF8),
+                                           "UTF-16 surrogate U+%04"UVXf, uv);
+           }
+           if (flags & UNICODE_DISALLOW_SURROGATE) {
+               return NULL;
+           }
+       }
+       else if (UNICODE_IS_SUPER(uv)) {
+           if (flags & UNICODE_WARN_SUPER
+               || (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_WARN_FE_FF)))
+           {
+               Perl_warner(aTHX_ packWARN(WARN_UTF8),
+                         "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
+           }
+           if (flags & UNICODE_DISALLOW_SUPER
+               || (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_DISALLOW_FE_FF)))
+           {
+               return NULL;
+           }
+       }
+       else if (UNICODE_IS_NONCHAR(uv)) {
+           if (flags & UNICODE_WARN_NONCHAR) {
+               Perl_warner(aTHX_ packWARN(WARN_UTF8),
+                "Unicode non-character U+%04"UVXf" is illegal for open interchange",
+                uv);
+           }
+           if (flags & UNICODE_DISALLOW_NONCHAR) {
+               return NULL;
+           }
+       }
      }
      if (UNI_IS_INVARIANT(uv)) {
         *d++ = (U8)UTF_TO_NATIVE(uv);
@@ -264,7 +303,7 @@ S_is_utf8_char_slow(const U8 *s, const STRLEN len)
         if (!UTF8_IS_CONTINUATION(*s))
             return 0;
         uv = UTF8_ACCUMULATE(uv, *s);
-       if (uv < ouv) 
+       if (uv < ouv)
             return 0;
         ouv = uv;
         s++;
@@ -303,9 +342,10 @@ Perl_is_utf8_char(const U8 *s)
  =for apidoc is_utf8_string
  
  Returns true if first C<len> bytes of the given string form a valid
-UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
-not mean 'a string that contains code points above 0x7F encoded in UTF-8'
-because a valid ASCII string is a valid UTF-8 string.
+UTF-8 string, false otherwise.  If C<len> is 0, it will be calculated
+using C<strlen(s)>.  Note that 'a valid UTF-8 string' does not mean 'a
+string that contains code points above 0x7F encoded in UTF-8' because a
+valid ASCII string is a valid UTF-8 string.
  
  See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
  
@@ -425,20 +465,62 @@ Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
  =for apidoc utf8n_to_uvuni
  
  Bottom level UTF-8 decode routine.
-Returns the Unicode code point value of the first character in the string C<s>
-which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
-C<retlen> will be set to the length, in bytes, of that character.
-
-If C<s> does not point to a well-formed UTF-8 character, the behaviour
-is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
-it is assumed that the caller will raise a warning, and this function
-will silently just set C<retlen> to C<-1> and return zero.  If the
-C<flags> does not contain UTF8_CHECK_ONLY, warnings about
-malformations will be given, C<retlen> will be set to the expected
-length of the UTF-8 character in bytes, and zero will be returned.
-
-The C<flags> can also contain various flags to allow deviations from
-the strict UTF-8 encoding (see F<utf8.h>).
+Returns the code point value of the first character in the string C<s>
+which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding and no longer than
+C<curlen> bytes; C<retlen> will be set to the length, in bytes, of that
+character.
+
+The value of C<flags> determines the behavior when C<s> does not point to a
+well-formed UTF-8 character.  If C<flags> is 0, when a malformation is found,
+C<retlen> is set to the expected length of the UTF-8 character in bytes, zero
+is returned, and if UTF-8 warnings haven't been lexically disabled, a warning
+is raised.
+
+Various ALLOW flags can be set in C<flags> to allow (and not warn on)
+individual types of malformations, such as the sequence being overlong (that
+is, when there is a shorter sequence that can express the same code point;
+overlong sequences are expressly forbidden in the UTF-8 standard due to
+potential security issues).  Another malformation example is the first byte of
+a character not being a legal first byte.  See F<utf8.h> for the list of such
+flags.  Of course, the value returned by this function under such conditions is
+not reliable.
+
+The UTF8_CHECK_ONLY flag overrides the behavior when a non-allowed (by other
+flags) malformation is found.  If this flag is set, the routine assumes that
+the caller will raise a warning, and this function will silently just set
+C<retlen> to C<-1> and return zero.
+
+Certain code points are considered problematic.  These are Unicode surrogates,
+Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
+By default these are considered regular code points, but certain situations
+warrant special handling for them.  If C<flags> contains
+UTF8_DISALLOW_ILLEGAL_INTERCHANGE, all three classes are treated as
+malformations and handled as such.  The flags UTF8_DISALLOW_SURROGATE,
+UTF8_DISALLOW_NONCHAR, and UTF8_DISALLOW_SUPER (meaning above the legal Unicode
+maximum) can be set to disallow these categories individually.
+
+The flags UTF8_WARN_ILLEGAL_INTERCHANGE, UTF8_WARN_SURROGATE,
+UTF8_WARN_NONCHAR, and UTF8_WARN_SUPER will cause warning messages to be raised
+for their respective categories, but otherwise the code points are considered
+valid (not malformations).  To get a category to both be treated as a
+malformation and raise a warning, specify both the WARN and DISALLOW flags.
+(But note that warnings are not raised if lexically disabled nor if
+UTF8_CHECK_ONLY is also specified.)
+
+Very large code points (above 0x7FFF_FFFF) are considered more problematic than
+the others that are above the Unicode legal maximum.  There are several
+reasons, one of which is that the original UTF-8 specification never went above
+this number (the current 0x10FFFF limit was imposed later).  The UTF-8 encoding
+on ASCII platforms for these large code points begins with a byte containing
+0xFE or 0xFF.  The UTF8_DISALLOW_FE_FF flag will cause them to be treated as
+malformations, while allowing smaller above-Unicode code points.  (Of course
+UTF8_DISALLOW_SUPER will treat all above-Unicode code points, including these,
+as malformations.) Similarly, UTF8_WARN_FE_FF acts just like the other WARN
+flags, but applies just to these code points.
+
+All other code points corresponding to Unicode characters, including private
+use and those yet to be assigned, are never considered malformed and never
+warn.
  
  Most code should use utf8_to_uvchr() rather than call this directly.
  
@@ -452,25 +534,22 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
      const U8 * const s0 = s;
      UV uv = *s, ouv = 0;
      STRLEN len = 1;
-    const bool dowarn = ckWARN_d(WARN_UTF8);
+    bool dowarn = ckWARN_d(WARN_UTF8);
      const UV startbyte = *s;
      STRLEN expectlen = 0;
      U32 warning = 0;
-    SV* sv;
+    SV* sv = NULL;
  
      PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
  
-/* This list is a superset of the UTF8_ALLOW_XXX.  BUT it isn't, eg SUPER missing XXX */
+/* This list is a superset of the UTF8_ALLOW_XXX. */
  
  #define UTF8_WARN_EMPTY                                 1
  #define UTF8_WARN_CONTINUATION                  2
  #define UTF8_WARN_NON_CONTINUATION              3
-#define UTF8_WARN_FE_FF                                 4
-#define UTF8_WARN_SHORT                                 5
-#define UTF8_WARN_OVERFLOW                      6
-#define UTF8_WARN_SURROGATE                     7
-#define UTF8_WARN_LONG                          8
-#define UTF8_WARN_FFFF                          9 /* Also FFFE. */
+#define UTF8_WARN_SHORT                                 4
+#define UTF8_WARN_OVERFLOW                      5
+#define UTF8_WARN_LONG                          6
  
      if (curlen == 0 &&
         !(flags & UTF8_ALLOW_EMPTY)) {
@@ -499,10 +578,14 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
  #ifdef EBCDIC
      uv = NATIVE_TO_UTF(uv);
  #else
-    if ((uv == 0xfe || uv == 0xff) &&
-       !(flags & UTF8_ALLOW_FE_FF)) {
-       warning = UTF8_WARN_FE_FF;
-       goto malformed;
+    if (uv == 0xfe || uv == 0xff) {
+       if (flags & (UTF8_WARN_SUPER|UTF8_WARN_FE_FF)) {
+           sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point beginning with byte 0x%02"UVXf" is not Unicode, and not portable", uv));
+           flags &= ~UTF8_WARN_SUPER;  /* Only warn once on this problem */
+       }
+       if (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_FE_FF)) {
+           goto malformed;
+       }
      }
  #endif
  
@@ -532,7 +615,7 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
  
      len--;
      s++;
-    ouv = uv;
+    ouv = uv;  /* ouv is the value from the previous iteration */
  
      while (len--) {
         if (!UTF8_IS_CONTINUATION(*s) &&
@@ -543,7 +626,8 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
         }
         else
             uv = UTF8_ACCUMULATE(uv, *s);
-       if (!(uv > ouv)) {
+       if (!(uv > ouv)) {  /* If the value didn't grow from the previous
+                              iteration, something is horribly wrong */
             /* These cannot be allowed. */
             if (uv == ouv) {
                 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
@@ -561,22 +645,47 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
         ouv = uv;
      }
  
-    if (UNICODE_IS_SURROGATE(uv) &&
-       !(flags & UTF8_ALLOW_SURROGATE)) {
-       warning = UTF8_WARN_SURROGATE;
-       goto malformed;
-    } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
-              !(flags & UTF8_ALLOW_LONG)) {
+    if ((expectlen > (STRLEN)UNISKIP(uv)) && !(flags & UTF8_ALLOW_LONG)) {
         warning = UTF8_WARN_LONG;
         goto malformed;
-    } else if (UNICODE_IS_ILLEGAL(uv) &&
-              !(flags & UTF8_ALLOW_FFFF)) {
-       warning = UTF8_WARN_FFFF;
-       goto malformed;
+    } else if (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE|UTF8_WARN_ILLEGAL_INTERCHANGE)) {
+       if (UNICODE_IS_SURROGATE(uv)) {
+           if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE) {
+               sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv));
+           }
+           if (flags & UTF8_DISALLOW_SURROGATE) {
+               goto disallowed;
+           }
+       }
+       else if (UNICODE_IS_NONCHAR(uv)) {
+           if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR ) {
+               sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv));
+           }
+           if (flags & UTF8_DISALLOW_NONCHAR) {
+               goto disallowed;
+           }
+       }
+       else if ((uv > PERL_UNICODE_MAX)) {
+           if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER) {
+               sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv));
+           }
+           if (flags & UTF8_DISALLOW_SUPER) {
+               goto disallowed;
+           }
+       }
+
+       /* Here, this is not considered a malformed character, so drop through
+        * to return it */
      }
  
      return uv;
  
+disallowed: /* Is disallowed, but otherwise not malformed.  'sv' will have been
+              set if there is to be a warning. */
+    if (!sv) {
+       dowarn = 0;
+    }
+
  malformed:
  
      if (flags & UTF8_CHECK_ONLY) {
@@ -586,58 +695,48 @@ malformed:
      }
  
      if (dowarn) {
-       if (warning == UTF8_WARN_FFFF) {
-           sv = newSVpvs_flags("Unicode non-character ", SVs_TEMP);
-           Perl_sv_catpvf(aTHX_ sv, "0x%04"UVxf" is illegal for interchange", uv);
-       }
-       else {
+       if (! sv) {
             sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);
+       }
  
-           switch (warning) {
-               case 0: /* Intentionally empty. */ break;
-               case UTF8_WARN_EMPTY:
-                   sv_catpvs(sv, "(empty string)");
-                   break;
-               case UTF8_WARN_CONTINUATION:
-                   Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
-                   break;
-               case UTF8_WARN_NON_CONTINUATION:
-                   if (s == s0)
-                       Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
-                                  (UV)s[1], startbyte);
-                   else {
-                       const int len = (int)(s-s0);
-                       Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
-                                  (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
-                   }
+       switch (warning) {
+           case 0: /* Intentionally empty. */ break;
+           case UTF8_WARN_EMPTY:
+               sv_catpvs(sv, "(empty string)");
+               break;
+           case UTF8_WARN_CONTINUATION:
+               Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
+               break;
+           case UTF8_WARN_NON_CONTINUATION:
+               if (s == s0)
+                   Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
+                               (UV)s[1], startbyte);
+               else {
+                   const int len = (int)(s-s0);
+                   Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
+                               (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
+               }
  
-                   break;
-               case UTF8_WARN_FE_FF:
-                   Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
-                   break;
-               case UTF8_WARN_SHORT:
-                   Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
-                                  (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
-                   expectlen = curlen;         /* distance for caller to skip */
-                   break;
-               case UTF8_WARN_OVERFLOW:
-                   Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
-                                  ouv, *s, startbyte);
-                   break;
-               case UTF8_WARN_SURROGATE:
-                   Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
-                   break;
-               case UTF8_WARN_LONG:
-                   Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
-                                  (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
-                   break;
-               default:
-                   sv_catpvs(sv, "(unknown reason)");
-                   break;
-           }
+               break;
+           case UTF8_WARN_SHORT:
+               Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
+                               (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
+               expectlen = curlen;             /* distance for caller to skip */
+               break;
+           case UTF8_WARN_OVERFLOW:
+               Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
+                               ouv, *s, startbyte);
+               break;
+           case UTF8_WARN_LONG:
+               Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
+                               (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
+               break;
+           default:
+               sv_catpvs(sv, "(unknown reason)");
+               break;
         }
         
-       if (warning) {
+       if (sv) {
             const char * const s = SvPVX_const(sv);
  
             if (PL_op)
@@ -657,7 +756,7 @@ malformed:
  /*
  =for apidoc utf8_to_uvchr
  
-Returns the native character value of the first character in the string C<s>
+Returns the native code point of the first character in the string C<s>
  which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
  length, in bytes, of that character.
  
@@ -667,13 +766,14 @@ returned and retlen is set, if possible, to -1.
  =cut
  */
  
+
  UV
  Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
  {
      PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
  
      return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
-                         ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+                         ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
  }
  
  /*
@@ -699,7 +799,7 @@ Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
  
      /* Call the low level routine asking for checks */
      return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
-                              ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+                              ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
  }
  
  /*
@@ -805,6 +905,74 @@ Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
  }
  
  /*
+=for apidoc bytes_cmp_utf8
+
+Compares the sequence of characters (stored as octets) in b, blen with the
+sequence of characters (stored as UTF-8) in u, ulen. Returns 0 if they are
+equal, -1 or -2 if the first string is less than the second string, +1 or +2
+if the first string is greater than the second string.
+
+-1 or +1 is returned if the shorter string was identical to the start of the
+longer string. -2 or +2 is returned if there was a difference between characters
+within the strings.
+
+=cut
+*/
+
+int
+Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
+{
+    const U8 *const bend = b + blen;
+    const U8 *const uend = u + ulen;
+
+    PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
+
+    PERL_UNUSED_CONTEXT;
+
+    while (b < bend && u < uend) {
+        U8 c = *u++;
+       if (!UTF8_IS_INVARIANT(c)) {
+           if (UTF8_IS_DOWNGRADEABLE_START(c)) {
+               if (u < uend) {
+                   U8 c1 = *u++;
+                   if (UTF8_IS_CONTINUATION(c1)) {
+                       c = UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(c, c1));
+                   } else {
+                       Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
+                                        "Malformed UTF-8 character "
+                                        "(unexpected non-continuation byte 0x%02x"
+                                        ", immediately after start byte 0x%02x)"
+                                        /* Dear diag.t, it's in the pod.  */
+                                        "%s%s", c1, c,
+                                        PL_op ? " in " : "",
+                                        PL_op ? OP_DESC(PL_op) : "");
+                       return -2;
+                   }
+               } else {
+                   if (PL_op)
+                       Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
+                                        "%s in %s", unees, OP_DESC(PL_op));
+                   else
+                       Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), unees);
+                   return -2; /* Really want to return undef :-)  */
+               }
+           } else {
+               return -2;
+           }
+       }
+       if (*b != c) {
+           return *b < c ? -2 : +2;
+       }
+       ++b;
+    }
+
+    if (b == bend && u == uend)
+       return 0;
+
+    return b < bend ? +1 : -1;
+}
+
+/*
  =for apidoc utf8_to_bytes
  
  Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
@@ -897,8 +1065,7 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
         U8 c = *s++;
         if (!UTF8_IS_INVARIANT(c)) {
             /* Then it is two-byte encoded */
-           c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
-           c = ASCII_TO_NATIVE(c);
+           c = UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(c, *s++));
         }
         *d++ = c;
      }
@@ -910,9 +1077,10 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
  /*
  =for apidoc bytes_to_utf8
  
-Converts a string C<s> of length C<len> from the native encoding into UTF-8.
+Converts a string C<s> of length C<len> bytes from the native encoding into
+UTF-8.
  Returns a pointer to the newly-created string, and sets C<len> to
-reflect the new length.
+reflect the new length in bytes.
  
  A NUL character will be written after the end of the string.
  
@@ -1634,6 +1802,24 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
  
      PERL_ARGS_ASSERT_TO_UTF8_CASE;
  
+    /* Note that swash_fetch() doesn't output warnings for these because it
+     * assumes we will */
+    if (uv1 >= UNICODE_SURROGATE_FIRST && ckWARN_d(WARN_UTF8)) {
+       if (uv1 <= UNICODE_SURROGATE_LAST) {
+           const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
+           Perl_warner(aTHX_ packWARN(WARN_UTF8),
+               "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1);
+       }
+       else if (UNICODE_IS_SUPER(uv1)) {
+           const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
+           Perl_warner(aTHX_ packWARN(WARN_UTF8),
+               "Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1);
+       }
+
+       /* Note that non-characters are perfectly legal, so no warning should
+        * be given */
+    }
+
      uvuni_to_utf8(tmpbuf, uv1);
  
      if (!*swashp) /* load on-demand */
@@ -1838,6 +2024,7 @@ Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits
      const size_t name_len = strlen(name);
      HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
      SV* errsv_save;
+    GV *method;
  
      PERL_ARGS_ASSERT_SWASH_INIT;
  
@@ -1845,7 +2032,8 @@ Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits
      ENTER;
      SAVEHINTS();
      save_re_context();
-    if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {     /* demand load utf8 */
+    method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
+    if (!method) {     /* demand load utf8 */
         ENTER;
         errsv_save = newSVsv(ERRSV);
         /* It is assumed that callers of this routine are not passing in any
@@ -1872,7 +2060,10 @@ Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits
      mPUSHi(none);
      PUTBACK;
      errsv_save = newSVsv(ERRSV);
-    if (call_method("SWASHNEW", G_SCALAR))
+    /* If we already have a pointer to the method, no need to use call_method()
+       to repeat the lookup.  */
+    if (method ? call_sv(MUTABLE_SV(method), G_SCALAR)
+       : call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR | G_METHOD))
         retval = newSVsv(*PL_stack_sp--);
      else
         retval = &PL_sv_undef;
@@ -1898,7 +2089,8 @@ Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits
   * return several Unicode characters for a single Unicode character
   * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
   * the lower-level routine, and it is similarly broken for returning
- * multiple values.  --jhi */
+ * multiple values.  --jhi
+ * For those, you should use to_utf8_case() instead */
  /* Now SWASHGET is recasted into S_swash_get in this file. */
  
  /* Note:
@@ -1948,6 +2140,18 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
        /* If char is encoded then swatch is for the prefix */
         needents = (1 << UTF_ACCUMULATION_SHIFT);
         off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
+       if (UTF8_IS_SUPER(ptr) && ckWARN_d(WARN_UTF8)) {
+           const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0, 0);
+
+           /* This outputs warnings for binary properties only, assuming that
+            * to_utf8_case() will output any others.  Also, surrogates aren't
+            * checked for, as that would warn on things like /\p{Gc=Cs}/ */
+           SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
+           if (SvUV(*bitssvp) == 1) {
+               Perl_warner(aTHX_ packWARN(WARN_UTF8),
+                   "Code point 0x%04"UVXf" is not Unicode, no properties match it; all inverse properties do", code_point);
+           }
+       }
      }
  
      /*
@@ -2022,6 +2226,105 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
      NORETURN_FUNCTION_END;
  }
  
+/* Read a single line of the main body of the swash input text.  These are of
+ * the form:
+ * 0053        0056    0073
+ * where each number is hex.  The first two numbers form the minimum and
+ * maximum of a range, and the third is the value associated with the range.
+ * Not all swashes should have a third number.
+ *
+ * On input: l   points to the beginning of the line to be examined; it points
+ *               to somewhere in the string of the whole input text, and is
+ *               terminated by a \n or the null string terminator.
+ *          lend   points to the null terminator of that string
+ *          wants_value    is non-zero if the swash expects a third number
+ *          typestr is the name of the swash's mapping, like 'ToLower'
+ * On output: *min, *max, and *val are set to the values read from the line.
+ *           returns a pointer just beyond the line examined.  If the line has
+ *           no valid min number and no line follows it, returns lend+1
+ */
+
+STATIC U8*
+S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
+                            const bool wants_value, const U8* const typestr)
+{
+    const int  typeto  = typestr[0] == 'T' && typestr[1] == 'o';
+    STRLEN numlen;         /* Length of the number */
+    I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
+
+    /* nl points to the next \n in the scan */
+    U8* const nl = (U8*)memchr(l, '\n', lend - l);
+
+    /* Get the first number on the line: the range minimum */
+    numlen = lend - l;
+    *min = grok_hex((char *)l, &numlen, &flags, NULL);
+    if (numlen)            /* If found a hex number, position past it */
+       l += numlen;
+    else if (nl) {         /* Else, go handle next line, if any */
+       return nl + 1;  /* 1 is length of "\n" */
+    }
+    else {             /* Else, no next line */
+       return lend + 1;        /* to LIST's end at which \n is not found */
+    }
+
+    /* The max range value follows, separated by a BLANK */
+    if (isBLANK(*l)) {
+       ++l;
+       flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
+       numlen = lend - l;
+       *max = grok_hex((char *)l, &numlen, &flags, NULL);
+       if (numlen)
+           l += numlen;
+       else    /* If no value here, it is a single element range */
+           *max = *min;
+
+       /* Non-binary tables have a third entry: what the first element of the
+        * range maps to */
+       if (wants_value) {
+           if (isBLANK(*l)) {
+               ++l;
+               flags = PERL_SCAN_SILENT_ILLDIGIT |
+                       PERL_SCAN_DISALLOW_PREFIX;
+               numlen = lend - l;
+               *val = grok_hex((char *)l, &numlen, &flags, NULL);
+               if (numlen)
+                   l += numlen;
+               else
+                   *val = 0;
+           }
+           else {
+               *val = 0;
+               if (typeto) {
+                   Perl_croak(aTHX_ "%s: illegal mapping '%s'",
+                                    typestr, l);
+               }
+           }
+       }
+       else
+           *val = 0; /* bits == 1, then any val should be ignored */
+    }
+    else { /* Nothing following range min, should be single element with no
+             mapping expected */
+       *max = *min;
+       if (wants_value) {
+           *val = 0;
+           if (typeto) {
+               Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
+           }
+       }
+       else
+           *val = 0; /* bits == 1, then val should be ignored */
+    }
+
+    /* Position to next line if any, or EOF */
+    if (nl)
+       l = nl + 1;
+    else
+       l = lend;
+
+    return l;
+}
+
  /* Note:
   * Returns a swatch (a bit vector string) for a code point sequence
   * that starts from the value C<start> and comprises the number C<span>.
@@ -2035,13 +2338,15 @@ S_swash_get(pTHX_ SV* swash, UV start, UV span)
      U8 *l, *lend, *x, *xend, *s;
      STRLEN lcur, xcur, scur;
      HV *const hv = MUTABLE_HV(SvRV(swash));
+
+    /* The string containing the main body of the table */
      SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
+
      SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
      SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
      SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
      SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
      const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
-    const int  typeto  = typestr[0] == 'T' && typestr[1] == 'o';
      const STRLEN bits  = SvUV(*bitssvp);
      const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
      const UV     none  = SvUV(*nonesvp);
@@ -2088,74 +2393,13 @@ S_swash_get(pTHX_ SV* swash, UV start, UV span)
      lend = l + lcur;
      while (l < lend) {
         UV min, max, val;
-       STRLEN numlen;
-       I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
-
-       U8* const nl = (U8*)memchr(l, '\n', lend - l);
-
-       numlen = lend - l;
-       min = grok_hex((char *)l, &numlen, &flags, NULL);
-       if (numlen)
-           l += numlen;
-       else if (nl) {
-           l = nl + 1; /* 1 is length of "\n" */
-           continue;
-       }
-       else {
-           l = lend; /* to LIST's end at which \n is not found */
+       l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
+                                        cBOOL(octets), typestr);
+       if (l > lend) {
             break;
         }
  
-       if (isBLANK(*l)) {
-           ++l;
-           flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
-           numlen = lend - l;
-           max = grok_hex((char *)l, &numlen, &flags, NULL);
-           if (numlen)
-               l += numlen;
-           else
-               max = min;
-
-           if (octets) {
-               if (isBLANK(*l)) {
-                   ++l;
-                   flags = PERL_SCAN_SILENT_ILLDIGIT |
-                           PERL_SCAN_DISALLOW_PREFIX;
-                   numlen = lend - l;
-                   val = grok_hex((char *)l, &numlen, &flags, NULL);
-                   if (numlen)
-                       l += numlen;
-                   else
-                       val = 0;
-               }
-               else {
-                   val = 0;
-                   if (typeto) {
-                       Perl_croak(aTHX_ "%s: illegal mapping '%s'",
-                                        typestr, l);
-                   }
-               }
-           }
-           else
-               val = 0; /* bits == 1, then val should be ignored */
-       }
-       else {
-           max = min;
-           if (octets) {
-               val = 0;
-               if (typeto) {
-                   Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
-               }
-           }
-           else
-               val = 0; /* bits == 1, then val should be ignored */
-       }
-
-       if (nl)
-           l = nl + 1;
-       else
-           l = lend;
-
+       /* If looking for something beyond this range, go try the next one */
         if (max < start)
             continue;
  
@@ -2337,10 +2581,212 @@ S_swash_get(pTHX_ SV* swash, UV start, UV span)
      return swatch;
  }
  
+HV*
+Perl__swash_inversion_hash(pTHX_ SV* const swash)
+{
+
+   /* Subject to change or removal.  For use only in one place in regexec.c
+    *
+    * Returns a hash which is the inversion and closure of a swash mapping.
+    * For example, consider the input lines:
+    * 004B             006B
+    * 004C             006C
+    * 212A             006B
+    *
+    * The returned hash would have two keys, the utf8 for 006B and the utf8 for
+    * 006C.  The value for each key is an array whose elements are code
+    * points stored as UVs.  For 006C, the array would have two elements,
+    * 006C itself and 004C.  For 006B, there would be three elements in its
+    * array: 006B, 004B and 212A.
+    *
+    * Essentially, for any code point, it gives all the code points that map to
+    * it, or the list of 'froms' for that point.
+    *
+    * Currently it only looks at the main body of the swash, and ignores any
+    * additions or deletions from other swashes
+    *
+    * 'swash' must be a reference to a hash having LIST, TYPE, BITS and NONE
+    * entries.  Croaks if BITS is not 8, 16 or 32, or if storing into or
+    * fetching from the structures being built unexpectedly fails.  The
+    * returned hash is newly created by this function. */
+
+    U8 *l, *lend;
+    STRLEN lcur;
+    HV *const hv = MUTABLE_HV(SvRV(swash));
+
+    /* The string containing the main body of the table */
+    SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
+
+    SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
+    SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
+    SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
+    /*SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);*/
+    const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
+    const STRLEN bits  = SvUV(*bitssvp);
+    const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
+    const UV     none  = SvUV(*nonesvp);
+
+    HV* ret = newHV();
+
+    PERL_ARGS_ASSERT__SWASH_INVERSION_HASH;
+
+    /* Must have at least 8 bits to get the mappings */
+    if (bits != 8 && bits != 16 && bits != 32) {
+       Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %"UVuf,
+                                                (UV)bits);
+    }
+
+    /* read $swash->{LIST} */
+    l = (U8*)SvPV(*listsvp, lcur);
+    lend = l + lcur;
+
+    /* Go through each input line */
+    while (l < lend) {
+       UV min, max, val;
+       UV inverse;
+       l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
+                                        cBOOL(octets), typestr);
+
+       /* The scanner returns a pointer beyond lend when there is nothing
+        * more to parse */
+       if (l > lend) {
+           break;
+       }
+
+       /* Each element in the range is to be inverted */
+       for (inverse = min; inverse <= max; inverse++) {
+           AV* list;
+           SV* element;
+           SV** listp;
+           IV i;
+           bool found_key = FALSE;
+
+           /* The key is the inverse mapping */
+           char key[UTF8_MAXBYTES+1];
+           char* key_end = (char *) uvuni_to_utf8((U8*) key, val);
+           STRLEN key_len = key_end - key;
+
+           /* Get the list for the map */
+           if ((listp = hv_fetch(ret, key, key_len, FALSE))) {
+               list = (AV*) *listp;
+           }
+           else { /* No entry yet for it: create one */
+               list = newAV();
+               if (! hv_store(ret, key, key_len, (SV*) list, FALSE)) {
+                   Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
+               }
+           }
+
+           /* See if the key itself is already on the list.  av_len() returns
+            * the highest index in the array (-1 if it is empty), not the
+            * number of elements, so the comparison must be '<=' in order to
+            * look at every element, including the final one */
+           for (i = 0; i <= av_len(list); i++) {
+               SV** entryp = av_fetch(list, i, FALSE);
+               SV* entry;
+               if (entryp == NULL) {
+                   Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
+               }
+               entry = *entryp;
+               if (SvUV(entry) == val) {
+                   found_key = TRUE;
+                   break;
+               }
+           }
+
+           /* Make sure there is a mapping to itself on the list */
+           if (! found_key) {
+               element = newSVuv(val);
+               av_push(list, element);
+           }
+
+
+           /* Simply add the value to the list */
+           element = newSVuv(inverse);
+           av_push(list, element);
+
+           /* swash_get() increments the value of val for each element in the
+            * range.  That makes more compact tables possible.  You can
+            * express the capitalization, for example, of all consecutive
+            * letters with a single line: 0061\t007A\t0041 This maps 0061 to
+            * 0041, 0062 to 0042, etc.  I (khw) have never understood 'none',
+            * and it's not documented, and perhaps not even currently used,
+            * but I copied the semantics from swash_get(), just in case */
+           if (!none || val < none) {
+               ++val;
+           }
+       }
+    }
+
+    return ret;
+}
+
+HV*
+Perl__swash_to_invlist(pTHX_ SV* const swash)
+{
+
+   /* Subject to change or removal.  For use only in one place in regcomp.c
+    *
+    * Builds and returns an inversion list from the ranges given in the main
+    * body (the LIST entry) of 'swash', which must be a reference to a hash
+    * that also has TYPE and BITS entries.  Each line of LIST is parsed by
+    * S_swash_scan_list_line(), and the resulting range is appended to the
+    * inversion list.  Any mapped-to value on a line is ignored.  If LIST
+    * does not contain a string, it is treated as being empty. */
+
+    U8 *l, *lend;
+    char *loc;
+    STRLEN lcur;
+    HV *const hv = MUTABLE_HV(SvRV(swash));
+    UV elements = 0;    /* Number of elements in the inversion list */
+    U8 empty[] = "";
+
+    /* The string containing the main body of the table */
+    SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
+    SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
+    SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
+
+    const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
+    const STRLEN bits  = SvUV(*bitssvp);
+    const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
+
+    HV* invlist;
+
+    PERL_ARGS_ASSERT__SWASH_TO_INVLIST;
+
+    /* read $swash->{LIST} */
+    if (SvPOK(*listsvp)) {
+       l = (U8*)SvPV(*listsvp, lcur);
+    }
+    else {
+       /* LIST legitimately doesn't contain a string during compilation phases
+        * of Perl itself, before the Unicode tables are generated.  In this
+        * case, just fake things up by creating an empty list */
+       l = empty;
+       lcur = 0;
+    }
+    loc = (char *) l;
+    lend = l + lcur;
+
+    /* Scan the input to count the number of lines to preallocate array size
+     * based on worst possible case, which is each line in the input creates 2
+     * elements in the inversion list: 1) the beginning of a range in the list;
+     * 2) the beginning of a range not in the list.  */
+    while ((loc = (strchr(loc, '\n'))) != NULL) {
+       elements += 2;
+       loc++;
+    }
+
+    /* If the ending is somehow corrupt and isn't a new line, add another
+     * element for the final range that isn't in the inversion list.  When
+     * the input is empty there is no byte before lend to examine (looking
+     * at lend[-1] would read before the start of the buffer), so that case
+     * is handled without dereferencing, and is counted the same as input
+     * that lacks a final new line */
+    if (lcur == 0
+       || ! (*lend == '\n' || (*lend == '\0' && *(lend - 1) == '\n')))
+    {
+       elements++;
+    }
+
+    invlist = _new_invlist(elements);
+
+    /* Now go through the input again, adding each range to the list */
+    while (l < lend) {
+       UV start, end;
+       UV val;         /* Not used by this function */
+
+       l = S_swash_scan_list_line(aTHX_ l, lend, &start, &end, &val,
+                                        cBOOL(octets), typestr);
+
+       /* The scanner returns a pointer beyond lend when there is nothing
+        * more to parse */
+       if (l > lend) {
+           break;
+       }
+
+       _append_range_to_invlist(invlist, start, end);
+    }
+
+    return invlist;
+}
+
  /*
  =for apidoc uvchr_to_utf8
  
-Adds the UTF-8 representation of the Native codepoint C<uv> to the end
+Adds the UTF-8 representation of the Native code point C<uv> to the end
  of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
  bytes available. The return value is the pointer to the byte after the
  end of the new character. In other words,
@@ -2375,14 +2821,13 @@ Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  
  /*
  =for apidoc utf8n_to_uvchr
-flags
  
-Returns the native character value of the first character in the string 
+Returns the native character value of the first character in the string
  C<s>
  which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
  length, in bytes, of that character.
  
-Allows length and flags to be passed to low level routine.
+C<curlen> and C<flags> are the same as for utf8n_to_uvuni().
  
  =cut
  */
@@ -2390,7 +2835,7 @@ Allows length and flags to be passed to low level routine.
     a real function in case XS code wants it
  */
  UV
-Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, 
+Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
  U32 flags)
  {
      const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
@@ -2400,6 +2845,54 @@ U32 flags)
      return UNI_TO_NATIVE(uv);
  }
  
+bool
+Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len)
+{
+    /* May change: warns if surrogates, non-character code points, or
+     * non-Unicode code points are in s which has length len.  Returns TRUE if
+     * none found; FALSE otherwise.  The only other validity check is to make
+     * sure that no character extends beyond the string's end; if one would,
+     * a warning is raised and FALSE is returned immediately, without
+     * examining the rest of the string */
+
+    const U8* const e = s + len;
+    bool ok = TRUE;
+
+    PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
+
+    while (s < e) {
+
+       /* The length the first byte claims for this character must fit in
+        * the bytes remaining from the current position, not merely in the
+        * length of the whole string.  Comparing against 'len' would miss a
+        * truncated character in any string long enough, and would let the
+        * code below read beyond e */
+       if (UTF8SKIP(s) > (STRLEN)(e - s)) {
+           Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
+                          "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
+           return FALSE;
+       }
+
+       /* Only characters whose first byte is at least this value need any
+        * further examination */
+       if (*s >= UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE) {
+           STRLEN char_len;
+           if (UTF8_IS_SUPER(s)) {
+               UV uv = utf8_to_uvchr(s, &char_len);
+               Perl_warner(aTHX_ packWARN(WARN_UTF8),
+                   "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
+               ok = FALSE;
+           }
+           else if (UTF8_IS_SURROGATE(s)) {
+               UV uv = utf8_to_uvchr(s, &char_len);
+               Perl_warner(aTHX_ packWARN(WARN_UTF8),
+                   "Unicode surrogate U+%04"UVXf" is illegal in UTF-8", uv);
+               ok = FALSE;
+           }
+           else if
+               (UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s))
+           {
+               UV uv = utf8_to_uvchr(s, &char_len);
+               Perl_warner(aTHX_ packWARN(WARN_UTF8),
+                   "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv);
+               ok = FALSE;
+           }
+       }
+       s += UTF8SKIP(s);
+    }
+
+    return ok;
+}
+
  /*
  =for apidoc pv_uni_display
  
@@ -2475,7 +2968,7 @@ Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV f
      }
      if (truncated)
          sv_catpvs(dsv, "...");
-    
+
      return SvPVX(dsv);
  }
  
@@ -2502,17 +2995,18 @@ Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
  }
  
  /*
-=for apidoc ibcmp_utf8
+=for apidoc foldEQ_utf8
  
-Returns true if the strings s1 and s2 differ case-insensitively, false
-if they are equal case-insensitively.  Note that this is the complement of what
-you might expect (perhaps it would have been better to name it C<ibncmp_utf8>).
+Returns true if the leading portions of the strings s1 and s2 (either or both
+of which may be in UTF-8) are the same case-insensitively; false otherwise.
+How far into the strings to compare is determined by other input parameters.
  
  If u1 is true, the string s1 is assumed to be in UTF-8-encoded Unicode;
  otherwise it is assumed to be in native 8-bit encoding.  Correspondingly for u2
  with respect to s2.
  
-If the byte length l1 is non-zero, s1+l1 will be used as a goal to reach.  The
+If the byte length l1 is non-zero, it says how far into s1 to check for fold
+equality.  In other words, s1+l1 will be used as a goal to reach.  The
  scan will not be considered to be a match unless the goal is reached, and
  scanning won't continue past that goal.  Correspondingly for l2 with respect to
  s2.
@@ -2521,14 +3015,16 @@ If pe1 is non-NULL and the pointer it points to is not NULL, that pointer is
  considered an end pointer beyond which scanning of s1 will not continue under
  any circumstances.  This means that if both l1 and pe1 are specified, and pe1
  is less than s1+l1, the match will never be successful because it can never
-get as far as its goal.  Correspondingly for pe2 with respect to s2.
+get as far as its goal (and in fact is asserted against).  Correspondingly for
+pe2 with respect to s2.
  
-At least one of s1 and s2 must have a goal, and if both do, both have to be
+At least one of s1 and s2 must have a goal (at least one of l1 and l2 must be
+non-zero), and if both do, both have to be
  reached for a successful match.   Also, if the fold of a character is multiple
  characters, all of them must be matched (see tr21 reference below for
  'folding').
  
-Upon a successful match (when the routine returns false), if pe1 is non-NULL,
+Upon a successful match, if pe1 is non-NULL,
  it will be set to point to the beginning of the I<next> character of s1 beyond
  what was matched.  Correspondingly for pe2 and s2.
  
@@ -2537,40 +3033,47 @@ instead of upper/lowercasing both the characters, see
  http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
  
  =cut */
+
+/* A flags parameter has been added which may change, and hence isn't
+ * externally documented.  Currently it is:
+ *  0 for as-documented above
+ *  FOLDEQ_UTF8_NOMIX_ASCII meaning that if a non-ASCII character folds to an
+                           ASCII one, to not match
+ */
  I32
-Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
+Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2, U32 flags)
  {
      dVAR;
      register const U8 *p1  = (const U8*)s1; /* Point to current char */
      register const U8 *p2  = (const U8*)s2;
-    register const U8 *g1 = NULL;      /* goal for s1 */
+    register const U8 *g1 = NULL;       /* goal for s1 */
      register const U8 *g2 = NULL;
-    register const U8 *e1 = NULL;      /* Don't scan s1 past this */
-    register U8 *f1 = NULL;            /* Point to current folded */
+    register const U8 *e1 = NULL;       /* Don't scan s1 past this */
+    register U8 *f1 = NULL;             /* Point to current folded */
      register const U8 *e2 = NULL;
      register U8 *f2 = NULL;
-    STRLEN n1 = 0, n2 = 0;             /* Number of bytes in current char */
+    STRLEN n1 = 0, n2 = 0;              /* Number of bytes in current char */
      U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
      U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
-    U8 natbuf[2];              /* Holds native 8-bit char converted to utf8;
-                                  these always fit in 2 bytes */
+    U8 natbuf[2];               /* Holds native 8-bit char converted to utf8;
+                                   these always fit in 2 bytes */
  
-    PERL_ARGS_ASSERT_IBCMP_UTF8;
+    PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
  
      if (pe1) {
-       e1 = *(U8**)pe1;
+        e1 = *(U8**)pe1;
      }
  
      if (l1) {
-       g1 = (const U8*)s1 + l1;
+        g1 = (const U8*)s1 + l1;
      }
  
      if (pe2) {
-       e2 = *(U8**)pe2;
+        e2 = *(U8**)pe2;
      }
  
      if (l2) {
-       g2 = (const U8*)s2 + l2;
+        g2 = (const U8*)s2 + l2;
      }
  
      /* Must have at least one goal */
@@ -2578,75 +3081,96 @@ Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const
  
      if (g1) {
  
-       /* Will never match if goal is out-of-bounds */
-       assert(! e1  || e1 >= g1);
+        /* Will never match if goal is out-of-bounds */
+        assert(! e1  || e1 >= g1);
  
-       /* Here, there isn't an end pointer, or it is beyond the goal.  We
-       * only go as far as the goal */
-       e1 = g1;
+        /* Here, there isn't an end pointer, or it is beyond the goal.  We
+        * only go as far as the goal */
+        e1 = g1;
+    }
+    else {
+       assert(e1);    /* Must have an end for looking at s1 */
      }
-    else assert(e1);   /* Must have an end for looking at s1 */
  
      /* Same for goal for s2 */
      if (g2) {
-       assert(! e2  || e2 >= g2);
-       e2 = g2;
+        assert(! e2  || e2 >= g2);
+        e2 = g2;
+    }
+    else {
+       assert(e2);
      }
-    else assert(e2);
  
      /* Look through both strings, a character at a time */
      while (p1 < e1 && p2 < e2) {
  
-       /* If at the beginning of a new character in s1, get its fold to use */
-       if (n1 == 0) {
-           if (u1) {
-               to_utf8_fold(p1, foldbuf1, &n1);
-           }
-           else {  /* Not utf8, convert to it first and then get fold */
-               uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
-               to_utf8_fold(natbuf, foldbuf1, &n1);
-           }
-           f1 = foldbuf1;
-       }
+        /* If at the beginning of a new character in s1, get its fold to use
+         * and the length of the fold */
+        if (n1 == 0) {
+           if (isASCII(*p1)) {
  
-       if (n2 == 0) {    /* Same for s2 */
-           if (u2) {
-               to_utf8_fold(p2, foldbuf2, &n2);
-           }
-           else {
-               uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
-               to_utf8_fold(natbuf, foldbuf2, &n2);
+               /* But if not to mix non- with ASCII, fail */
+               if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) {
+                   return 0;
+               }
+               n1 = 1;
+               *foldbuf1 = toLOWER(*p1);   /* ASCII range fold is lowercase */
             }
-           f2 = foldbuf2;
-       }
+           else if (u1) {
+                to_utf8_fold(p1, foldbuf1, &n1);
+            }
+            else {  /* Not utf8, convert to it first and then get fold */
+                uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
+                to_utf8_fold(natbuf, foldbuf1, &n1);
+            }
+            f1 = foldbuf1;
+        }
  
-       /* While there is more to look for in both folds, see if they
-       * continue to match */
-       while (n1 && n2) {
-           U8 fold_length = UTF8SKIP(f1);
-           if (fold_length != UTF8SKIP(f2)
-               || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE
-                                                      function call for single
-                                                      character */
-               || memNE((char*)f1, (char*)f2, fold_length))
-           {
-               return 1; /* mismatch */
+        if (n2 == 0) {    /* Same for s2 */
+           if (isASCII(*p2)) {
+               if (flags && ! isASCII(*p1)) {
+                   return 0;
+               }
+               n2 = 1;
+               *foldbuf2 = toLOWER(*p2);
             }
+           else if (u2) {
+                to_utf8_fold(p2, foldbuf2, &n2);
+            }
+            else {
+                uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
+                to_utf8_fold(natbuf, foldbuf2, &n2);
+            }
+            f2 = foldbuf2;
+        }
  
-           /* Here, they matched, advance past them */
-           n1 -= fold_length;
-           f1 += fold_length;
-           n2 -= fold_length;
-           f2 += fold_length;
-       }
+        /* While there is more to look for in both folds, see if they
+        * continue to match */
+        while (n1 && n2) {
+            U8 fold_length = UTF8SKIP(f1);
+            if (fold_length != UTF8SKIP(f2)
+                || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE
+                                                       function call for single
+                                                       character */
+                || memNE((char*)f1, (char*)f2, fold_length))
+            {
+                return 0; /* mismatch */
+            }
+
+            /* Here, they matched, advance past them */
+            n1 -= fold_length;
+            f1 += fold_length;
+            n2 -= fold_length;
+            f2 += fold_length;
+        }
  
-       /* When reach the end of any fold, advance the input past it */
-       if (n1 == 0) {
-           p1 += u1 ? UTF8SKIP(p1) : 1;
-       }
-       if (n2 == 0) {
-           p2 += u2 ? UTF8SKIP(p2) : 1;
-       }
+        /* When reach the end of any fold, advance the input past it */
+        if (n1 == 0) {
+            p1 += u1 ? UTF8SKIP(p1) : 1;
+        }
+        if (n2 == 0) {
+            p2 += u2 ? UTF8SKIP(p2) : 1;
+        }
      } /* End of loop through both strings */
  
      /* A match is defined by each scan that specified an explicit length
@@ -2654,17 +3178,17 @@ Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const
      * character (which can happen when the fold of a character is more than one
      * character). */
      if (! ((g1 == 0 || p1 == g1) && (g2 == 0 || p2 == g2)) || n1 || n2) {
-       return 1;
+        return 0;
      }
  
      /* Successful match.  Set output pointers */
      if (pe1) {
-       *pe1 = (char*)p1;
+        *pe1 = (char*)p1;
      }
      if (pe2) {
-       *pe2 = (char*)p2;
+        *pe2 = (char*)p2;
      }
-    return 0;
+    return 1;
  }
  
  /*