Fix bug with (??{$overload}) regexp caching

[perl5.git] / utf8.c
diff --git a/utf8.c b/utf8.c

index fb3934c..bf51a91 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -33,13 +33,6 @@
  #include "perl.h"
  #include "inline_invlist.c"
  
-#ifndef EBCDIC
-/* Separate prototypes needed because in ASCII systems these are
- * usually macros but they still are compiled as code, too. */
-PERL_CALLCONV UV       Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
-PERL_CALLCONV U8*      Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
-#endif
-
  static const char unees[] =
      "Malformed UTF-8 character (unexpected end of string)";
  
@@ -90,48 +83,14 @@ Perl_is_ascii_string(const U8 *s, STRLEN len)
  =for apidoc uvoffuni_to_utf8_flags
  
  THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
+Instead, B<Almost all code should use L</uvchr_to_utf8> or
+L</uvchr_to_utf8_flags>>.
  
-It adds the UTF-8 representation of the Unicode code point C<uv> to the end
-of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
-bytes available. The return value is the pointer to the byte after the
-end of the new character. In other words,
-
-    d = uvoffuni_to_utf8_flags(d, uv, flags);
-
-or, in most cases,
-
-    d = uvoffuni_to_utf8_flags(d, uv, 0);
-
-This is the Unicode-aware way of saying
-
-    *(d++) = uv;
-
-where uv is a code point expressed in Latin-1 or above, not the platform's
-native character set.  B<Almost all code should instead use L</uvchr_to_utf8>
-or L</uvchr_to_utf8_flags>>.
-
-This function will convert to UTF-8 (and not warn) even code points that aren't
-legal Unicode or are problematic, unless C<flags> contains one or more of the
-following flags:
-
-If C<uv> is a Unicode surrogate code point and UNICODE_WARN_SURROGATE is set,
-the function will raise a warning, provided UTF8 warnings are enabled.  If instead
-UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL.
-If both flags are set, the function will both warn and return NULL.
-
-The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags correspondingly
-affect how the function handles a Unicode non-character.  And likewise, the
-UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags, affect the handling of
-code points that are
-above the Unicode maximum of 0x10FFFF.  Code points above 0x7FFF_FFFF (which are
-even less portable) can be warned and/or disallowed even if other above-Unicode
-code points are accepted by the UNICODE_WARN_FE_FF and UNICODE_DISALLOW_FE_FF
-flags.
-
-And finally, the flag UNICODE_WARN_ILLEGAL_INTERCHANGE selects all four of the
-above WARN flags; and UNICODE_DISALLOW_ILLEGAL_INTERCHANGE selects all four
-DISALLOW flags.
+This function is like them, but the input is a strict Unicode
+(as opposed to native) code point.  Only in very rare circumstances should code
+not be using the native code point.
  
+For details, see the description for L</uvchr_to_utf8_flags>>.
  
  =cut
  */
@@ -141,6 +100,11 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  {
      PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
  
+    if (UNI_IS_INVARIANT(uv)) {
+       *d++ = (U8) LATIN1_TO_NATIVE(uv);
+       return d;
+    }
+
      /* The first problematic code point is the first surrogate */
      if (uv >= UNICODE_SURROGATE_FIRST
          && ckWARN4_d(WARN_UTF8, WARN_SURROGATE, WARN_NON_UNICODE, WARN_NONCHAR))
@@ -178,12 +142,9 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
             }
         }
      }
-    if (UNI_IS_INVARIANT(uv)) {
-       *d++ = (U8) LATIN1_TO_NATIVE(uv);
-       return d;
-    }
+
  #if defined(EBCDIC)
-    else {
+    {
         STRLEN len  = OFFUNISKIP(uv);
         U8 *p = d+len-1;
         while (p > d) {
@@ -229,7 +190,7 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
         *d++ = (U8)(( uv        & 0x3f) | 0x80);
         return d;
      }
-#ifdef HAS_QUAD
+#ifdef UTF8_QUAD_MAX
      if (uv < UTF8_QUAD_MAX)
  #endif
      {
@@ -242,7 +203,7 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
         *d++ = (U8)(( uv        & 0x3f) | 0x80);
         return d;
      }
-#ifdef HAS_QUAD
+#ifdef UTF8_QUAD_MAX
      {
         *d++ =                            0xff;         /* Can't match U+FFFE! */
         *d++ =                            0x80;         /* 6 Reserved bits */
@@ -262,12 +223,92 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  #endif
  #endif /* Non loop style */
  }
+/*
+=for apidoc uvchr_to_utf8
+
+Adds the UTF-8 representation of the native code point C<uv> to the end
+of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
+bytes available. The return value is the pointer to the byte after the
+end of the new character. In other words,
+
+    d = uvchr_to_utf8(d, uv);
+
+is the recommended wide native character-aware way of saying
+
+    *(d++) = uv;
+
+This function accepts any UV as input.  To forbid or warn on non-Unicode code
+points, or those that may be problematic, see L</uvchr_to_utf8_flags>.
+
+=cut
+*/
+
+/* This is also a macro */
+PERL_CALLCONV U8*       Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
+
+U8 *
+Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
+{
+    return uvchr_to_utf8(d, uv);
+}
+
+/*
+=for apidoc uvchr_to_utf8_flags
+
+Adds the UTF-8 representation of the native code point C<uv> to the end
+of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
+bytes available. The return value is the pointer to the byte after the
+end of the new character. In other words,
+
+    d = uvchr_to_utf8_flags(d, uv, flags);
+
+or, in most cases,
+
+    d = uvchr_to_utf8_flags(d, uv, 0);
+
+This is the Unicode-aware way of saying
+
+    *(d++) = uv;
+
+This function will convert to UTF-8 (and not warn) even code points that aren't
+legal Unicode or are problematic, unless C<flags> contains one or more of the
+following flags:
+
+If C<uv> is a Unicode surrogate code point and UNICODE_WARN_SURROGATE is set,
+the function will raise a warning, provided UTF8 warnings are enabled.  If instead
+UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL.
+If both flags are set, the function will both warn and return NULL.
+
+The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags correspondingly
+affect how the function handles a Unicode non-character.  And likewise, the
+UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags, affect the handling of
+code points that are
+above the Unicode maximum of 0x10FFFF.  Code points above 0x7FFF_FFFF (which are
+even less portable) can be warned and/or disallowed even if other above-Unicode
+code points are accepted, by the UNICODE_WARN_FE_FF and UNICODE_DISALLOW_FE_FF
+flags.
+
+And finally, the flag UNICODE_WARN_ILLEGAL_INTERCHANGE selects all four of the
+above WARN flags; and UNICODE_DISALLOW_ILLEGAL_INTERCHANGE selects all four
+DISALLOW flags.
+
+=cut
+*/
+
+/* This is also a macro */
+PERL_CALLCONV U8*       Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
+
+U8 *
+Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
+{
+    return uvchr_to_utf8_flags(d, uv, flags);
+}
  
  /*
  
  Tests if the first C<len> bytes of string C<s> form a valid UTF-8
-character.  Note that an INVARIANT (i.e. ASCII) character is a valid
-UTF-8 character.  The number of bytes in the UTF-8 character
+character.  Note that an INVARIANT (i.e. ASCII on non-EBCDIC) character is a
+valid UTF-8 character.  The number of bytes in the UTF-8 character
  will be returned if it is valid, otherwise 0.
  
  This is the "slow" version as opposed to the "fast" version which is
@@ -325,10 +366,8 @@ Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end)
         len = UTF8SKIP(buf);
      }
  
-#ifdef IS_UTF8_CHAR
      if (IS_UTF8_CHAR_FAST(len))
          return IS_UTF8_CHAR(buf, len) ? len : 0;
-#endif /* #ifdef IS_UTF8_CHAR */
      return is_utf8_char_slow(buf, len);
  }
  
@@ -475,13 +514,13 @@ Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
  
  /*
  
-=for apidoc utf8n_to_uvoffuni
+=for apidoc utf8n_to_uvchr
  
  THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
+Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
  
  Bottom level UTF-8 decode routine.
-Returns the official Unicode (not native) code point value of the first
-character in the string C<s>,
+Returns the native code point value of the first character in the string C<s>,
  which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
  C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
  the length, in bytes, of that character.
@@ -550,13 +589,11 @@ All other code points corresponding to Unicode characters, including private
  use and those yet to be assigned, are never considered malformed and never
  warn.
  
-Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
-
  =cut
  */
  
  UV
-Perl_utf8n_to_uvoffuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
+Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
  {
      dVAR;
      const U8 * const s0 = s;
@@ -574,7 +611,7 @@ Perl_utf8n_to_uvoffuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 fla
  
      const char* const malformed_text = "Malformed UTF-8 character";
  
-    PERL_ARGS_ASSERT_UTF8N_TO_UVOFFUNI;
+    PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
  
      /* The order of malformation tests here is important.  We should consume as
       * few bytes as possible in order to not skip any valid character.  This is
@@ -591,7 +628,7 @@ Perl_utf8n_to_uvoffuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 fla
       * We also should not consume too few bytes, otherwise someone could inject
       * things.  For example, an input could be deliberately designed to
       * overflow, and if this code bailed out immediately upon discovering that,
-     * returning to the caller *retlen pointing to the very next byte (one
+     * returning to the caller C<*retlen> pointing to the very next byte (one
       * which is actually part of of the overflowing sequence), that could look
       * legitimate to the caller, which could discard the initial partial
       * sequence and process the rest, inappropriately */
@@ -623,7 +660,7 @@ Perl_utf8n_to_uvoffuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 fla
  
      /* An invariant is trivially well-formed */
      if (UTF8_IS_INVARIANT(uv)) {
-       return NATIVE_TO_LATIN1(uv);
+       return uv;
      }
  
      /* A continuation character can't start a valid sequence */
@@ -826,7 +863,9 @@ Perl_utf8n_to_uvoffuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 fla
         }
  
         if (sv) {
-           outlier_ret = uv;
+            outlier_ret = uv;   /* Note we don't bother to convert to native,
+                                   as all the outlier code points are the same
+                                   in both ASCII and EBCDIC */
             goto do_warn;
         }
  
@@ -834,7 +873,7 @@ Perl_utf8n_to_uvoffuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 fla
          * to return it */
      }
  
-    return uv;
+    return UNI_TO_NATIVE(uv);
  
      /* There are three cases which get to beyond this point.  In all 3 cases:
       * <sv>        if not null points to a string to print as a warning.
@@ -908,7 +947,7 @@ NULL) to -1.  If those warnings are off, the computed value, if well-defined
  (or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
  C<*retlen> is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is
  the next possible position in C<s> that could begin a non-malformed character.
-See L</utf8n_to_uvoffuni> for details on when the REPLACEMENT CHARACTER is
+See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
  returned.
  
  =cut
@@ -918,8 +957,6 @@ returned.
  UV
  Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
  {
-    PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
-
      assert(s < send);
  
      return utf8n_to_uvchr(s, send - s, retlen,
@@ -928,8 +965,7 @@ Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
  
  /* Like L</utf8_to_uvchr_buf>(), but should only be called when it is known that
   * there are no malformations in the input UTF-8 string C<s>.  surrogates,
- * non-character code points, and non-Unicode code points are allowed.  A macro
- * in utf8.h is used to normally avoid this function wrapper */
+ * non-character code points, and non-Unicode code points are allowed. */
  
  UV
  Perl_valid_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
@@ -985,7 +1021,7 @@ NULL) to -1.  If those warnings are off, the computed value if well-defined (or
  the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
  is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
  next possible position in C<s> that could begin a non-malformed character.
-See L</utf8n_to_uvoffuni> for details on when the REPLACEMENT CHARACTER is returned.
+See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
  
  =cut
  */
@@ -1001,8 +1037,9 @@ Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
  /*
  =for apidoc utf8_to_uvuni_buf
  
-Only in very rare circumstances should code need to be dealing in the Unicode
-code point.  Use L</utf8_to_uvchr_buf> instead.
+Only in very rare circumstances should code need to be dealing in Unicode
+(as opposed to native) code points.  In those few cases, use
+C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>> instead.
  
  Returns the Unicode (not-native) code point of the first character in the
  string C<s> which
@@ -1015,7 +1052,7 @@ NULL) to -1.  If those warnings are off, the computed value if well-defined (or
  the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
  is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
  next possible position in C<s> that could begin a non-malformed character.
-See L</utf8n_to_uvoffuni> for details on when the REPLACEMENT CHARACTER is returned.
+See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
  
  =cut
  */
@@ -1028,8 +1065,8 @@ Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
      assert(send > s);
  
      /* Call the low level routine asking for checks */
-    return Perl_utf8n_to_uvoffuni(aTHX_ s, send -s, retlen,
-                              ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+    return NATIVE_TO_UNI(Perl_utf8n_to_uvchr(aTHX_ s, send -s, retlen,
+                              ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY));
  }
  
  /* DEPRECATED!
@@ -1056,7 +1093,7 @@ Some, but not all, UTF-8 malformations are detected, and in fact, some
  malformed input could cause reading beyond the end of the input buffer, which
  is one reason why this function is deprecated.  The other is that only in
  extremely limited circumstances should the Unicode versus native code point be
-of any interest to you.  Use L</utf8_to_uvchr_buf> instead.
+of any interest to you.  See L</utf8_to_uvuni_buf> for alternatives.
  
  If C<s> points to one of the detected malformations, and UTF8 warnings are
  enabled, zero is returned and C<*retlen> is set (if C<retlen> doesn't point to
@@ -1064,7 +1101,7 @@ NULL) to -1.  If those warnings are off, the computed value if well-defined (or
  the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
  is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
  next possible position in C<s> that could begin a non-malformed character.
-See L</utf8n_to_uvoffuni> for details on when the REPLACEMENT CHARACTER is returned.
+See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
  
  =cut
  */
@@ -1268,21 +1305,25 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
  
      /* ensure valid UTF-8 and chars < 256 before updating string */
      while (s < send) {
-        U8 c = *s++;
-
-        if (!UTF8_IS_INVARIANT(c) &&
-            (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
-            || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
-            *len = ((STRLEN) -1);
-            return 0;
+        if (! UTF8_IS_INVARIANT(*s)) {
+            if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
+                *len = ((STRLEN) -1);
+                return 0;
+            }
+            s++;
          }
+        s++;
      }
  
      d = s = save;
      while (s < send) {
-        STRLEN ulen;
-        *d++ = (U8)utf8_to_uvchr_buf(s, send, &ulen);
-        s += ulen;
+       U8 c = *s++;
+       if (! UTF8_IS_INVARIANT(c)) {
+           /* Then it is two-byte encoded */
+           c = TWO_BYTE_UTF8_TO_NATIVE(c, *s);
+            s++;
+       }
+       *d++ = c;
      }
      *d = '\0';
      *len = d - save;
@@ -1319,14 +1360,14 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
  
      /* ensure valid UTF-8 and chars < 256 before converting string */
      for (send = s + *len; s < send;) {
-        U8 c = *s++;
-       if (!UTF8_IS_INVARIANT(c)) {
-           if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
-                (c = *s++) && UTF8_IS_CONTINUATION(c))
-               count++;
-           else
+        if (! UTF8_IS_INVARIANT(*s)) {
+            if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
                  return (U8 *)start;
+            }
+            count++;
+            s++;
         }
+        s++;
      }
  
      *is_utf8 = FALSE;
@@ -1335,9 +1376,10 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
      s = start; start = d;
      while (s < send) {
         U8 c = *s++;
-       if (!UTF8_IS_INVARIANT(c)) {
+       if (! UTF8_IS_INVARIANT(c)) {
             /* Then it is two-byte encoded */
-           c = TWO_BYTE_UTF8_TO_NATIVE(c, *s++);
+           c = TWO_BYTE_UTF8_TO_NATIVE(c, *s);
+            s++;
         }
         *d++ = c;
      }
@@ -1410,13 +1452,13 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
      while (p < pend) {
         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
         p += 2;
-       if (uv < 0x80) {
-           *d++ = (U8)uv;
+       if (UNI_IS_INVARIANT(uv)) {
+           *d++ = LATIN1_TO_NATIVE((U8) uv);
             continue;
         }
-       if (uv < 0x800) {
-           *d++ = (U8)(( uv >>  6)         | 0xc0);
-           *d++ = (U8)(( uv        & 0x3f) | 0x80);
+       if (uv <= MAX_UTF8_TWO_BYTE) {
+           *d++ = UTF8_TWO_BYTE_HI(UNI_TO_NATIVE(uv));
+           *d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv));
             continue;
         }
  #define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
@@ -1437,6 +1479,9 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
         } else if (uv >= FIRST_LOW_SURROGATE && uv <= LAST_LOW_SURROGATE) {
             Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
         }
+#ifdef EBCDIC
+        d = uvoffuni_to_utf8_flags(d, uv, 0);
+#else
         if (uv < 0x10000) {
             *d++ = (U8)(( uv >> 12)         | 0xe0);
             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
@@ -1450,6 +1495,7 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
             *d++ = (U8)(( uv        & 0x3f) | 0x80);
             continue;
         }
+#endif
      }
      *newlen = d - dstart;
      return d;
@@ -1645,7 +1691,7 @@ Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_
  
      assert(S_or_s == 'S' || S_or_s == 's');
  
-    if (NATIVE_IS_INVARIANT(converted)) { /* No difference between the two for
+    if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
                                              characters in this range */
         *p = (U8) converted;
         *lenp = 1;
@@ -1746,7 +1792,7 @@ S_to_lower_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp)
      U8 converted = toLOWER_LATIN1(c);
  
      if (p != NULL) {
-       if (NATIVE_IS_INVARIANT(converted)) {
+       if (NATIVE_BYTE_IS_INVARIANT(converted)) {
             *p = converted;
             *lenp = 1;
         }
@@ -1816,7 +1862,7 @@ Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int f
         converted = toLOWER_LATIN1(c);
      }
  
-    if (NATIVE_IS_INVARIANT(converted)) {
+    if (UVCHR_IS_INVARIANT(converted)) {
         *p = (U8) converted;
         *lenp = 1;
      }
@@ -2563,8 +2609,8 @@ Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool
         *lenp = 1;
      }
      else {
-       *ustrp = UTF8_EIGHT_BIT_HI(result);
-       *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
+       *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
+       *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
         *lenp = 2;
      }
  
@@ -2629,8 +2675,8 @@ Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool
         *lenp = 1;
      }
      else {
-       *ustrp = UTF8_EIGHT_BIT_HI(result);
-       *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
+       *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
+       *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
         *lenp = 2;
      }
  
@@ -2694,8 +2740,8 @@ Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool
         *lenp = 1;
      }
      else {
-       *ustrp = UTF8_EIGHT_BIT_HI(result);
-       *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
+       *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
+       *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
         *lenp = 2;
      }
  
@@ -2762,7 +2808,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b
  
         if (flags & FOLD_FLAGS_LOCALE) {
  
-            /* Special case this character, as what normally gets returned
+            /* Special case these characters, as what normally gets returned
               * under locale doesn't work */
              if (UTF8SKIP(p) == sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1
                  && memEQ((char *) p, LATIN_CAPITAL_LETTER_SHARP_S_UTF8,
@@ -2770,6 +2816,12 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b
              {
                  goto return_long_s;
              }
+            else if (UTF8SKIP(p) == sizeof(LATIN_SMALL_LIGATURE_LONG_S_T) - 1
+                && memEQ((char *) p, LATIN_SMALL_LIGATURE_LONG_S_T_UTF8,
+                          sizeof(LATIN_SMALL_LIGATURE_LONG_S_T_UTF8) - 1))
+            {
+                goto return_ligature_st;
+            }
             return check_locale_boundary_crossing(p, result, ustrp, lenp);
         }
         else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
@@ -2777,8 +2829,8 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b
         }
         else {
             /* This is called when changing the case of a utf8-encoded
-            * character above the Latin1 range, and the result should not
-            * contain an ASCII character. */
+             * character above the ASCII range, and the result should not
+             * contain an ASCII character. */
  
             UV original;    /* To store the first code point of <p> */
  
@@ -2791,11 +2843,16 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b
                     /* Crossed, have to return the original */
                     original = valid_utf8_to_uvchr(p, lenp);
  
-                    /* But in this one instance, there is an alternative we can
+                    /* But in these instances, there is an alternative we can
                       * return that is valid */
-                    if (original == LATIN_CAPITAL_LETTER_SHARP_S) {
+                    if (original == LATIN_CAPITAL_LETTER_SHARP_S
+                        || original == LATIN_SMALL_LETTER_SHARP_S)
+                    {
                          goto return_long_s;
                      }
+                    else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
+                        goto return_ligature_st;
+                    }
                     Copy(p, ustrp, *lenp, char);
                     return original;
                 }
@@ -2813,8 +2870,8 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b
         *lenp = 1;
      }
      else {
-       *ustrp = UTF8_EIGHT_BIT_HI(result);
-       *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
+       *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
+       *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
         *lenp = 2;
      }
  
@@ -2834,6 +2891,14 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, b
      Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
          ustrp, *lenp, U8);
      return LATIN_SMALL_LETTER_LONG_S;
+
+  return_ligature_st:
+    /* Two folds to 'st' are prohibited by the options; instead we pick one and
+     * have the other one fold to it */
+
+    *lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1;
+    Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
+    return LATIN_SMALL_LIGATURE_ST;
  }
  
  /* Note:
@@ -3114,8 +3179,8 @@ Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 m
  /* Note:
   * Returns the value of property/mapping C<swash> for the first character
   * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
- * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
- * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
+ * assumed to be in well-formed utf8. If C<do_utf8> is false, the string C<ptr>
+ * is assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
   *
   * A "swash" is a hash which contains initially the keys/values set up by
   * SWASHNEW.  The purpose is to be able to completely represent a Unicode
@@ -3157,8 +3222,7 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
      const U8 *tmps = NULL;
      U32 bit;
      SV *swatch;
-    U8 tmputf8[2];
-    const UV c = *ptr;
+    const U8 c = *ptr;
  
      PERL_ARGS_ASSERT_SWASH_FETCH;
  
@@ -3171,28 +3235,58 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
                                       : c);
      }
  
-    /* Convert to utf8 if not already */
-    if (!do_utf8 && !NATIVE_IS_INVARIANT(c)) {
-       tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
-       tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
-       ptr = tmputf8;
+    /* We store the values in a "swatch" which is a vec() value in a swash
+     * hash.  Code points 0-255 are a single vec() stored with key length
+     * (klen) 0.  All other code points have a UTF-8 representation
+     * 0xAA..0xYY,0xZZ.  A vec() is constructed containing all of them which
+     * share 0xAA..0xYY, which is the key in the hash to that vec.  So the key
+     * length for them is the length of the encoded char - 1.  ptr[klen] is the
+     * final byte in the sequence representing the character */
+    if (!do_utf8 || UTF8_IS_INVARIANT(c)) {
+        klen = 0;
+       needents = 256;
+        off = c;
      }
-    /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
-     * then the "swatch" is a vec() for all the chars which start
-     * with 0xAA..0xYY
-     * So the key in the hash (klen) is length of encoded char -1
-     */
-    klen = UTF8SKIP(ptr) - 1;
-
-    if (klen == 0) {
-      /* If char is invariant then swatch is for all the invariant chars
-       * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
-       */
-       needents = UTF_CONTINUATION_MARK;
-       off      = NATIVE_UTF8_TO_I8(ptr[klen]);
+    else if (UTF8_IS_DOWNGRADEABLE_START(c)) {
+        klen = 0;
+       needents = 256;
+        off = TWO_BYTE_UTF8_TO_NATIVE(c, *(ptr + 1));
      }
      else {
-      /* If char is encoded then swatch is for the prefix */
+        klen = UTF8SKIP(ptr) - 1;
+
+        /* Each vec() stores 2**UTF_ACCUMULATION_SHIFT values.  The offset into
+         * the vec is the final byte in the sequence.  (In EBCDIC this is
+         * converted to I8 to get consecutive values.)  To help you visualize
+         * all this:
+         *                       Straight 1047   After final byte
+         *             UTF-8      UTF-EBCDIC     I8 transform
+         *  U+0400:  \xD0\x80    \xB8\x41\x41    \xB8\x41\xA0
+         *  U+0401:  \xD0\x81    \xB8\x41\x42    \xB8\x41\xA1
+         *    ...
+         *  U+0409:  \xD0\x89    \xB8\x41\x4A    \xB8\x41\xA9
+         *  U+040A:  \xD0\x8A    \xB8\x41\x51    \xB8\x41\xAA
+         *    ...
+         *  U+0412:  \xD0\x92    \xB8\x41\x59    \xB8\x41\xB2
+         *  U+0413:  \xD0\x93    \xB8\x41\x62    \xB8\x41\xB3
+         *    ...
+         *  U+041B:  \xD0\x9B    \xB8\x41\x6A    \xB8\x41\xBB
+         *  U+041C:  \xD0\x9C    \xB8\x41\x70    \xB8\x41\xBC
+         *    ...
+         *  U+041F:  \xD0\x9F    \xB8\x41\x73    \xB8\x41\xBF
+         *  U+0420:  \xD0\xA0    \xB8\x42\x41    \xB8\x42\x41
+         *
+         * (There are no discontinuities in the elided (...) entries.)
+         * The UTF-8 key for these 33 code points is '\xD0' (which also is the
+         * key for the next 31, up through U+043F, whose UTF-8 final byte is
+         * \xBF).  Thus in UTF-8, each key is for a vec() for 64 code points.
+         * The final UTF-8 byte, which ranges between \x80 and \xBF, is an
+         * index into the vec() swatch (after subtracting 0x80, which we
+         * actually do with an '&').
+         * In UTF-EBCDIC, each key is for a 32 code point vec().  The first 32
+         * code points above have key '\xB8\x41'. The final UTF-EBCDIC byte has
+         * dicontinuities which go away by transforming it into I8, and we
+         * effectively subtract 0xA0 to get the index. */
         needents = (1 << UTF_ACCUMULATION_SHIFT);
         off      = NATIVE_UTF8_TO_I8(ptr[klen]) & UTF_CONTINUATION_MASK;
      }
@@ -3219,14 +3313,18 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
  
         /* If not cached, generate it via swatch_get */
         if (!svp || !SvPOK(*svp)
-                || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
-           const UV code_point = utf8n_to_uvchr(ptr, UTF8_MAXBYTES, 0,
-                                          ckWARN(WARN_UTF8) ?
-                                          0 : UTF8_ALLOW_ANY);
-           swatch = swatch_get(swash,
-                   /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
-                               (klen) ? (code_point & ~((UV)needents - 1)) : 0,
-                               needents);
+                || !(tmps = (const U8*)SvPV_const(*svp, slen)))
+        {
+            if (klen) {
+                const UV code_point = valid_utf8_to_uvchr(ptr, NULL);
+                swatch = swatch_get(swash,
+                                    code_point & ~((UV)needents - 1),
+                                   needents);
+            }
+            else {  /* For the first 256 code points, the swatch has a key of
+                       length 0 */
+                swatch = swatch_get(swash, 0, needents);
+            }
  
             if (IN_PERL_COMPILETIME)
                 CopHINTS_set(PL_curcop, PL_hints);
@@ -3326,30 +3424,19 @@ S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
             *max = *min;
  
         /* Non-binary tables have a third entry: what the first element of the
-        * range maps to */
+        * range maps to.  The map for those currently read here is in hex */
         if (wants_value) {
             if (isBLANK(*l)) {
                 ++l;
-
-               /* The ToLc, etc table mappings are not in hex, and must be
-                * corrected by adding the code point to them */
-               if (typeto) {
-                   char *after_strtol = (char *) lend;
-                   *val = Strtol((char *)l, &after_strtol, 10);
-                   l = (U8 *) after_strtol;
-               }
-               else { /* Other tables are in hex, and are the correct result
-                         without tweaking */
-                   flags = PERL_SCAN_SILENT_ILLDIGIT
-                       | PERL_SCAN_DISALLOW_PREFIX
-                       | PERL_SCAN_SILENT_NON_PORTABLE;
-                   numlen = lend - l;
-                   *val = grok_hex((char *)l, &numlen, &flags, NULL);
-                   if (numlen)
-                       l += numlen;
-                   else
-                       *val = 0;
-               }
+                flags = PERL_SCAN_SILENT_ILLDIGIT
+                    | PERL_SCAN_DISALLOW_PREFIX
+                    | PERL_SCAN_SILENT_NON_PORTABLE;
+                numlen = lend - l;
+                *val = grok_hex((char *)l, &numlen, &flags, NULL);
+                if (numlen)
+                    l += numlen;
+                else
+                    *val = 0;
             }
             else {
                 *val = 0;
@@ -4173,68 +4260,6 @@ Perl__get_swash_invlist(pTHX_ SV* const swash)
      return *ptr;
  }
  
-/*
-=for apidoc uvchr_to_utf8
-
-Adds the UTF-8 representation of the Native code point C<uv> to the end
-of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
-bytes available. The return value is the pointer to the byte after the
-end of the new character. In other words,
-
-    d = uvchr_to_utf8(d, uv);
-
-is the recommended wide native character-aware way of saying
-
-    *(d++) = uv;
-
-=cut
-*/
-
-/* On ASCII machines this is normally a macro but we want a
-   real function in case XS code wants it
-*/
-U8 *
-Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
-{
-    PERL_ARGS_ASSERT_UVCHR_TO_UTF8;
-
-    return Perl_uvoffuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
-}
-
-U8 *
-Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
-{
-    PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS;
-
-    return Perl_uvoffuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
-}
-
-/*
-=for apidoc utf8n_to_uvchr
-
-Returns the native character value of the first character in the string
-C<s>
-which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
-length, in bytes, of that character.
-
-C<length> and C<flags> are the same as L</utf8n_to_uvoffuni>().
-
-=cut
-*/
-/* On ASCII machines this is normally a macro but we want
-   a real function in case XS code wants it
-*/
-UV
-Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
-U32 flags)
-{
-    const UV uv = Perl_utf8n_to_uvoffuni(aTHX_ s, curlen, retlen, flags);
-
-    PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
-
-    return UNI_TO_NATIVE(uv);
-}
-
  bool
  Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
  {
@@ -4467,10 +4492,18 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c
  
      PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
  
-    /* The algorithm requires that input with the flags on the first line of
-     * the assert not be pre-folded. */
      assert( ! ((flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_UTF8_LOCALE))
-       && (flags & (FOLDEQ_S1_ALREADY_FOLDED | FOLDEQ_S2_ALREADY_FOLDED))));
+           && (flags & (FOLDEQ_S1_ALREADY_FOLDED | FOLDEQ_S2_ALREADY_FOLDED))));
+    /* The algorithm is to trial the folds without regard to the flags on
+     * the first line of the above assert(), and then see if the result
+     * violates them.  This means that the inputs can't be pre-folded to a
+     * violating result, hence the assert.  This could be changed, with the
+     * addition of extra tests here for the already-folded case, which would
+     * slow it down.  That cost is more than any possible gain for when these
+     * flags are specified, as the flags indicate /il or /iaa matching which
+     * is less common than /iu, and I (khw) also believe that real-world /il
+     * and /iaa matches are most likely to involve code points 0-255, and this
+     * function only under rare conditions gets called for 0-255. */
  
      if (pe1) {
          e1 = *(U8**)pe1;
@@ -4667,6 +4700,66 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const c
      return 1;
  }
  
+/* XXX The next four functions should likely be moved to mathoms.c once all
+ * occurrences of them are removed from the core; some cpan-upstream modules
+ * still use them */
+
+U8 *
+Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
+{
+    PERL_ARGS_ASSERT_UVUNI_TO_UTF8;
+
+    return Perl_uvoffuni_to_utf8_flags(aTHX_ d, uv, 0);
+}
+
+UV
+Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
+{
+    PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
+
+    return NATIVE_TO_UNI(utf8n_to_uvchr(s, curlen, retlen, flags));
+}
+
+/*
+=for apidoc uvuni_to_utf8_flags
+
+Instead you almost certainly want to use L</uvchr_to_utf8> or
+L</uvchr_to_utf8_flags>>.
+
+This function is a deprecated synonym for L</uvoffuni_to_utf8_flags>,
+which itself, while not deprecated, should be used only in isolated
+circumstances.  These functions were useful for code that wanted to handle
+both EBCDIC and ASCII platforms with Unicode properties, but starting in Perl
+v5.20, the distinctions between the platforms have mostly been made invisible
+to most code, so this function is quite unlikely to be what you want.
+
+=cut
+*/
+
+U8 *
+Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
+{
+    PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
+
+    return uvoffuni_to_utf8_flags(d, uv, flags);
+}
+
+/*
+=for apidoc utf8n_to_uvuni
+
+Instead use L</utf8_to_uvchr_buf>, or rarely, L</utf8n_to_uvchr>.
+
+This function was useful for code that wanted to handle both EBCDIC and
+ASCII platforms with Unicode properties, but starting in Perl v5.20, the
+distinctions between the platforms have mostly been made invisible to most
+code, so this function is quite unlikely to be what you want.  If you do need
+this precise functionality, use instead
+C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>>
+or C<L<NATIVE_TO_UNI(utf8n_to_uvchr(...))|/utf8n_to_uvchr>>.
+
+=cut
+*/
+
  /*
   * Local variables:
   * c-indentation-style: bsd