utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
   4  *    by Larry Wall and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  * 'What a fix!' said Sam.  'That's the one place in all the lands we've ever
  13  *  heard of that we don't want to see any closer; and that's the one place
  14  *  we're trying to get to!  And that's just where we can't get, nohow.'
  15  *
  16  *     [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
  17  *
  18  * 'Well do I understand your speech,' he answered in the same language;
  19  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  20  *  as is the custom in the West, if you wish to be answered?'
  21  *                           --Gandalf, addressing Théoden's door wardens
  22  *
  23  *     [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
  24  *
  25  * ...the travellers perceived that the floor was paved with stones of many
  26  * hues; branching runes and strange devices intertwined beneath their feet.
  27  *
  28  *     [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
  29  */
  30
  31 #include "EXTERN.h"
  32 #define PERL_IN_UTF8_C
  33 #include "perl.h"
  34 #include "invlist_inline.h"
  35
/* Message text describing UTF-8 input that ends before a character is
 * complete. */
static const char unees[] =
    "Malformed UTF-8 character (unexpected end of string)";

/* printf-style format for the deprecation warning on very high code points.
 * It consumes two UV arguments: the offending code point, then the largest
 * permitted value. */
static const char cp_above_legal_max[] =
 "Use of code point 0x%"UVXf" is deprecated; the permissible max is 0x%"UVXf"";

/* Code points greater than this (IV_MAX viewed as a UV) are the ones that
 * draw the deprecation warning above. */
#define MAX_NON_DEPRECATED_CP ((UV) (IV_MAX))
  42
  43 /*
  44 =head1 Unicode Support
  45 These are various utility functions for manipulating UTF8-encoded
  46 strings.  For the uninitiated, this is a method of representing arbitrary
  47 Unicode characters as a variable number of bytes, in such a way that
  48 characters in the ASCII range are unmodified, and a zero byte never appears
  49 within non-zero characters.
  50
  51 =cut
  52 */
  53
  54 /*
  55 =for apidoc uvoffuni_to_utf8_flags
  56
  57 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
   58 Instead, B<almost all code should use L</uvchr_to_utf8> or
  59 L</uvchr_to_utf8_flags>>.
  60
  61 This function is like them, but the input is a strict Unicode
  62 (as opposed to native) code point.  Only in very rare circumstances should code
  63 not be using the native code point.
  64
  65 For details, see the description for L</uvchr_to_utf8_flags>.
  66
  67 =cut
  68 */
  69
  70 #define HANDLE_UNICODE_SURROGATE(uv, flags)                         \
  71     STMT_START {                                                    \
  72         if (flags & UNICODE_WARN_SURROGATE) {                       \
  73             Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE),        \
  74                                 "UTF-16 surrogate U+%04"UVXf, uv);  \
  75         }                                                           \
  76         if (flags & UNICODE_DISALLOW_SURROGATE) {                   \
  77             return NULL;                                            \
  78         }                                                           \
  79     } STMT_END;
  80
  81 #define HANDLE_UNICODE_NONCHAR(uv, flags)                           \
  82     STMT_START {                                                    \
  83         if (flags & UNICODE_WARN_NONCHAR) {                         \
  84             Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR),          \
  85                  "Unicode non-character U+%04"UVXf" is not "        \
  86                  "recommended for open interchange", uv);           \
  87         }                                                           \
  88         if (flags & UNICODE_DISALLOW_NONCHAR) {                     \
  89             return NULL;                                            \
  90         }                                                           \
  91     } STMT_END;
  92
/*  Use shorter names internally in this file.  SHIFT is the number of payload
 *  bits each continuation byte contributes, MASK selects those bits, and MARK
 *  is OR'd in to form a continuation byte (see their uses in the encoder
 *  below).  MARK is #undef'd first, presumably because some earlier header
 *  already defines that name -- TODO confirm. */
#define SHIFT   UTF_ACCUMULATION_SHIFT
#undef  MARK
#define MARK    UTF_CONTINUATION_MARK
#define MASK    UTF_CONTINUATION_MASK
  98
  99 U8 *
 100 Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
 101 {
 102     PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
 103
 104     if (OFFUNI_IS_INVARIANT(uv)) {
 105         *d++ = LATIN1_TO_NATIVE(uv);
 106         return d;
 107     }
 108
 109     if (uv <= MAX_UTF8_TWO_BYTE) {
 110         *d++ = I8_TO_NATIVE_UTF8(( uv >> SHIFT) | UTF_START_MARK(2));
 111         *d++ = I8_TO_NATIVE_UTF8(( uv           & MASK) |   MARK);
 112         return d;
 113     }
 114
 115     /* Not 2-byte; test for and handle 3-byte result.   In the test immediately
 116      * below, the 16 is for start bytes E0-EF (which are all the possible ones
 117      * for 3 byte characters).  The 2 is for 2 continuation bytes; these each
 118      * contribute SHIFT bits.  This yields 0x4000 on EBCDIC platforms, 0x1_0000
 119      * on ASCII; so 3 bytes covers the range 0x400-0x3FFF on EBCDIC;
 120      * 0x800-0xFFFF on ASCII */
 121     if (uv < (16 * (1U << (2 * SHIFT)))) {
 122         *d++ = I8_TO_NATIVE_UTF8(( uv >> ((3 - 1) * SHIFT)) | UTF_START_MARK(3));
 123         *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) |   MARK);
 124         *d++ = I8_TO_NATIVE_UTF8(( uv  /* (1 - 1) */        & MASK) |   MARK);
 125
 126 #ifndef EBCDIC  /* These problematic code points are 4 bytes on EBCDIC, so
 127                    aren't tested here */
 128         /* The most likely code points in this range are below the surrogates.
 129          * Do an extra test to quickly exclude those. */
 130         if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST)) {
 131             if (UNLIKELY(   UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)
 132                          || UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
 133             {
 134                 HANDLE_UNICODE_NONCHAR(uv, flags);
 135             }
 136             else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
 137                 HANDLE_UNICODE_SURROGATE(uv, flags);
 138             }
 139         }
 140 #endif
 141         return d;
 142     }
 143
 144     /* Not 3-byte; that means the code point is at least 0x1_0000 on ASCII
 145      * platforms, and 0x4000 on EBCDIC.  There are problematic cases that can
 146      * happen starting with 4-byte characters on ASCII platforms.  We unify the
 147      * code for these with EBCDIC, even though some of them require 5-bytes on
 148      * those, because khw believes the code saving is worth the very slight
 149      * performance hit on these high EBCDIC code points. */
 150
 151     if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
 152         if (   UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
 153             && ckWARN_d(WARN_DEPRECATED))
 154         {
 155             Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
 156                         cp_above_legal_max, uv, MAX_NON_DEPRECATED_CP);
 157         }
 158         if (   (flags & UNICODE_WARN_SUPER)
 159             || (   UNICODE_IS_ABOVE_31_BIT(uv)
 160                 && (flags & UNICODE_WARN_ABOVE_31_BIT)))
 161         {
 162             Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
 163
 164               /* Choose the more dire applicable warning */
 165               (UNICODE_IS_ABOVE_31_BIT(uv))
 166               ? "Code point 0x%"UVXf" is not Unicode, and not portable"
 167               : "Code point 0x%"UVXf" is not Unicode, may not be portable",
 168              uv);
 169         }
 170         if (flags & UNICODE_DISALLOW_SUPER
 171             || (   UNICODE_IS_ABOVE_31_BIT(uv)
 172                 && (flags & UNICODE_DISALLOW_ABOVE_31_BIT)))
 173         {
 174             return NULL;
 175         }
 176     }
 177     else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) {
 178         HANDLE_UNICODE_NONCHAR(uv, flags);
 179     }
 180
 181     /* Test for and handle 4-byte result.   In the test immediately below, the
 182      * 8 is for start bytes F0-F7 (which are all the possible ones for 4 byte
 183      * characters).  The 3 is for 3 continuation bytes; these each contribute
 184      * SHIFT bits.  This yields 0x4_0000 on EBCDIC platforms, 0x20_0000 on
 185      * ASCII, so 4 bytes covers the range 0x4000-0x3_FFFF on EBCDIC;
 186      * 0x1_0000-0x1F_FFFF on ASCII */
 187     if (uv < (8 * (1U << (3 * SHIFT)))) {
 188         *d++ = I8_TO_NATIVE_UTF8(( uv >> ((4 - 1) * SHIFT)) | UTF_START_MARK(4));
 189         *d++ = I8_TO_NATIVE_UTF8(((uv >> ((3 - 1) * SHIFT)) & MASK) |   MARK);
 190         *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) |   MARK);
 191         *d++ = I8_TO_NATIVE_UTF8(( uv  /* (1 - 1) */        & MASK) |   MARK);
 192
 193 #ifdef EBCDIC   /* These were handled on ASCII platforms in the code for 3-byte
 194                    characters.  The end-plane non-characters for EBCDIC were
 195                    handled just above */
 196         if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv))) {
 197             HANDLE_UNICODE_NONCHAR(uv, flags);
 198         }
 199         else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
 200             HANDLE_UNICODE_SURROGATE(uv, flags);
 201         }
 202 #endif
 203
 204         return d;
 205     }
 206
 207     /* Not 4-byte; that means the code point is at least 0x20_0000 on ASCII
 208      * platforms, and 0x4000 on EBCDIC.  At this point we switch to a loop
 209      * format.  The unrolled version above turns out to not save all that much
 210      * time, and at these high code points (well above the legal Unicode range
 211      * on ASCII platforms, and well above anything in common use in EBCDIC),
 212      * khw believes that less code outweighs slight performance gains. */
 213
 214     {
 215         STRLEN len  = OFFUNISKIP(uv);
 216         U8 *p = d+len-1;
 217         while (p > d) {
 218             *p-- = I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
 219             uv >>= UTF_ACCUMULATION_SHIFT;
 220         }
 221         *p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
 222         return d+len;
 223     }
 224 }
 225
 226 /*
 227 =for apidoc uvchr_to_utf8
 228
 229 Adds the UTF-8 representation of the native code point C<uv> to the end
 230 of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
 231 C<UTF8_MAXBYTES+1>) free bytes available.  The return value is the pointer to
 232 the byte after the end of the new character.  In other words,
 233
 234     d = uvchr_to_utf8(d, uv);
 235
 236 is the recommended wide native character-aware way of saying
 237
 238     *(d++) = uv;
 239
 240 This function accepts any UV as input, but very high code points (above
 241 C<IV_MAX> on the platform)  will raise a deprecation warning.  This is
 242 typically 0x7FFF_FFFF in a 32-bit word.
 243
 244 It is possible to forbid or warn on non-Unicode code points, or those that may
 245 be problematic by using L</uvchr_to_utf8_flags>.
 246
 247 =cut
 248 */
 249
 250 /* This is also a macro */
 251 PERL_CALLCONV U8*       Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
 252
 253 U8 *
 254 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
 255 {
 256     return uvchr_to_utf8(d, uv);
 257 }
 258
 259 /*
 260 =for apidoc uvchr_to_utf8_flags
 261
 262 Adds the UTF-8 representation of the native code point C<uv> to the end
 263 of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
 264 C<UTF8_MAXBYTES+1>) free bytes available.  The return value is the pointer to
 265 the byte after the end of the new character.  In other words,
 266
 267     d = uvchr_to_utf8_flags(d, uv, flags);
 268
 269 or, in most cases,
 270
 271     d = uvchr_to_utf8_flags(d, uv, 0);
 272
 273 This is the Unicode-aware way of saying
 274
 275     *(d++) = uv;
 276
 277 If C<flags> is 0, this function accepts any UV as input, but very high code
 278 points (above C<IV_MAX> for the platform)  will raise a deprecation warning.
 279 This is typically 0x7FFF_FFFF in a 32-bit word.
 280
 281 Specifying C<flags> can further restrict what is allowed and not warned on, as
 282 follows:
 283
 284 If C<uv> is a Unicode surrogate code point and C<UNICODE_WARN_SURROGATE> is set,
 285 the function will raise a warning, provided UTF8 warnings are enabled.  If
 286 instead C<UNICODE_DISALLOW_SURROGATE> is set, the function will fail and return
 287 NULL.  If both flags are set, the function will both warn and return NULL.
 288
 289 Similarly, the C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags
 290 affect how the function handles a Unicode non-character.
 291
 292 And likewise, the C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags
 293 affect the handling of code points that are above the Unicode maximum of
 294 0x10FFFF.  Languages other than Perl may not be able to accept files that
 295 contain these.
 296
 297 The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of
 298 the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all
 299 three DISALLOW flags.  C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> restricts the
 300 allowed inputs to the strict UTF-8 traditionally defined by Unicode.
 301 Similarly, C<UNICODE_WARN_ILLEGAL_C9_INTERCHANGE> and
 302 C<UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE> are shortcuts to select the
 303 above-Unicode and surrogate flags, but not the non-character ones, as
 304 defined in
 305 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
 306 See L<perlunicode/Noncharacter code points>.
 307
 308 Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
 309 so using them is more problematic than other above-Unicode code points.  Perl
 310 invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
  311 likely that non-Perl languages will not be able to read files containing
  312 these that were written by the perl interpreter; nor would Perl understand files
 313 written by something that uses a different extension.  For these reasons, there
 314 is a separate set of flags that can warn and/or disallow these extremely high
 315 code points, even if other above-Unicode ones are accepted.  These are the
 316 C<UNICODE_WARN_ABOVE_31_BIT> and C<UNICODE_DISALLOW_ABOVE_31_BIT> flags.  These
 317 are entirely independent from the deprecation warning for code points above
 318 C<IV_MAX>.  On 32-bit machines, it will eventually be forbidden to have any
 319 code point that needs more than 31 bits to represent.  When that happens,
 320 effectively the C<UNICODE_DISALLOW_ABOVE_31_BIT> flag will always be set on
 321 32-bit machines.  (Of course C<UNICODE_DISALLOW_SUPER> will treat all
 322 above-Unicode code points, including these, as malformations; and
 323 C<UNICODE_WARN_SUPER> warns on these.)
 324
 325 On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
 326 extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
 327 than on ASCII.  Prior to that, code points 2**31 and higher were simply
 328 unrepresentable, and a different, incompatible method was used to represent
 329 code points between 2**30 and 2**31 - 1.  The flags C<UNICODE_WARN_ABOVE_31_BIT>
 330 and C<UNICODE_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
 331 platforms, warning and disallowing 2**31 and higher.
 332
 333 =cut
 334 */
 335
 336 /* This is also a macro */
 337 PERL_CALLCONV U8*       Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
 338
 339 U8 *
 340 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
 341 {
 342     return uvchr_to_utf8_flags(d, uv, flags);
 343 }
 344
 345 PERL_STATIC_INLINE bool
 346 S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e)
 347 {
 348     /* Returns TRUE if the first code point represented by the Perl-extended-
 349      * UTF-8-encoded string starting at 's', and looking no further than 'e -
 350      * 1' doesn't fit into 31 bytes.  That is, that if it is >= 2**31.
 351      *
 352      * The function handles the case where the input bytes do not include all
 353      * the ones necessary to represent a full character.  That is, they may be
 354      * the intial bytes of the representation of a code point, but possibly
 355      * the final ones necessary for the complete representation may be beyond
 356      * 'e - 1'.
 357      *
 358      * The function assumes that the sequence is well-formed UTF-8 as far as it
 359      * goes, and is for a UTF-8 variant code point.  If the sequence is
 360      * incomplete, the function returns FALSE if there is any well-formed
 361      * UTF-8 byte sequence that can complete it in such a way that a code point
 362      * < 2**31 is produced; otherwise it returns TRUE.
 363      *
 364      * Getting this exactly right is slightly tricky, and has to be done in
 365      * several places in this file, so is centralized here.  It is based on the
 366      * following table:
 367      *
 368      * U+7FFFFFFF (2 ** 31 - 1)
 369      *      ASCII: \xFD\xBF\xBF\xBF\xBF\xBF
 370      *   IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73
 371      *    IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72
 372      *   POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75
 373      *         I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF
 374      * U+80000000 (2 ** 31):
 375      *      ASCII: \xFE\x82\x80\x80\x80\x80\x80
 376      *              [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10  11  12  13
 377      *   IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
 378      *    IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
 379      *   POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
 380      *         I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0
 381      */
 382
 383 #ifdef EBCDIC
 384
 385         /* [0] is start byte    [1] [2] [3] [4] [5] [6] [7] */
 386     const U8 * const prefix = "\x41\x41\x41\x41\x41\x41\x42";
 387     const STRLEN prefix_len = sizeof(prefix) - 1;
 388     const STRLEN len = e - s;
 389     const cmp_len = MIN(prefix_len, len - 1);
 390
 391 #else
 392
 393     PERL_UNUSED_ARG(e);
 394
 395 #endif
 396
 397     PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS;
 398
 399     assert(! UTF8_IS_INVARIANT(*s));
 400
 401 #ifndef EBCDIC
 402
 403     /* Technically, a start byte of FE can be for a code point that fits into
 404      * 31 bytes, but not for well-formed UTF-8: doing that requires an overlong
 405      * malformation. */
 406     return (*s >= 0xFE);
 407
 408 #else
 409
 410     /* On the EBCDIC code pages we handle, only 0xFE can mean a 32-bit or
 411      * larger code point (0xFF is an invariant).  For 0xFE, we need at least 2
 412      * bytes, and maybe up through 8 bytes, to be sure if the value is above 31
 413      * bits. */
 414     if (*s != 0xFE || len == 1) {
 415         return FALSE;
 416     }
 417
 418     /* Note that in UTF-EBCDIC, the two lowest possible continuation bytes are
 419      * \x41 and \x42. */
 420     return cBOOL(memGT(s + 1, prefix, cmp_len));
 421
 422 #endif
 423
 424 }
 425
 426 STRLEN
 427 Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
 428 {
 429     STRLEN len;
 430     const U8 *x, *y;
 431
 432     /* A helper function that should not be called directly.
 433      *
 434      * This function returns non-zero if the string beginning at 's' and
 435      * looking no further than 'e - 1' is well-formed Perl-extended-UTF-8 for a
 436      * code point; otherwise it returns 0.  The examination stops after the
 437      * first code point in 's' is validated, not looking at the rest of the
 438      * input.  If 'e' is such that there are not enough bytes to represent a
 439      * complete code point, this function will return non-zero anyway, if the
 440      * bytes it does have are well-formed UTF-8 as far as they go, and aren't
 441      * excluded by 'flags'.
 442      *
 443      * A non-zero return gives the number of bytes required to represent the
 444      * code point.  Be aware that if the input is for a partial character, the
 445      * return will be larger than 'e - s'.
 446      *
 447      * This function assumes that the code point represented is UTF-8 variant.
 448      * The caller should have excluded this possibility before calling this
 449      * function.
 450      *
 451      * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
 452      * accepted by L</utf8n_to_uvchr>.  If non-zero, this function will return
 453      * 0 if the code point represented is well-formed Perl-extended-UTF-8, but
 454      * disallowed by the flags.  If the input is only for a partial character,
 455      * the function will return non-zero if there is any sequence of
 456      * well-formed UTF-8 that, when appended to the input sequence, could
 457      * result in an allowed code point; otherwise it returns 0.  Non characters
 458      * cannot be determined based on partial character input.  But many  of the
 459      * other excluded types can be determined with just the first one or two
 460      * bytes.
 461      *
 462      */
 463
 464     PERL_ARGS_ASSERT__IS_UTF8_CHAR_HELPER;
 465
 466     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
 467                           |UTF8_DISALLOW_ABOVE_31_BIT)));
 468     assert(! UTF8_IS_INVARIANT(*s));
 469
 470     /* A variant char must begin with a start byte */
 471     if (UNLIKELY(! UTF8_IS_START(*s))) {
 472         return 0;
 473     }
 474
 475     /* Examine a maximum of a single whole code point */
 476     if (e - s > UTF8SKIP(s)) {
 477         e = s + UTF8SKIP(s);
 478     }
 479
 480     len = e - s;
 481
 482     if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) {
 483         const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
 484
 485         /* The code below is derived from this table.  Keep in mind that legal
 486          * continuation bytes range between \x80..\xBF for UTF-8, and
 487          * \xA0..\xBF for I8.  Anything above those aren't continuation bytes.
 488          * Hence, we don't have to test the upper edge because if any of those
 489          * are encountered, the sequence is malformed, and will fail elsewhere
 490          * in this function.
 491          *              UTF-8            UTF-EBCDIC I8
 492          *   U+D800: \xED\xA0\x80      \xF1\xB6\xA0\xA0      First surrogate
 493          *   U+DFFF: \xED\xBF\xBF      \xF1\xB7\xBF\xBF      Final surrogate
 494          * U+110000: \xF4\x90\x80\x80  \xF9\xA2\xA0\xA0\xA0  First above Unicode
 495          *
 496          */
 497
 498 #ifdef EBCDIC   /* On EBCDIC, these are actually I8 bytes */
 499 #  define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER  0xFA
 500 #  define IS_SUPER_2_BYTE(s0, s1)                ((s0) == 0xF9 && (s1) >= 0xA2)
 501
 502                                                                /* B6 and B7 */
 503 #  define IS_SURROGATE(s0, s1)         ((s0) == 0xF1 && ((s1) & 0xFE ) == 0xB6)
 504 #else
 505 #  define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER  0xF5
 506 #  define IS_SUPER_2_BYTE(s0, s1)                ((s0) == 0xF4 && (s1) >= 0x90)
 507 #  define IS_SURROGATE(s0, s1)                   ((s0) == 0xED && (s1) >= 0xA0)
 508 #endif
 509
 510         if (  (flags & UTF8_DISALLOW_SUPER)
 511             && UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER)) {
 512             return 0;           /* Above Unicode */
 513         }
 514
 515         if (   (flags & UTF8_DISALLOW_ABOVE_31_BIT)
 516             &&  UNLIKELY(is_utf8_cp_above_31_bits(s, e)))
 517         {
 518             return 0;           /* Above 31 bits */
 519         }
 520
 521         if (len > 1) {
 522             const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
 523
 524             if (   (flags & UTF8_DISALLOW_SUPER)
 525                 &&  UNLIKELY(IS_SUPER_2_BYTE(s0, s1)))
 526             {
 527                 return 0;       /* Above Unicode */
 528             }
 529
 530             if (   (flags & UTF8_DISALLOW_SURROGATE)
 531                 &&  UNLIKELY(IS_SURROGATE(s0, s1)))
 532             {
 533                 return 0;       /* Surrogate */
 534             }
 535
 536             if (  (flags & UTF8_DISALLOW_NONCHAR)
 537                 && UNLIKELY(UTF8_IS_NONCHAR(s, e)))
 538             {
 539                 return 0;       /* Noncharacter code point */
 540             }
 541         }
 542     }
 543
 544     /* Make sure that all that follows are continuation bytes */
 545     for (x = s + 1; x < e; x++) {
 546         if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) {
 547             return 0;
 548         }
 549     }
 550
 551     /* Here is syntactically valid.  Next, make sure this isn't the start of an
 552      * overlong.  Overlongs can occur whenever the number of continuation bytes
 553      * changes.  That means whenever the number of leading 1 bits in a start
 554      * byte increases from the next lower start byte.  That happens for start
 555      * bytes C0, E0, F0, F8, FC, FE, and FF.  On modern perls, the following
 556      * illegal start bytes have already been excluded, so don't need to be
 557      * tested here;
 558      * ASCII platforms: C0, C1
 559      * EBCDIC platforms C0, C1, C2, C3, C4, E0
 560      *
 561      * At least a second byte is required to determine if other sequences will
 562      * be an overlong. */
 563
 564     if (len > 1) {
 565         const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
 566         const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
 567
 568         /* Each platform has overlongs after the start bytes given above
 569          * (expressed in I8 for EBCDIC).  What constitutes an overlong varies
 570          * by platform, but the logic is the same, except the E0 overlong has
 571          * already been excluded on EBCDIC platforms.   The  values below were
 572          * found by manually inspecting the UTF-8 patterns.  See the tables in
 573          * utf8.h and utfebcdic.h */
 574
 575 #       ifdef EBCDIC
 576 #           define F0_ABOVE_OVERLONG 0xB0
 577 #           define F8_ABOVE_OVERLONG 0xA8
 578 #           define FC_ABOVE_OVERLONG 0xA4
 579 #           define FE_ABOVE_OVERLONG 0xA2
 580 #           define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
 581                                       /* I8(0xfe) is FF */
 582 #       else
 583
 584         if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
 585             return 0;       /* Overlong */
 586         }
 587
 588 #           define F0_ABOVE_OVERLONG 0x90
 589 #           define F8_ABOVE_OVERLONG 0x88
 590 #           define FC_ABOVE_OVERLONG 0x84
 591 #           define FE_ABOVE_OVERLONG 0x82
 592 #           define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
 593 #       endif
 594
 595
 596         if (   (s0 == 0xF0 && UNLIKELY(s1 < F0_ABOVE_OVERLONG))
 597             || (s0 == 0xF8 && UNLIKELY(s1 < F8_ABOVE_OVERLONG))
 598             || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
 599             || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
 600         {
 601             return 0;       /* Overlong */
 602         }
 603
 604 #   if defined(UV_IS_QUAD) || defined(EBCDIC)
 605
 606         /* Check for the FF overlong.  This happens only if all these bytes
 607          * match; what comes after them doesn't matter.  See tables in utf8.h,
 608          * utfebcdic.h.  (Can't happen on ASCII 32-bit platforms, as overflows
 609          * instead.) */
 610
 611         if (   len >= sizeof(FF_OVERLONG_PREFIX) - 1
 612             && UNLIKELY(memEQ(s, FF_OVERLONG_PREFIX,
 613                                                sizeof(FF_OVERLONG_PREFIX) - 1)))
 614         {
 615             return 0;       /* Overlong */
 616         }
 617
 618 #endif
 619
 620     }
 621
 622     /* Finally, see if this would overflow a UV on this platform.  See if the
 623      * UTF8 for this code point is larger than that for the highest
 624      * representable code point.  (For ASCII platforms, we could use memcmp()
 625      * because we don't have to convert each byte to I8, but it's very rare
 626      * input indeed that would approach overflow, so the loop below will likely
 627      * only get executed once */
 628     y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
 629
 630     for (x = s; x < e; x++, y++) {
 631
 632         /* If the same as this byte, go on to the next */
 633         if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) == *y)) {
 634             continue;
 635         }
 636
 637         /* If this is larger, it overflows */
 638         if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) {
 639             return 0;
 640         }
 641
 642         /* But if smaller, it won't */
 643         break;
 644     }
 645
 646     return UTF8SKIP(s);
 647 }
 648
/* These macros are #define'd inside the body of Perl__is_utf8_char_helper()
 * above, with values that depend on the platform; remove the definitions now
 * that that function is complete. */
#undef FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER
#undef IS_SUPER_2_BYTE
#undef IS_SURROGATE
#undef F0_ABOVE_OVERLONG
#undef F8_ABOVE_OVERLONG
#undef FC_ABOVE_OVERLONG
#undef FE_ABOVE_OVERLONG
#undef FF_OVERLONG_PREFIX
 657
 658 /*
 659
 660 =for apidoc utf8n_to_uvchr
 661
 662 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
 663 Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
 664
 665 Bottom level UTF-8 decode routine.
 666 Returns the native code point value of the first character in the string C<s>,
 667 which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
 668 C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
 669 the length, in bytes, of that character.
 670
 671 The value of C<flags> determines the behavior when C<s> does not point to a
 672 well-formed UTF-8 character.  If C<flags> is 0, when a malformation is found,
 673 zero is returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>) is the
 674 next possible position in C<s> that could begin a non-malformed character.
 675 Also, if UTF-8 warnings haven't been lexically disabled, a warning is raised.
 676
 677 Various ALLOW flags can be set in C<flags> to allow (and not warn on)
 678 individual types of malformations, such as the sequence being overlong (that
 679 is, when there is a shorter sequence that can express the same code point;
 680 overlong sequences are expressly forbidden in the UTF-8 standard due to
 681 potential security issues).  Another malformation example is the first byte of
 682 a character not being a legal first byte.  See F<utf8.h> for the list of such
 683 flags.  For allowed 0 length strings, this function returns 0; for allowed
 684 overlong sequences, the computed code point is returned; for all other allowed
 685 malformations, the Unicode REPLACEMENT CHARACTER is returned, as these have no
 686 determinable reasonable value.
 687
 688 The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other
 689 flags) malformation is found.  If this flag is set, the routine assumes that
 690 the caller will raise a warning, and this function will silently just set
 691 C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
 692
 693 Note that this API requires disambiguation between successful decoding a C<NUL>
 694 character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as
 695 in both cases, 0 is returned, and, depending on the malformation, C<retlen> may
 696 be set to 1.  To disambiguate, upon a zero return, see if the first byte of
 697 C<s> is 0 as well.  If so, the input was a C<NUL>; if not, the input had an
 698 error.
 699
 700 Certain code points are considered problematic.  These are Unicode surrogates,
 701 Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
 702 By default these are considered regular code points, but certain situations
 703 warrant special handling for them, which can be specified using the C<flags>
 704 parameter.  If C<flags> contains C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all
 705 three classes are treated as malformations and handled as such.  The flags
 706 C<UTF8_DISALLOW_SURROGATE>, C<UTF8_DISALLOW_NONCHAR>, and
 707 C<UTF8_DISALLOW_SUPER> (meaning above the legal Unicode maximum) can be set to
 708 disallow these categories individually.  C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>
 709 restricts the allowed inputs to the strict UTF-8 traditionally defined by
 710 Unicode.  Use C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE> to use the strictness
 711 definition given by
 712 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
 713 The difference between traditional strictness and C9 strictness is that the
 714 latter does not forbid non-character code points.  (They are still discouraged,
 715 however.)  For more discussion see L<perlunicode/Noncharacter code points>.
 716
 717 The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>,
 718 C<UTF8_WARN_ILLEGAL_C9_INTERCHANGE>, C<UTF8_WARN_SURROGATE>,
 719 C<UTF8_WARN_NONCHAR>, and C<UTF8_WARN_SUPER> will cause warning messages to be
 720 raised for their respective categories, but otherwise the code points are
 721 considered valid (not malformations).  To get a category to both be treated as
 722 a malformation and raise a warning, specify both the WARN and DISALLOW flags.
 723 (But note that warnings are not raised if lexically disabled nor if
 724 C<UTF8_CHECK_ONLY> is also specified.)
 725
 726 It is now deprecated to have very high code points (above C<IV_MAX> on the
 727 platform) and this function will raise a deprecation warning for these (unless
 728 such warnings are turned off).  This value is typically 0x7FFF_FFFF (2**31 - 1)
 729 in a 32-bit word.
 730
 731 Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
 732 so using them is more problematic than other above-Unicode code points.  Perl
 733 invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
 734 likely that non-Perl languages will not be able to read files that contain
 735 these that are written by the perl interpreter; nor would Perl understand files
 736 written by something that uses a different extension.  For these reasons, there
 737 is a separate set of flags that can warn and/or disallow these extremely high
 738 code points, even if other above-Unicode ones are accepted.  These are the
 739 C<UTF8_WARN_ABOVE_31_BIT> and C<UTF8_DISALLOW_ABOVE_31_BIT> flags.  These
 740 are entirely independent from the deprecation warning for code points above
 741 C<IV_MAX>.  On 32-bit machines, it will eventually be forbidden to have any
 742 code point that needs more than 31 bits to represent.  When that happens,
 743 effectively the C<UTF8_DISALLOW_ABOVE_31_BIT> flag will always be set on
 744 32-bit machines.  (Of course C<UTF8_DISALLOW_SUPER> will treat all
 745 above-Unicode code points, including these, as malformations; and
 746 C<UTF8_WARN_SUPER> warns on these.)
 747
 748 On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
 749 extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
 750 than on ASCII.  Prior to that, code points 2**31 and higher were simply
 751 unrepresentable, and a different, incompatible method was used to represent
 752 code points between 2**30 and 2**31 - 1.  The flags C<UTF8_WARN_ABOVE_31_BIT>
 753 and C<UTF8_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
 754 platforms, warning and disallowing 2**31 and higher.
 755
 756 All other code points corresponding to Unicode characters, including private
 757 use and those yet to be assigned, are never considered malformed and never
 758 warn.
 759
 760 =cut
 761 */
 762
 763 UV
 764 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 765 {
 766     const U8 * const s0 = s;
 767     U8 overflow_byte = '\0';    /* Save byte in case of overflow */
 768     U8 * send;
 769     UV uv;                      /* Set from *s only once input is known non-empty */
 770     STRLEN expectlen;
 771     SV* sv = NULL;
 772     UV outlier_ret = 0; /* return value when input is in error or problematic
 773                          */
 774     UV pack_warn = 0;   /* Save result of packWARN() for later */
 775     bool unexpected_non_continuation = FALSE;
 776     bool overflowed = FALSE;
 777     bool do_overlong_test = TRUE;   /* May have to skip this test */
 778
 779     const char* const malformed_text = "Malformed UTF-8 character";
 780
 781     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
 782
 783     /* The order of malformation tests here is important.  We should consume as
 784      * few bytes as possible in order to not skip any valid character.  This is
 785      * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
 786      * http://unicode.org/reports/tr36 for more discussion as to why.  For
 787      * example, once we've done a UTF8SKIP, we can tell the expected number of
 788      * bytes, and could fail right off the bat if the input parameters indicate
 789      * that there are too few available.  But it could be that just that first
 790      * byte is garbled, and the intended character occupies fewer bytes.  If we
 791      * blindly assumed that the first byte is correct, and skipped based on
 792      * that number, we could skip over a valid input character.  So instead, we
 793      * always examine the sequence byte-by-byte.
 794      *
 795      * We also should not consume too few bytes, otherwise someone could inject
 796      * things.  For example, an input could be deliberately designed to
 797      * overflow, and if this code bailed out immediately upon discovering that,
 798      * returning to the caller C<*retlen> pointing to the very next byte (one
 799      * which is actually part of the overflowing sequence), that could look
 800      * legitimate to the caller, which could discard the initial partial
 801      * sequence and process the rest, inappropriately */
 802
 803     /* Zero length strings, if allowed, of necessity are zero */
 804     if (UNLIKELY(curlen == 0)) {
 805         if (retlen) {
 806             *retlen = 0;
 807         }
 808
 809         if (flags & UTF8_ALLOW_EMPTY) {
 810             return 0;
 811         }
 812         if (! (flags & UTF8_CHECK_ONLY)) {
 813             sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (empty string)", malformed_text));
 814         }
 815         goto malformed;
 816     }
 817
 818     expectlen = UTF8SKIP(s);
 819     uv = *s;    /* First byte; only safe to read now that curlen > 0 */
 820     /* A well-formed UTF-8 character, as the vast majority of calls to this
 821      * function will be for, has this expected length.  For efficiency, set
 822      * things up here to return it.  It will be overridden only in those rare
 823      * cases where a malformation is found */
 824     if (retlen) {
 825         *retlen = expectlen;
 826     }
 827
 828     /* An invariant is trivially well-formed */
 829     if (UTF8_IS_INVARIANT(uv)) {
 830         return uv;
 831     }
 832
 833     /* A continuation character can't start a valid sequence */
 834     if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
 835         if (flags & UTF8_ALLOW_CONTINUATION) {
 836             if (retlen) {
 837                 *retlen = 1;
 838             }
 839             return UNICODE_REPLACEMENT;
 840         }
 841
 842         if (! (flags & UTF8_CHECK_ONLY)) {
 843             sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected continuation byte 0x%02x, with no preceding start byte)", malformed_text, *s0));
 844         }
 845         curlen = 1;
 846         goto malformed;
 847     }
 848
 849     /* Here is not a continuation byte, nor an invariant.  The only thing left
 850      * is a start byte (possibly for an overlong) */
 851
 852     /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
 853      * that indicate the number of bytes in the character's whole UTF-8
 854      * sequence, leaving just the bits that are part of the value.  */
 855     uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
 856
 857     /* Now, loop through the remaining bytes in the character's sequence,
 858      * accumulating each into the working value as we go.  Be sure to not look
 859      * past the end of the input string */
 860     send =  (U8*) s0 + ((expectlen <= curlen) ? expectlen : curlen);
 861
 862     for (s = s0 + 1; s < send; s++) {
 863         if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
 864             if (uv & UTF_ACCUMULATION_OVERFLOW_MASK) {
 865
 866                 /* The original implementors viewed this malformation as more
 867                  * serious than the others (though I, khw, don't understand
 868                  * why, since other malformations also give very very wrong
 869                  * results), so there is no way to turn off checking for it.
 870                  * Set a flag, but keep going in the loop, so that we absorb
 871                  * the rest of the bytes that comprise the character. */
 872                 overflowed = TRUE;
 873                 overflow_byte = *s; /* Save for warning message's use */
 874             }
 875             uv = UTF8_ACCUMULATE(uv, *s);
 876         }
 877         else {
 878             /* Here, found a non-continuation before processing all expected
 879              * bytes.  This byte begins a new character, so quit, even if
 880              * allowing this malformation. */
 881             unexpected_non_continuation = TRUE;
 882             break;
 883         }
 884     } /* End of loop through the character's bytes */
 885
 886     /* Save how many bytes were actually in the character */
 887     curlen = s - s0;
 888
 889     /* The loop above finds two types of malformations: non-continuation and/or
 890      * overflow.  The non-continuation malformation is really a too-short
 891      * malformation, as it means that the current character ended before it was
 892      * expected to (being terminated prematurely by the beginning of the next
 893      * character, whereas in the too-short malformation there just are too few
 894      * bytes available to hold the character.  In both cases, the check below
 895      * that we have found the expected number of bytes would fail if executed.)
 896      * Thus the non-continuation malformation is really unnecessary, being a
 897      * subset of the too-short malformation.  But there may be existing
 898      * applications that are expecting the non-continuation type, so we retain
 899      * it, and return it in preference to the too-short malformation.  (If this
 900      * code were being written from scratch, the two types might be collapsed
 901      * into one.)  I, khw, am also giving priority to returning the
 902      * non-continuation and too-short malformations over overflow when multiple
 903      * ones are present.  I don't know of any real reason to prefer one over
 904      * the other, except that it seems to me that multiple-byte errors trumps
 905      * errors from a single byte */
 906     if (UNLIKELY(unexpected_non_continuation)) {
 907         if (!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 908             if (! (flags & UTF8_CHECK_ONLY)) {
 909                 if (curlen == 1) {
 910                     sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, immediately after start byte 0x%02x)", malformed_text, *s, *s0));
 911                 }
 912                 else {
 913                     sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, %d bytes after start byte 0x%02x, expected %d bytes)", malformed_text, *s, (int) curlen, *s0, (int)expectlen));
 914                 }
 915             }
 916             goto malformed;
 917         }
 918         uv = UNICODE_REPLACEMENT;
 919
 920         /* Skip testing for overlongs, as the REPLACEMENT may not be the same
 921          * as what the original expectations were. */
 922         do_overlong_test = FALSE;
 923         if (retlen) {
 924             *retlen = curlen;
 925         }
 926     }
 927     else if (UNLIKELY(curlen < expectlen)) {
 928         if (! (flags & UTF8_ALLOW_SHORT)) {
 929             if (! (flags & UTF8_CHECK_ONLY)) {
 930                 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, *s0));
 931             }
 932             goto malformed;
 933         }
 934         uv = UNICODE_REPLACEMENT;
 935         do_overlong_test = FALSE;
 936         if (retlen) {
 937             *retlen = curlen;
 938         }
 939     }
 940
 941     if (UNLIKELY(overflowed)) {
 942         sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (overflow at byte 0x%02x, after start byte 0x%02x)", malformed_text, overflow_byte, *s0));
 943         goto malformed;
 944     }
 945
 946     if (do_overlong_test
 947         && expectlen > (STRLEN) OFFUNISKIP(uv)
 948         && ! (flags & UTF8_ALLOW_LONG))
 949     {
 950         /* The overlong malformation has lower precedence than the others.
 951          * Note that if this malformation is allowed, we return the actual
 952          * value, instead of the replacement character.  This is because this
 953          * value is actually well-defined. */
 954         if (! (flags & UTF8_CHECK_ONLY)) {
 955             sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)expectlen, expectlen == 1 ? "": "s", OFFUNISKIP(uv), *s0));
 956         }
 957         goto malformed;
 958     }
 959
 960     /* Here, the input is considered to be well-formed, but it still could be a
 961      * problematic code point that is not allowed by the input parameters. */
 962     if (uv >= UNICODE_SURROGATE_FIRST /* isn't problematic if < this */
 963         && ((flags & ( UTF8_DISALLOW_NONCHAR
 964                       |UTF8_DISALLOW_SURROGATE
 965                       |UTF8_DISALLOW_SUPER
 966                       |UTF8_DISALLOW_ABOVE_31_BIT
 967                       |UTF8_WARN_NONCHAR
 968                       |UTF8_WARN_SURROGATE
 969                       |UTF8_WARN_SUPER
 970                       |UTF8_WARN_ABOVE_31_BIT))
 971             || (   UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
 972                 && ckWARN_d(WARN_DEPRECATED))))
 973     {
 974         if (UNICODE_IS_SURROGATE(uv)) {
 975
 976             /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
 977              * generation of the sv, since no warnings are raised under CHECK */
 978             if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE
 979                 && ckWARN_d(WARN_SURROGATE))
 980             {
 981                 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv));
 982                 pack_warn = packWARN(WARN_SURROGATE);
 983             }
 984             if (flags & UTF8_DISALLOW_SURROGATE) {
 985                 goto disallowed;
 986             }
 987         }
 988         else if ((uv > PERL_UNICODE_MAX)) {
 989             if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER
 990                 && ckWARN_d(WARN_NON_UNICODE))
 991             {
 992                 sv = sv_2mortal(Perl_newSVpvf(aTHX_
 993                    "Code point 0x%04"UVXf" is not Unicode, may not be portable",
 994                    uv));
 995                 pack_warn = packWARN(WARN_NON_UNICODE);
 996             }
 997
 998             /* The maximum code point ever specified by a standard was
 999              * 2**31 - 1.  Anything larger than that is a Perl extension that
1000              * very well may not be understood by other applications (including
1001              * earlier perl versions on EBCDIC platforms).  We test for these
1002              * after the regular SUPER ones, and before possibly bailing out,
1003              * so that the slightly more dire warning will override the regular
1004              * one. */
1005             if (   (flags & (UTF8_WARN_ABOVE_31_BIT
1006                             |UTF8_WARN_SUPER
1007                             |UTF8_DISALLOW_ABOVE_31_BIT))
1008                 && UNLIKELY(is_utf8_cp_above_31_bits(s0, send)))
1009             {
1010                 if (  ! (flags & UTF8_CHECK_ONLY)
1011                     &&  (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_WARN_SUPER))
1012                     &&  ckWARN_d(WARN_UTF8))
1013                 {
1014                     sv = sv_2mortal(Perl_newSVpvf(aTHX_
1015                         "Code point 0x%"UVXf" is not Unicode, and not portable",
1016                         uv));
1017                     pack_warn = packWARN(WARN_UTF8);
1018                 }
1019                 if (flags & UTF8_DISALLOW_ABOVE_31_BIT) {
1020                     goto disallowed;
1021                 }
1022             }
1023
1024             if (flags & UTF8_DISALLOW_SUPER) {
1025                 goto disallowed;
1026             }
1027
1028             /* The deprecated warning overrides any non-deprecated one */
1029             if (UNLIKELY(uv > MAX_NON_DEPRECATED_CP) && ckWARN_d(WARN_DEPRECATED))
1030             {
1031                 sv = sv_2mortal(Perl_newSVpvf(aTHX_ cp_above_legal_max,
1032                                               uv, MAX_NON_DEPRECATED_CP));
1033                 pack_warn = packWARN(WARN_DEPRECATED);
1034             }
1035         }
1036         else if (UNICODE_IS_NONCHAR(uv)) {
1037             if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR
1038                 && ckWARN_d(WARN_NONCHAR))
1039             {
1040                 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is not recommended for open interchange", uv));
1041                 pack_warn = packWARN(WARN_NONCHAR);
1042             }
1043             if (flags & UTF8_DISALLOW_NONCHAR) {
1044                 goto disallowed;
1045             }
1046         }
1047
1048         if (sv) {
1049             outlier_ret = uv;   /* Note we don't bother to convert to native,
1050                                    as all the outlier code points are the same
1051                                    in both ASCII and EBCDIC */
1052             goto do_warn;
1053         }
1054
1055         /* Here, this is not considered a malformed character, so drop through
1056          * to return it */
1057     }
1058
1059     return UNI_TO_NATIVE(uv);
1060
1061     /* There are three cases which get to beyond this point.  In all 3 cases:
1062      * <sv>         if not null points to a string to print as a warning.
1063      * <curlen>     is what <*retlen> should be set to if UTF8_CHECK_ONLY isn't
1064      *              set.
1065      * <outlier_ret> is what return value to use if UTF8_CHECK_ONLY isn't set.
1066      *              This is done by initializing it to 0, and changing it only
1067      *              for case 1).
1068      * The 3 cases are:
1069      * 1)   The input is valid but problematic, and to be warned about.  The
1070      *      return value is the resultant code point; <*retlen> is set to
1071      *      <curlen>, the number of bytes that comprise the code point.
1072      *      <pack_warn> contains the result of packWARN() for the warning
1073      *      types.  The entry point for this case is the label <do_warn>;
1074      * 2)   The input is a valid code point but disallowed by the parameters to
1075      *      this function.  The return value is 0.  If UTF8_CHECK_ONLY is set,
1076      *      <*retlen> is -1; otherwise it is <curlen>, the number of bytes that
1077      *      comprise the code point.  <pack_warn> contains the result of
1078      *      packWARN() for the warning types.  The entry point for this case is
1079      *      the label <disallowed>.
1080      * 3)   The input is malformed.  The return value is 0.  If UTF8_CHECK_ONLY
1081      *      is set, <*retlen> is -1; otherwise it is <curlen>, the number of
1082      *      bytes that comprise the malformation.  All such malformations are
1083      *      assumed to be warning type <utf8>.  The entry point for this case
1084      *      is the label <malformed>.
1085      */
1086
1087   malformed:
1088
1089     if (sv && ckWARN_d(WARN_UTF8)) {
1090         pack_warn = packWARN(WARN_UTF8);
1091     }
1092
1093   disallowed:
1094
1095     if (flags & UTF8_CHECK_ONLY) {
1096         if (retlen)
1097             *retlen = ((STRLEN) -1);
1098         return 0;
1099     }
1100
1101   do_warn:
1102
1103     if (pack_warn) {    /* <pack_warn> was initialized to 0, and changed only
1104                            if warnings are to be raised. */
1105         const char * const string = SvPVX_const(sv);
1106
1107         if (PL_op)
1108             Perl_warner(aTHX_ pack_warn, "%s in %s", string,  OP_DESC(PL_op));
1109         else
1110             Perl_warner(aTHX_ pack_warn, "%s", string);
1111     }
1112
1113     if (retlen) {
1114         *retlen = curlen;
1115     }
1116
1117     return outlier_ret;
1118 }
1119
1120 /*
1121 =for apidoc utf8_to_uvchr_buf
1122
1123 Returns the native code point of the first character in the string C<s> which
1124 is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
1125 C<*retlen> will be set to the length, in bytes, of that character.
1126
1127 If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
1128 enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
1129 C<NULL>) to -1.  If those warnings are off, the computed value, if well-defined
1130 (or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
1131 C<*retlen> is set (if C<retlen> isn't C<NULL>) so that (S<C<s> + C<*retlen>>) is
1132 the next possible position in C<s> that could begin a non-malformed character.
1133 See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
1134 returned.
1135
1136 Code points above the platform's C<IV_MAX> will raise a deprecation warning,
1137 unless those are turned off.
1138
1139 =cut
1140
1141 Also implemented as a macro in utf8.h
1142
1143 */
1144
1145
1146 UV
1147 Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
1148 {
1149     assert(s < send);
1150
1151     return utf8n_to_uvchr(s, send - s, retlen,
1152                           ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1153 }
1154
1155 /* This is marked as deprecated
1156  *
1157 =for apidoc utf8_to_uvuni_buf
1158
1159 Only in very rare circumstances should code need to be dealing in Unicode
1160 (as opposed to native) code points.  In those few cases, use
1161 C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>> instead.
1162
1163 Returns the Unicode (not-native) code point of the first character in the
1164 string C<s> which
1165 is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
1166 C<*retlen> will be set to the length, in bytes, of that character.
1167
1168 If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
1169 enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
1170 NULL) to -1.  If those warnings are off, the computed value if well-defined (or
1171 the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
1172 is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
1173 next possible position in C<s> that could begin a non-malformed character.
1174 See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
1175
1176 Code points above the platform's C<IV_MAX> will raise a deprecation warning,
1177 unless those are turned off.
1178
1179 =cut
1180 */
1181
1182 UV
1183 Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
1184 {
1185     PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
1186
1187     assert(send > s);
1188
1189     /* Call the low level routine, asking for checks */
1190     return NATIVE_TO_UNI(utf8_to_uvchr_buf(s, send, retlen));
1191 }
1192
1193 /*
1194 =for apidoc utf8_length
1195
1196 Return the length of the UTF-8 char encoded string C<s> in characters.
1197 Stops at C<e> (exclusive).  If C<e E<lt> s> or if the scan would end up
1198 past C<e>, raises a UTF-8 warning and returns the count of complete characters.
1199
1200 =cut
1201 */
1202
1203 STRLEN
1204 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
1205 {
1206     STRLEN len = 0;
1207
1208     PERL_ARGS_ASSERT_UTF8_LENGTH;
1209
1210     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
1211      * the bitops (especially ~) can create illegal UTF-8.
1212      * In other words: in Perl UTF-8 is not just for Unicode. */
1213
1214     if (e < s)
1215         goto warn_and_return;
1216     while (s < e) {
1217         s += UTF8SKIP(s);
1218         len++;
1219     }
1220
1221     if (e != s) {
1222         len--;
1223         warn_and_return:
1224         if (PL_op)
1225             Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1226                              "%s in %s", unees, OP_DESC(PL_op));
1227         else
1228             Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
1229     }
1230
1231     return len;
1232 }
1233
1234 /*
1235 =for apidoc bytes_cmp_utf8
1236
1237 Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
1238 sequence of characters (stored as UTF-8)
1239 in C<u>, C<ulen>.  Returns 0 if they are
1240 equal, -1 or -2 if the first string is less than the second string, +1 or +2
1241 if the first string is greater than the second string.
1242
1243 -1 or +1 is returned if the shorter string was identical to the start of the
1244 longer string.  -2 or +2 is returned if
1245 there was a difference between characters
1246 within the strings.
1247
1248 =cut
1249 */
1250
1251 int
1252 Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
1253 {
1254     const U8 *const bend = b + blen;
1255     const U8 *const uend = u + ulen;
1256
1257     PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
1258
1259     while (b < bend && u < uend) {
1260         U8 c = *u++;
1261         if (!UTF8_IS_INVARIANT(c)) {
1262             if (UTF8_IS_DOWNGRADEABLE_START(c)) {
1263                 if (u < uend) {
1264                     U8 c1 = *u++;
1265                     if (UTF8_IS_CONTINUATION(c1)) {
1266                         c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
1267                     } else {
1268                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1269                                          "Malformed UTF-8 character "
1270                                          "(unexpected non-continuation byte 0x%02x"
1271                                          ", immediately after start byte 0x%02x)"
1272                                          /* Dear diag.t, it's in the pod.  */
1273                                          "%s%s", c1, c,
1274                                          PL_op ? " in " : "",
1275                                          PL_op ? OP_DESC(PL_op) : "");
1276                         return -2;
1277                     }
1278                 } else {
1279                     if (PL_op)
1280                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1281                                          "%s in %s", unees, OP_DESC(PL_op));
1282                     else
1283                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
1284                     return -2; /* Really want to return undef :-)  */
1285                 }
1286             } else {
1287                 return -2;
1288             }
1289         }
1290         if (*b != c) {
1291             return *b < c ? -2 : +2;
1292         }
1293         ++b;
1294     }
1295
1296     if (b == bend && u == uend)
1297         return 0;
1298
1299     return b < bend ? +1 : -1;
1300 }
1301
1302 /*
1303 =for apidoc utf8_to_bytes
1304
1305 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
1306 Unlike L</bytes_to_utf8>, this over-writes the original string, and
1307 updates C<len> to contain the new length.
1308 Returns zero on failure, setting C<len> to -1.
1309
1310 If you need a copy of the string, see L</bytes_from_utf8>.
1311
1312 =cut
1313 */
1314
1315 U8 *
1316 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
1317 {
1318     U8 * const save = s;
1319     U8 * const send = s + *len;
1320     U8 *d;
1321
1322     PERL_ARGS_ASSERT_UTF8_TO_BYTES;
1323     PERL_UNUSED_CONTEXT;
1324
1325     /* ensure valid UTF-8 and chars < 256 before updating string */
1326     while (s < send) {
1327         if (! UTF8_IS_INVARIANT(*s)) {
1328             if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
1329                 *len = ((STRLEN) -1);
1330                 return 0;
1331             }
1332             s++;
1333         }
1334         s++;
1335     }
1336
1337     d = s = save;
1338     while (s < send) {
1339         U8 c = *s++;
1340         if (! UTF8_IS_INVARIANT(c)) {
1341             /* Then it is two-byte encoded */
1342             c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
1343             s++;
1344         }
1345         *d++ = c;
1346     }
1347     *d = '\0';
1348     *len = d - save;
1349     return save;
1350 }
1351
1352 /*
1353 =for apidoc bytes_from_utf8
1354
1355 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
1356 Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, returns a pointer to
1357 the newly-created string, and updates C<len> to contain the new
1358 length.  Returns the original string if no conversion occurs, C<len>
1359 is unchanged.  Do nothing if C<is_utf8> points to 0.  Sets C<is_utf8> to
1360 0 if C<s> is converted or consisted entirely of characters that are invariant
1361 in UTF-8 (i.e., US-ASCII on non-EBCDIC machines).
1362
1363 =cut
1364 */
1365
1366 U8 *
1367 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
1368 {
1369     U8 *d;
1370     const U8 *start = s;
1371     const U8 *send;
1372     I32 count = 0;
1373
1374     PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
1375     PERL_UNUSED_CONTEXT;
1376     if (!*is_utf8)
1377         return (U8 *)start;
1378
1379     /* ensure valid UTF-8 and chars < 256 before converting string */
1380     for (send = s + *len; s < send;) {
1381         if (! UTF8_IS_INVARIANT(*s)) {
1382             if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
1383                 return (U8 *)start;
1384             }
1385             count++;
1386             s++;
1387         }
1388         s++;
1389     }
1390
1391     *is_utf8 = FALSE;
1392
1393     Newx(d, (*len) - count + 1, U8);
1394     s = start; start = d;
1395     while (s < send) {
1396         U8 c = *s++;
1397         if (! UTF8_IS_INVARIANT(c)) {
1398             /* Then it is two-byte encoded */
1399             c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
1400             s++;
1401         }
1402         *d++ = c;
1403     }
1404     *d = '\0';
1405     *len = d - start;
1406     return (U8 *)start;
1407 }
1408
1409 /*
1410 =for apidoc bytes_to_utf8
1411
1412 Converts a string C<s> of length C<len> bytes from the native encoding into
1413 UTF-8.
1414 Returns a pointer to the newly-created string, and sets C<len> to
1415 reflect the new length in bytes.
1416
1417 A C<NUL> character will be written after the end of the string.
1418
1419 If you want to convert to UTF-8 from encodings other than
1420 the native (Latin1 or EBCDIC),
1421 see L</sv_recode_to_utf8>().
1422
1423 =cut
1424 */
1425
1426 /* This logic is duplicated in sv_catpvn_flags, so any bug fixes will
1427    likewise need duplication. */
1428
1429 U8*
1430 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
1431 {
1432     const U8 * const send = s + (*len);
1433     U8 *d;
1434     U8 *dst;
1435
1436     PERL_ARGS_ASSERT_BYTES_TO_UTF8;
1437     PERL_UNUSED_CONTEXT;
1438
1439     Newx(d, (*len) * 2 + 1, U8);
1440     dst = d;
1441
1442     while (s < send) {
1443         append_utf8_from_native_byte(*s, &d);
1444         s++;
1445     }
1446     *d = '\0';
1447     *len = d-dst;
1448     return dst;
1449 }
1450
1451 /*
1452  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
1453  *
1454  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
1455  * We optimize for native, for obvious reasons. */
1456
1457 U8*
1458 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
1459 {
1460     U8* pend;
1461     U8* dstart = d;
1462
1463     PERL_ARGS_ASSERT_UTF16_TO_UTF8;
1464
1465     if (bytelen & 1)
1466         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
1467
1468     pend = p + bytelen;
1469
1470     while (p < pend) {
1471         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
1472         p += 2;
1473         if (OFFUNI_IS_INVARIANT(uv)) {
1474             *d++ = LATIN1_TO_NATIVE((U8) uv);
1475             continue;
1476         }
1477         if (uv <= MAX_UTF8_TWO_BYTE) {
1478             *d++ = UTF8_TWO_BYTE_HI(UNI_TO_NATIVE(uv));
1479             *d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv));
1480             continue;
1481         }
1482 #define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
1483 #define LAST_HIGH_SURROGATE  0xDBFF
1484 #define FIRST_LOW_SURROGATE  0xDC00
1485 #define LAST_LOW_SURROGATE   UNICODE_SURROGATE_LAST
1486
1487         /* This assumes that most uses will be in the first Unicode plane, not
1488          * needing surrogates */
1489         if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST
1490                   && uv <= UNICODE_SURROGATE_LAST))
1491         {
1492             if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
1493                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
1494             }
1495             else {
1496                 UV low = (p[0] << 8) + p[1];
1497                 if (   UNLIKELY(low < FIRST_LOW_SURROGATE)
1498                     || UNLIKELY(low > LAST_LOW_SURROGATE))
1499                 {
1500                     Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
1501                 }
1502                 p += 2;
1503                 uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
1504                                        + (low - FIRST_LOW_SURROGATE) + 0x10000;
1505             }
1506         }
1507 #ifdef EBCDIC
1508         d = uvoffuni_to_utf8_flags(d, uv, 0);
1509 #else
1510         if (uv < 0x10000) {
1511             *d++ = (U8)(( uv >> 12)         | 0xe0);
1512             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
1513             *d++ = (U8)(( uv        & 0x3f) | 0x80);
1514             continue;
1515         }
1516         else {
1517             *d++ = (U8)(( uv >> 18)         | 0xf0);
1518             *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
1519             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
1520             *d++ = (U8)(( uv        & 0x3f) | 0x80);
1521             continue;
1522         }
1523 #endif
1524     }
1525     *newlen = d - dstart;
1526     return d;
1527 }
1528
1529 /* Note: this one is slightly destructive of the source. */
1530
1531 U8*
1532 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
1533 {
1534     U8* s = (U8*)p;
1535     U8* const send = s + bytelen;
1536
1537     PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
1538
1539     if (bytelen & 1)
1540         Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf,
1541                    (UV)bytelen);
1542
1543     while (s < send) {
1544         const U8 tmp = s[0];
1545         s[0] = s[1];
1546         s[1] = tmp;
1547         s += 2;
1548     }
1549     return utf16_to_utf8(p, d, bytelen, newlen);
1550 }
1551
1552 bool
1553 Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
1554 {
1555     U8 tmpbuf[UTF8_MAXBYTES+1];
1556     uvchr_to_utf8(tmpbuf, c);
1557     return _is_utf8_FOO(classnum, tmpbuf);
1558 }
1559
1560 /* Internal function so we can deprecate the external one, and call
1561    this one from other deprecated functions in this file */
1562
1563 bool
1564 Perl__is_utf8_idstart(pTHX_ const U8 *p)
1565 {
1566     PERL_ARGS_ASSERT__IS_UTF8_IDSTART;
1567
1568     if (*p == '_')
1569         return TRUE;
1570     return is_utf8_common(p, &PL_utf8_idstart, "IdStart", NULL);
1571 }
1572
1573 bool
1574 Perl__is_uni_perl_idcont(pTHX_ UV c)
1575 {
1576     U8 tmpbuf[UTF8_MAXBYTES+1];
1577     uvchr_to_utf8(tmpbuf, c);
1578     return _is_utf8_perl_idcont(tmpbuf);
1579 }
1580
1581 bool
1582 Perl__is_uni_perl_idstart(pTHX_ UV c)
1583 {
1584     U8 tmpbuf[UTF8_MAXBYTES+1];
1585     uvchr_to_utf8(tmpbuf, c);
1586     return _is_utf8_perl_idstart(tmpbuf);
1587 }
1588
1589 UV
1590 Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s)
1591 {
1592     /* We have the latin1-range values compiled into the core, so just use
1593      * those, converting the result to UTF-8.  The only difference between upper
1594      * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
1595      * either "SS" or "Ss".  Which one to use is passed into the routine in
1596      * 'S_or_s' to avoid a test */
1597
1598     UV converted = toUPPER_LATIN1_MOD(c);
1599
1600     PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
1601
1602     assert(S_or_s == 'S' || S_or_s == 's');
1603
1604     if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
1605                                              characters in this range */
1606         *p = (U8) converted;
1607         *lenp = 1;
1608         return converted;
1609     }
1610
1611     /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
1612      * which it maps to one of them, so as to only have to have one check for
1613      * it in the main case */
1614     if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
1615         switch (c) {
1616             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
1617                 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
1618                 break;
1619             case MICRO_SIGN:
1620                 converted = GREEK_CAPITAL_LETTER_MU;
1621                 break;
1622 #if    UNICODE_MAJOR_VERSION > 2                                        \
1623    || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1           \
1624                                   && UNICODE_DOT_DOT_VERSION >= 8)
1625             case LATIN_SMALL_LETTER_SHARP_S:
1626                 *(p)++ = 'S';
1627                 *p = S_or_s;
1628                 *lenp = 2;
1629                 return 'S';
1630 #endif
1631             default:
1632                 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
1633                 NOT_REACHED; /* NOTREACHED */
1634         }
1635     }
1636
1637     *(p)++ = UTF8_TWO_BYTE_HI(converted);
1638     *p = UTF8_TWO_BYTE_LO(converted);
1639     *lenp = 2;
1640
1641     return converted;
1642 }
1643
/* Call the function to convert a UTF-8 encoded character to the specified case.
 * Note that there may be more than one character in the result.
 * uv   is forwarded as the first argument of _to_utf8_case(); the callers in
 *      this file pass the code point of the input character
 * s    (INP) is a pointer to the first byte of the input character
 * d    (OUTP) will be set to the first byte of the string of changed
 *      characters.  It needs to have space for UTF8_MAXBYTES_CASE+1 bytes
 * lenp (LENP) will be set to the length in bytes of the string of changed
 *      characters
 *
 * Each macro supplies the swash pointer and swash name for its case, and
 * passes "" as the 'special' argument of _to_utf8_case().
 *
 * The functions return the ordinal of the first character in the string of OUTP */
#define CALL_UPPER_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_toupper, "ToUc", "")
#define CALL_TITLE_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_totitle, "ToTc", "")
#define CALL_LOWER_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_tolower, "ToLc", "")

/* This additionally has the input parameter 'specials', which if non-zero will
 * cause this to use the specials hash for folding (meaning get full case
 * folding); otherwise, when zero, this implies a simple case fold.  A
 * non-zero 'specials' is passed on as "", and a zero one as NULL. */
#define CALL_FOLD_CASE(uv, s, d, lenp, specials) _to_utf8_case(uv, s, d, lenp, &PL_utf8_tofold, "ToCf", (specials) ? "" : NULL)
1660
1661 UV
1662 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1663 {
1664     /* Convert the Unicode character whose ordinal is <c> to its uppercase
1665      * version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
1666      * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1667      * the changed version may be longer than the original character.
1668      *
1669      * The ordinal of the first character of the changed version is returned
1670      * (but note, as explained above, that there may be more.) */
1671
1672     PERL_ARGS_ASSERT_TO_UNI_UPPER;
1673
1674     if (c < 256) {
1675         return _to_upper_title_latin1((U8) c, p, lenp, 'S');
1676     }
1677
1678     uvchr_to_utf8(p, c);
1679     return CALL_UPPER_CASE(c, p, p, lenp);
1680 }
1681
1682 UV
1683 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1684 {
1685     PERL_ARGS_ASSERT_TO_UNI_TITLE;
1686
1687     if (c < 256) {
1688         return _to_upper_title_latin1((U8) c, p, lenp, 's');
1689     }
1690
1691     uvchr_to_utf8(p, c);
1692     return CALL_TITLE_CASE(c, p, p, lenp);
1693 }
1694
1695 STATIC U8
1696 S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp)
1697 {
1698     /* We have the latin1-range values compiled into the core, so just use
1699      * those, converting the result to UTF-8.  Since the result is always just
1700      * one character, we allow <p> to be NULL */
1701
1702     U8 converted = toLOWER_LATIN1(c);
1703
1704     if (p != NULL) {
1705         if (NATIVE_BYTE_IS_INVARIANT(converted)) {
1706             *p = converted;
1707             *lenp = 1;
1708         }
1709         else {
1710             /* Result is known to always be < 256, so can use the EIGHT_BIT
1711              * macros */
1712             *p = UTF8_EIGHT_BIT_HI(converted);
1713             *(p+1) = UTF8_EIGHT_BIT_LO(converted);
1714             *lenp = 2;
1715         }
1716     }
1717     return converted;
1718 }
1719
1720 UV
1721 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1722 {
1723     PERL_ARGS_ASSERT_TO_UNI_LOWER;
1724
1725     if (c < 256) {
1726         return to_lower_latin1((U8) c, p, lenp);
1727     }
1728
1729     uvchr_to_utf8(p, c);
1730     return CALL_LOWER_CASE(c, p, p, lenp);
1731 }
1732
1733 UV
1734 Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
1735 {
1736     /* Corresponds to to_lower_latin1(); <flags> bits meanings:
1737      *      FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
1738      *      FOLD_FLAGS_FULL  iff full folding is to be used;
1739      *
1740      *  Not to be used for locale folds
1741      */
1742
1743     UV converted;
1744
1745     PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
1746     PERL_UNUSED_CONTEXT;
1747
1748     assert (! (flags & FOLD_FLAGS_LOCALE));
1749
1750     if (UNLIKELY(c == MICRO_SIGN)) {
1751         converted = GREEK_SMALL_LETTER_MU;
1752     }
1753 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
1754    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
1755                                       || UNICODE_DOT_DOT_VERSION > 0)
1756     else if (   (flags & FOLD_FLAGS_FULL)
1757              && UNLIKELY(c == LATIN_SMALL_LETTER_SHARP_S))
1758     {
1759         /* If can't cross 127/128 boundary, can't return "ss"; instead return
1760          * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
1761          * under those circumstances. */
1762         if (flags & FOLD_FLAGS_NOMIX_ASCII) {
1763             *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
1764             Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
1765                  p, *lenp, U8);
1766             return LATIN_SMALL_LETTER_LONG_S;
1767         }
1768         else {
1769             *(p)++ = 's';
1770             *p = 's';
1771             *lenp = 2;
1772             return 's';
1773         }
1774     }
1775 #endif
1776     else { /* In this range the fold of all other characters is their lower
1777               case */
1778         converted = toLOWER_LATIN1(c);
1779     }
1780
1781     if (UVCHR_IS_INVARIANT(converted)) {
1782         *p = (U8) converted;
1783         *lenp = 1;
1784     }
1785     else {
1786         *(p)++ = UTF8_TWO_BYTE_HI(converted);
1787         *p = UTF8_TWO_BYTE_LO(converted);
1788         *lenp = 2;
1789     }
1790
1791     return converted;
1792 }
1793
1794 UV
1795 Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
1796 {
1797
1798     /* Not currently externally documented, and subject to change
1799      *  <flags> bits meanings:
1800      *      FOLD_FLAGS_FULL  iff full folding is to be used;
1801      *      FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
1802      *                        locale are to be used.
1803      *      FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
1804      */
1805
1806     PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
1807
1808     if (flags & FOLD_FLAGS_LOCALE) {
1809         /* Treat a UTF-8 locale as not being in locale at all */
1810         if (IN_UTF8_CTYPE_LOCALE) {
1811             flags &= ~FOLD_FLAGS_LOCALE;
1812         }
1813         else {
1814             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
1815             goto needs_full_generality;
1816         }
1817     }
1818
1819     if (c < 256) {
1820         return _to_fold_latin1((U8) c, p, lenp,
1821                             flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
1822     }
1823
1824     /* Here, above 255.  If no special needs, just use the macro */
1825     if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
1826         uvchr_to_utf8(p, c);
1827         return CALL_FOLD_CASE(c, p, p, lenp, flags & FOLD_FLAGS_FULL);
1828     }
1829     else {  /* Otherwise, _to_utf8_fold_flags has the intelligence to deal with
1830                the special flags. */
1831         U8 utf8_c[UTF8_MAXBYTES + 1];
1832
1833       needs_full_generality:
1834         uvchr_to_utf8(utf8_c, c);
1835         return _to_utf8_fold_flags(utf8_c, p, lenp, flags);
1836     }
1837 }
1838
1839 PERL_STATIC_INLINE bool
1840 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
1841                  const char *const swashname, SV* const invlist)
1842 {
1843     /* returns a boolean giving whether or not the UTF8-encoded character that
1844      * starts at <p> is in the swash indicated by <swashname>.  <swash>
1845      * contains a pointer to where the swash indicated by <swashname>
1846      * is to be stored; which this routine will do, so that future calls will
1847      * look at <*swash> and only generate a swash if it is not null.  <invlist>
1848      * is NULL or an inversion list that defines the swash.  If not null, it
1849      * saves time during initialization of the swash.
1850      *
1851      * Note that it is assumed that the buffer length of <p> is enough to
1852      * contain all the bytes that comprise the character.  Thus, <*p> should
1853      * have been checked before this call for mal-formedness enough to assure
1854      * that. */
1855
1856     PERL_ARGS_ASSERT_IS_UTF8_COMMON;
1857
1858     /* The API should have included a length for the UTF-8 character in <p>,
1859      * but it doesn't.  We therefore assume that p has been validated at least
1860      * as far as there being enough bytes available in it to accommodate the
1861      * character without reading beyond the end, and pass that number on to the
1862      * validating routine */
1863     if (! isUTF8_CHAR(p, p + UTF8SKIP(p))) {
1864         if (ckWARN_d(WARN_UTF8)) {
1865             Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED,WARN_UTF8),
1866                     "Passing malformed UTF-8 to \"%s\" is deprecated", swashname);
1867             if (ckWARN(WARN_UTF8)) {    /* This will output details as to the
1868                                            what the malformation is */
1869                 utf8_to_uvchr_buf(p, p + UTF8SKIP(p), NULL);
1870             }
1871         }
1872         return FALSE;
1873     }
1874     if (!*swash) {
1875         U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
1876         *swash = _core_swash_init("utf8",
1877
1878                                   /* Only use the name if there is no inversion
1879                                    * list; otherwise will go out to disk */
1880                                   (invlist) ? "" : swashname,
1881
1882                                   &PL_sv_undef, 1, 0, invlist, &flags);
1883     }
1884
1885     return swash_fetch(*swash, p, TRUE) != 0;
1886 }
1887
1888 bool
1889 Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p)
1890 {
1891     PERL_ARGS_ASSERT__IS_UTF8_FOO;
1892
1893     assert(classnum < _FIRST_NON_SWASH_CC);
1894
1895     return is_utf8_common(p,
1896                           &PL_utf8_swash_ptrs[classnum],
1897                           swash_property_names[classnum],
1898                           PL_XPosix_ptrs[classnum]);
1899 }
1900
1901 bool
1902 Perl__is_utf8_perl_idstart(pTHX_ const U8 *p)
1903 {
1904     SV* invlist = NULL;
1905
1906     PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
1907
1908     if (! PL_utf8_perl_idstart) {
1909         invlist = _new_invlist_C_array(_Perl_IDStart_invlist);
1910     }
1911     return is_utf8_common(p, &PL_utf8_perl_idstart, "_Perl_IDStart", invlist);
1912 }
1913
1914 bool
1915 Perl__is_utf8_xidstart(pTHX_ const U8 *p)
1916 {
1917     PERL_ARGS_ASSERT__IS_UTF8_XIDSTART;
1918
1919     if (*p == '_')
1920         return TRUE;
1921     return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart", NULL);
1922 }
1923
1924 bool
1925 Perl__is_utf8_perl_idcont(pTHX_ const U8 *p)
1926 {
1927     SV* invlist = NULL;
1928
1929     PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT;
1930
1931     if (! PL_utf8_perl_idcont) {
1932         invlist = _new_invlist_C_array(_Perl_IDCont_invlist);
1933     }
1934     return is_utf8_common(p, &PL_utf8_perl_idcont, "_Perl_IDCont", invlist);
1935 }
1936
1937 bool
1938 Perl__is_utf8_idcont(pTHX_ const U8 *p)
1939 {
1940     PERL_ARGS_ASSERT__IS_UTF8_IDCONT;
1941
1942     return is_utf8_common(p, &PL_utf8_idcont, "IdContinue", NULL);
1943 }
1944
1945 bool
1946 Perl__is_utf8_xidcont(pTHX_ const U8 *p)
1947 {
1948     PERL_ARGS_ASSERT__IS_UTF8_XIDCONT;
1949
1950     return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue", NULL);
1951 }
1952
1953 bool
1954 Perl__is_utf8_mark(pTHX_ const U8 *p)
1955 {
1956     PERL_ARGS_ASSERT__IS_UTF8_MARK;
1957
1958     return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL);
1959 }
1960
1961 /*
1962 =for apidoc to_utf8_case
1963
1964 Instead use the appropriate one of L</toUPPER_utf8>,
1965 L</toTITLE_utf8>,
1966 L</toLOWER_utf8>,
1967 or L</toFOLD_utf8>.
1968
1969 C<p> contains the pointer to the UTF-8 string encoding
1970 the character that is being converted.  This routine assumes that the character
1971 at C<p> is well-formed.
1972
1973 C<ustrp> is a pointer to the character buffer to put the
1974 conversion result to.  C<lenp> is a pointer to the length
1975 of the result.
1976
1977 C<swashp> is a pointer to the swash to use.
1978
1979 Both the special and normal mappings are stored in F<lib/unicore/To/Foo.pl>,
1980 and loaded by C<SWASHNEW>, using F<lib/utf8_heavy.pl>.  C<special> (usually,
1981 but not always, a multicharacter mapping), is tried first.
1982
1983 C<special> is a string, normally C<NULL> or C<"">.  C<NULL> means to not use
1984 any special mappings; C<""> means to use the special mappings.  Values other
1985 than these two are treated as the name of the hash containing the special
1986 mappings, like C<"utf8::ToSpecLower">.
1987
1988 C<normal> is a string like C<"ToLower"> which means the swash
1989 C<%utf8::ToLower>.
1990
1991 Code points above the platform's C<IV_MAX> will raise a deprecation warning,
1992 unless those are turned off.
1993
1994 =cut */
1995
1996 UV
1997 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1998                         SV **swashp, const char *normal, const char *special)
1999 {
2000     PERL_ARGS_ASSERT_TO_UTF8_CASE;
2001
2002     return _to_utf8_case(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp, swashp, normal, special);
2003 }
2004
    /* change name uv1 to 'from' */
STATIC UV
S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp,
                SV **swashp, const char *normal, const char *special)
{
    /* Worker for the case-changing functions and macros in this file.
     *
     * uv1      the code point that the callers in this file obtain from, or
     *          encode into, <p>
     * p        points to the UTF-8 encoding of the input character
     * ustrp    where the UTF-8 result is stored.  It may be the same pointer
     *          as <p>; the copy is skipped in that case when the character
     *          maps to itself
     * lenp     if non-NULL, <*lenp> is set to the length in bytes of the
     *          result
     * swashp   points to the cached swash for this mapping; it is created on
     *          demand from <normal> if <*swashp> is NULL
     * normal   the name of the swash; also used in the warning messages
     *          below when there is no PL_op to describe the operation
     * special  NULL to skip the special mappings; "" to use the SPECIALS
     *          hash found in the swash; any other value is the name of the
     *          hash to look in
     *
     * Returns the code point of the first character of the result, which is
     * <uv1> itself when there is no mapping. */

    STRLEN len = 0;

    PERL_ARGS_ASSERT__TO_UTF8_CASE;

    /* For code points that don't change case, we already know that the output
     * of this function is the unchanged input, so we can skip doing look-ups
     * for them.  Unfortunately the case-changing code points are scattered
     * around.  But there are some long consecutive ranges where there are no
     * case changing code points.  By adding tests, we can eliminate the lookup
     * for all the ones in such ranges.  This is currently done here only for
     * just a few cases where the scripts are in common use in modern commerce
     * (and scripts adjacent to those which can be included without additional
     * tests). */

    if (uv1 >= 0x0590) {
        /* This keeps from needing further processing the code points most
         * likely to be used in the following non-cased scripts: Hebrew,
         * Arabic, Syriac, Thaana, NKo, Samaritan, Mandaic, Devanagari,
         * Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada,
         * Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar */
        if (uv1 < 0x10A0) {
            goto cases_to_self;
        }

        /* The following largish code point ranges also don't have case
         * changes, but khw didn't think they warranted extra tests to speed
         * them up (which would slightly slow down everything else above them):
         * 1100..139F   Hangul Jamo, Ethiopic
         * 1400..1CFF   Unified Canadian Aboriginal Syllabics, Ogham, Runic,
         *              Tagalog, Hanunoo, Buhid, Tagbanwa, Khmer, Mongolian,
         *              Limbu, Tai Le, New Tai Lue, Buginese, Tai Tham,
         *              Combining Diacritical Marks Extended, Balinese,
         *              Sundanese, Batak, Lepcha, Ol Chiki
         * 2000..206F   General Punctuation
         */

        if (uv1 >= 0x2D30) {

            /* This keeps from needing further processing the code points
             * most likely to be used in the following non-cased major scripts:
             * CJK, Katakana, Hiragana, plus some less-likely scripts.
             *
             * (0x2D30 above might have to be changed to 2F00 in the unlikely
             * event that Unicode eventually allocates the unused block as of
             * v8.0 2FE0..2FEF to code points that are cased.  khw has verified
             * that the test suite will start having failures to alert you
             * should that happen) */
            if (uv1 < 0xA640) {
                goto cases_to_self;
            }

            if (uv1 >= 0xAC00) {

                /* Surrogates map to themselves, with a default-on warning */
                if (UNLIKELY(UNICODE_IS_SURROGATE(uv1))) {
                    if (ckWARN_d(WARN_SURROGATE)) {
                        const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
                        Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
                            "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1);
                    }
                    goto cases_to_self;
                }

                /* AC00..FAFF Catches Hangul syllables and private use, plus
                 * some others */
                if (uv1 < 0xFB00) {
                    goto cases_to_self;

                }

                /* Above-Unicode code points also map to themselves, possibly
                 * with a deprecation warning, a non-Unicode warning, or
                 * both */
                if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
                    if (   UNLIKELY(uv1 > MAX_NON_DEPRECATED_CP)
                        && ckWARN_d(WARN_DEPRECATED))
                    {
                        Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
                                cp_above_legal_max, uv1, MAX_NON_DEPRECATED_CP);
                    }
                    if (ckWARN_d(WARN_NON_UNICODE)) {
                        const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
                        Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
                            "Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1);
                    }
                    goto cases_to_self;
                }
#ifdef HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C
                if (UNLIKELY(uv1
                    > HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C))
                {

                    /* As of this writing, this means we avoid swash creation
                     * for anything beyond low Plane 1 */
                    goto cases_to_self;
                }
#endif
            }
        }

        /* Note that non-characters are perfectly legal, so no warning should
         * be given.  There are so few of them, that it isn't worth the extra
         * tests to avoid swash creation */
    }

    if (!*swashp) /* load on-demand */
         *swashp = _core_swash_init("utf8", normal, &PL_sv_undef, 4, 0, NULL, NULL);

    if (special) {
         /* It might be "special" (sometimes, but not always,
          * a multicharacter mapping) */
         HV *hv = NULL;
         SV **svp;

         /* If passed in the specials name, use that; otherwise use any
          * given in the swash */
         if (*special != '\0') {
            hv = get_hv(special, 0);
        }
        else {
            svp = hv_fetchs(MUTABLE_HV(SvRV(*swashp)), "SPECIALS", 0);
            if (svp) {
                hv = MUTABLE_HV(SvRV(*svp));
            }
        }

         /* The specials hash is keyed by the UTF-8 bytes of the input
          * character */
         if (hv
             && (svp = hv_fetch(hv, (const char*)p, UVCHR_SKIP(uv1), FALSE))
             && (*svp))
         {
             const char *s;

              s = SvPV_const(*svp, len);
              if (len == 1)
                  /* EIGHTBIT */
                   len = uvchr_to_utf8(ustrp, *(U8*)s) - ustrp;
              else {
                   Copy(s, ustrp, len, U8);
              }
         }
    }

    /* A zero <len> here means that no special mapping was used; try the
     * normal one */
    if (!len && *swashp) {
        const UV uv2 = swash_fetch(*swashp, p, TRUE /* => is UTF-8 */);

         if (uv2) {
              /* It was "normal" (a single character mapping). */
              len = uvchr_to_utf8(ustrp, uv2) - ustrp;
         }
    }

    /* If a mapping was found, <ustrp> now contains it; return its first code
     * point */
    if (len) {
        if (lenp) {
            *lenp = len;
        }
        return valid_utf8_to_uvchr(ustrp, 0);
    }

    /* Here, there was no mapping defined, which means that the code point maps
     * to itself.  Return the inputs */
  cases_to_self:
    len = UTF8SKIP(p);
    if (p != ustrp) {   /* Don't copy onto itself */
        Copy(p, ustrp, len, U8);
    }

    if (lenp)
         *lenp = len;

    return uv1;

}
2177
2178 STATIC UV
2179 S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
2180 {
2181     /* This is called when changing the case of a UTF-8-encoded character above
2182      * the Latin1 range, and the operation is in a non-UTF-8 locale.  If the
2183      * result contains a character that crosses the 255/256 boundary, disallow
2184      * the change, and return the original code point.  See L<perlfunc/lc> for
2185      * why;
2186      *
2187      * p        points to the original string whose case was changed; assumed
2188      *          by this routine to be well-formed
2189      * result   the code point of the first character in the changed-case string
2190      * ustrp    points to the changed-case string (<result> represents its first char)
2191      * lenp     points to the length of <ustrp> */
2192
2193     UV original;    /* To store the first code point of <p> */
2194
2195     PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
2196
2197     assert(UTF8_IS_ABOVE_LATIN1(*p));
2198
2199     /* We know immediately if the first character in the string crosses the
2200      * boundary, so can skip */
2201     if (result > 255) {
2202
2203         /* Look at every character in the result; if any cross the
2204         * boundary, the whole thing is disallowed */
2205         U8* s = ustrp + UTF8SKIP(ustrp);
2206         U8* e = ustrp + *lenp;
2207         while (s < e) {
2208             if (! UTF8_IS_ABOVE_LATIN1(*s)) {
2209                 goto bad_crossing;
2210             }
2211             s += UTF8SKIP(s);
2212         }
2213
2214         /* Here, no characters crossed, result is ok as-is, but we warn. */
2215         _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
2216         return result;
2217     }
2218
2219   bad_crossing:
2220
2221     /* Failed, have to return the original */
2222     original = valid_utf8_to_uvchr(p, lenp);
2223
2224     /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
2225     Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2226                            "Can't do %s(\"\\x{%"UVXf"}\") on non-UTF-8 locale; "
2227                            "resolved to \"\\x{%"UVXf"}\".",
2228                            OP_DESC(PL_op),
2229                            original,
2230                            original);
2231     Copy(p, ustrp, *lenp, char);
2232     return original;
2233 }
2234
2235 /*
2236 =for apidoc to_utf8_upper
2237
2238 Instead use L</toUPPER_utf8>.
2239
2240 =cut */
2241
/* Not currently externally documented, and subject to change:
 * <flags> is set iff the rules from the current underlying locale are to
 *         be used. */
2245
2246 UV
2247 Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
2248 {
2249     UV result;
2250
2251     PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
2252
2253     if (flags) {
2254         /* Treat a UTF-8 locale as not being in locale at all */
2255         if (IN_UTF8_CTYPE_LOCALE) {
2256             flags = FALSE;
2257         }
2258         else {
2259             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2260         }
2261     }
2262
2263     if (UTF8_IS_INVARIANT(*p)) {
2264         if (flags) {
2265             result = toUPPER_LC(*p);
2266         }
2267         else {
2268             return _to_upper_title_latin1(*p, ustrp, lenp, 'S');
2269         }
2270     }
2271     else if UTF8_IS_DOWNGRADEABLE_START(*p) {
2272         if (flags) {
2273             U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
2274             result = toUPPER_LC(c);
2275         }
2276         else {
2277             return _to_upper_title_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
2278                                           ustrp, lenp, 'S');
2279         }
2280     }
2281     else {  /* UTF-8, ord above 255 */
2282         result = CALL_UPPER_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp);
2283
2284         if (flags) {
2285             result = check_locale_boundary_crossing(p, result, ustrp, lenp);
2286         }
2287         return result;
2288     }
2289
2290     /* Here, used locale rules.  Convert back to UTF-8 */
2291     if (UTF8_IS_INVARIANT(result)) {
2292         *ustrp = (U8) result;
2293         *lenp = 1;
2294     }
2295     else {
2296         *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2297         *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
2298         *lenp = 2;
2299     }
2300
2301     return result;
2302 }
2303
2304 /*
2305 =for apidoc to_utf8_title
2306
2307 Instead use L</toTITLE_utf8>.
2308
2309 =cut */
2310
2311 /* Not currently externally documented, and subject to change:
2312  * <flags> is set iff the rules from the current underlying locale are to be
2313  *         used.  Since titlecase is not defined in POSIX, for other than a
2314  *         UTF-8 locale, uppercase is used instead for code points < 256.
      *
      * Titlecases the character whose UTF-8 encoding begins at <p>.  <ustrp>
      * and <lenp> are outputs.  They are either filled in directly here (the
      * locale-rules case for a code point below 256, which stores one or two
      * bytes and sets <*lenp> accordingly), or they are handed on to the
      * routine that performs the mapping.
      *
      * If <flags> is set but the current locale is a UTF-8 one, <flags> is
      * cleared and the non-locale rules are used throughout.
      *
      * The return value is whatever the mapping that was used produced.
2315  */
2316
2317 UV
2318 Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
2319 {
2320     UV result;
2321
2322     PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
2323
2324     if (flags) {
2325         /* Treat a UTF-8 locale as not being in locale at all */
2326         if (IN_UTF8_CTYPE_LOCALE) {
2327             flags = FALSE;
2328         }
2329         else {
2330             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2331         }
2332     }
2333
2334     if (UTF8_IS_INVARIANT(*p)) {
2335         if (flags) {
                  /* Uppercase stands in for titlecase under locale rules */
2336             result = toUPPER_LC(*p);
2337         }
2338         else {
2339             return _to_upper_title_latin1(*p, ustrp, lenp, 's');
2340         }
2341     }
              /* The condition is parenthesized explicitly rather than relying on the
               * macro's expansion happening to supply the parentheses */
2342     else if (UTF8_IS_DOWNGRADEABLE_START(*p)) {
2343         if (flags) {
2344             U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
2345             result = toUPPER_LC(c);
2346         }
2347         else {
2348             return _to_upper_title_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
2349                                           ustrp, lenp, 's');
2350         }
2351     }
2352     else {  /* UTF-8, ord above 255 */
2353         result = CALL_TITLE_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp);
2354
2355         if (flags) {
2356             result = check_locale_boundary_crossing(p, result, ustrp, lenp);
2357         }
2358         return result;
2359     }
2360
2361     /* Here, used locale rules.  Convert back to UTF-8 */
2362     if (UTF8_IS_INVARIANT(result)) {
2363         *ustrp = (U8) result;
2364         *lenp = 1;
2365     }
2366     else {
2367         *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2368         *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
2369         *lenp = 2;
2370     }
2371
2372     return result;
2373 }
2374
2375 /*
2376 =for apidoc to_utf8_lower
2377
2378 Instead use L</toLOWER_utf8>.
2379
2380 =cut */
2381
2382 /* Not currently externally documented, and subject to change:
2383  * <flags> is set iff the rules from the current underlying locale are to
2384  *         be used.
      *
      * Lowercases the character whose UTF-8 encoding begins at <p>.  <ustrp>
      * and <lenp> are outputs.  They are either filled in directly here (the
      * locale-rules case for a code point below 256, which stores one or two
      * bytes and sets <*lenp> accordingly), or they are handed on to the
      * routine that performs the mapping.
      *
      * If <flags> is set but the current locale is a UTF-8 one, <flags> is
      * cleared and the non-locale rules are used throughout.
      *
      * The return value is whatever the mapping that was used produced.
2385  */
2386
2387 UV
2388 Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
2389 {
2390     UV result;
2391
2392     PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
2393
2394     if (flags) {
2395         /* Treat a UTF-8 locale as not being in locale at all */
2396         if (IN_UTF8_CTYPE_LOCALE) {
2397             flags = FALSE;
2398         }
2399         else {
2400             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2401         }
2402     }
2403
2404     if (UTF8_IS_INVARIANT(*p)) {
2405         if (flags) {
2406             result = toLOWER_LC(*p);
2407         }
2408         else {
2409             return to_lower_latin1(*p, ustrp, lenp);
2410         }
2411     }
              /* The condition is parenthesized explicitly rather than relying on the
               * macro's expansion happening to supply the parentheses */
2412     else if (UTF8_IS_DOWNGRADEABLE_START(*p)) {
2413         if (flags) {
2414             U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
2415             result = toLOWER_LC(c);
2416         }
2417         else {
2418             return to_lower_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
2419                                    ustrp, lenp);
2420         }
2421     }
2422     else {  /* UTF-8, ord above 255 */
2423         result = CALL_LOWER_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp);
2424
2425         if (flags) {
2426             result = check_locale_boundary_crossing(p, result, ustrp, lenp);
2427         }
2428
2429         return result;
2430     }
2431
2432     /* Here, used locale rules.  Convert back to UTF-8 */
2433     if (UTF8_IS_INVARIANT(result)) {
2434         *ustrp = (U8) result;
2435         *lenp = 1;
2436     }
2437     else {
2438         *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2439         *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
2440         *lenp = 2;
2441     }
2442
2443     return result;
2444 }
2445
2446 /*
2447 =for apidoc to_utf8_fold
2448
2449 Instead use L</toFOLD_utf8>.
2450
2451 =cut */
2452
2453 /* Not currently externally documented, and subject to change,
2454  * in <flags>
2455  *      bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
2456  *                            locale are to be used.
2457  *      bit FOLD_FLAGS_FULL   is set iff full case folds are to be used;
2458  *                            otherwise simple folds
2459  *      bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
2460  *                            prohibited
2461  */
2462
2463 UV
2464 Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
2465 {
2466     UV result;
2467
2468     PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
2469
2470     /* These are mutually exclusive */
2471     assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
2472
2473     assert(p != ustrp); /* Otherwise overwrites */
2474
2475     if (flags & FOLD_FLAGS_LOCALE) {
2476         /* Treat a UTF-8 locale as not being in locale at all */
2477         if (IN_UTF8_CTYPE_LOCALE) {
2478             flags &= ~FOLD_FLAGS_LOCALE;
2479         }
2480         else {
2481             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2482         }
2483     }
2484
2485     if (UTF8_IS_INVARIANT(*p)) {
2486         if (flags & FOLD_FLAGS_LOCALE) {
2487             result = toFOLD_LC(*p);
2488         }
2489         else {
2490             return _to_fold_latin1(*p, ustrp, lenp,
2491                             flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
2492         }
2493     }
2494     else if UTF8_IS_DOWNGRADEABLE_START(*p) {
2495         if (flags & FOLD_FLAGS_LOCALE) {
2496             U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
2497             result = toFOLD_LC(c);
2498         }
2499         else {
2500             return _to_fold_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
2501                             ustrp, lenp,
2502                             flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
2503         }
2504     }
2505     else {  /* UTF-8, ord above 255 */
2506         result = CALL_FOLD_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
2507
2508         if (flags & FOLD_FLAGS_LOCALE) {
2509
2510 #           define LONG_S_T      LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
2511             const unsigned int long_s_t_len    = sizeof(LONG_S_T) - 1;
2512
2513 #         ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
2514 #           define CAP_SHARP_S   LATIN_CAPITAL_LETTER_SHARP_S_UTF8
2515
2516             const unsigned int cap_sharp_s_len = sizeof(CAP_SHARP_S) - 1;
2517
2518             /* Special case these two characters, as what normally gets
2519              * returned under locale doesn't work */
2520             if (UTF8SKIP(p) == cap_sharp_s_len
2521                 && memEQ((char *) p, CAP_SHARP_S, cap_sharp_s_len))
2522             {
2523                 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
2524                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2525                               "Can't do fc(\"\\x{1E9E}\") on non-UTF-8 locale; "
2526                               "resolved to \"\\x{17F}\\x{17F}\".");
2527                 goto return_long_s;
2528             }
2529             else
2530 #endif
2531                  if (UTF8SKIP(p) == long_s_t_len
2532                      && memEQ((char *) p, LONG_S_T, long_s_t_len))
2533             {
2534                 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
2535                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2536                               "Can't do fc(\"\\x{FB05}\") on non-UTF-8 locale; "
2537                               "resolved to \"\\x{FB06}\".");
2538                 goto return_ligature_st;
2539             }
2540
2541 #if    UNICODE_MAJOR_VERSION   == 3         \
2542     && UNICODE_DOT_VERSION     == 0         \
2543     && UNICODE_DOT_DOT_VERSION == 1
2544 #           define DOTTED_I   LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8
2545
2546             /* And special case this on this Unicode version only, for the same
2547              * reaons the other two are special cased.  They would cross the
2548              * 255/256 boundary which is forbidden under /l, and so the code
2549              * wouldn't catch that they are equivalent (which they are only in
2550              * this release) */
2551             else if (UTF8SKIP(p) == sizeof(DOTTED_I) - 1
2552                      && memEQ((char *) p, DOTTED_I, sizeof(DOTTED_I) - 1))
2553             {
2554                 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
2555                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2556                               "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
2557                               "resolved to \"\\x{0131}\".");
2558                 goto return_dotless_i;
2559             }
2560 #endif
2561
2562             return check_locale_boundary_crossing(p, result, ustrp, lenp);
2563         }
2564         else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
2565             return result;
2566         }
2567         else {
2568             /* This is called when changing the case of a UTF-8-encoded
2569              * character above the ASCII range, and the result should not
2570              * contain an ASCII character. */
2571
2572             UV original;    /* To store the first code point of <p> */
2573
2574             /* Look at every character in the result; if any cross the
2575             * boundary, the whole thing is disallowed */
2576             U8* s = ustrp;
2577             U8* e = ustrp + *lenp;
2578             while (s < e) {
2579                 if (isASCII(*s)) {
2580                     /* Crossed, have to return the original */
2581                     original = valid_utf8_to_uvchr(p, lenp);
2582
2583                     /* But in these instances, there is an alternative we can
2584                      * return that is valid */
2585                     if (original == LATIN_SMALL_LETTER_SHARP_S
2586 #ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
2587                         || original == LATIN_CAPITAL_LETTER_SHARP_S
2588 #endif
2589                     ) {
2590                         goto return_long_s;
2591                     }
2592                     else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
2593                         goto return_ligature_st;
2594                     }
2595 #if    UNICODE_MAJOR_VERSION   == 3         \
2596     && UNICODE_DOT_VERSION     == 0         \
2597     && UNICODE_DOT_DOT_VERSION == 1
2598
2599                     else if (original == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
2600                         goto return_dotless_i;
2601                     }
2602 #endif
2603                     Copy(p, ustrp, *lenp, char);
2604                     return original;
2605                 }
2606                 s += UTF8SKIP(s);
2607             }
2608
2609             /* Here, no characters crossed, result is ok as-is */
2610             return result;
2611         }
2612     }
2613
2614     /* Here, used locale rules.  Convert back to UTF-8 */
2615     if (UTF8_IS_INVARIANT(result)) {
2616         *ustrp = (U8) result;
2617         *lenp = 1;
2618     }
2619     else {
2620         *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2621         *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
2622         *lenp = 2;
2623     }
2624
2625     return result;
2626
2627   return_long_s:
2628     /* Certain folds to 'ss' are prohibited by the options, but they do allow
2629      * folds to a string of two of these characters.  By returning this
2630      * instead, then, e.g.,
2631      *      fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
2632      * works. */
2633
2634     *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
2635     Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
2636         ustrp, *lenp, U8);
2637     return LATIN_SMALL_LETTER_LONG_S;
2638
2639   return_ligature_st:
2640     /* Two folds to 'st' are prohibited by the options; instead we pick one and
2641      * have the other one fold to it */
2642
2643     *lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1;
2644     Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
2645     return LATIN_SMALL_LIGATURE_ST;
2646
2647 #if    UNICODE_MAJOR_VERSION   == 3         \
2648     && UNICODE_DOT_VERSION     == 0         \
2649     && UNICODE_DOT_DOT_VERSION == 1
2650
2651   return_dotless_i:
2652     *lenp = sizeof(LATIN_SMALL_LETTER_DOTLESS_I_UTF8) - 1;
2653     Copy(LATIN_SMALL_LETTER_DOTLESS_I_UTF8, ustrp, *lenp, U8);
2654     return LATIN_SMALL_LETTER_DOTLESS_I;
2655
2656 #endif
2657
2658 }
2659
2660 /* Note:
2661  * Returns a "swash" which is a hash described in utf8.c:Perl_swash_fetch().
2662  * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
2663  * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
      *
      * This is a thin wrapper around _core_swash_init().  It forwards <pkg>,
      * <name>, <listsv>, <minbits> and <none> unchanged, passes NULL for both
      * the inversion list and the flags pointer, and hands the result to
      * newSVsv() so that the caller gets its own copy.
2664  */
2665
2666 SV*
2667 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
2668 {
2669     PERL_ARGS_ASSERT_SWASH_INIT;
2670
2671     /* Returns a copy of a swash initiated by the called function.  This is the
2672      * public interface, and returning a copy prevents others from doing
2673      * mischief on the original */
2674
2675     return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none, NULL, NULL));
2676 }
2677
2678 SV*
2679 Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none, SV* invlist, U8* const flags_p)
2680 {
2681
2682     /*NOTE NOTE NOTE - If you want to use "return" in this routine you MUST
2683      * use the following define, so that PL_curpm (which is set to NULL
          * below) is put back to its saved value on the way out */
2684
2685 #define CORE_SWASH_INIT_RETURN(x)   \
2686     PL_curpm= old_PL_curpm;         \
2687     return x
2688
2689     /* Initialize and return a swash, creating it if necessary.  It does this
2690      * by calling utf8_heavy.pl in the general case.  The returned value may be
2691      * the swash's inversion list instead if the input parameters allow it.
2692      * Which is returned should be immaterial to callers, as the only
2693      * operations permitted on a swash, swash_fetch(), _get_swash_invlist(),
2694      * and swash_to_invlist() handle both these transparently.
2695      *
2696      * This interface should only be used by functions that won't destroy or
2697      * adversely change the swash, as doing so affects all other uses of the
2698      * swash in the program; the general public should use 'Perl_swash_init'
2699      * instead.
2700      *
2701      * pkg  is the name of the package that <name> should be in.
2702      * name is the name of the swash to find.  Typically it is a Unicode
2703      *      property name, including user-defined ones
2704      * listsv is a string to initialize the swash with.  It must be of the form
2705      *      documented as the subroutine return value in
2706      *      L<perlunicode/User-Defined Character Properties>
2707      * minbits is the number of bits required to represent each data element.
2708      *      It is '1' for binary properties.
2709      * none I (khw) do not understand this one, but it is used only in tr///.
2710      * invlist is an inversion list to initialize the swash with (or NULL)
2711      * flags_p if non-NULL is the address of various input and output flag bits
2712      *      to the routine, as follows:  ('I' means is input to the routine;
2713      *      'O' means output from the routine.  Only flags marked O are
2714      *      meaningful on return.)
2715      *  _CORE_SWASH_INIT_USER_DEFINED_PROPERTY indicates if the swash
2716      *      came from a user-defined property.  (I O)
2717      *  _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking
2718      *      when the swash cannot be located, to simply return NULL. (I)
2719      *  _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a
2720      *      return of an inversion list instead of a swash hash if this routine
2721      *      thinks that would result in faster execution of swash_fetch() later
2722      *      on. (I)
2723      *
2724      * Thus there are three possible inputs to find the swash: <name>,
2725      * <listsv>, and <invlist>.  At least one must be specified.  The result
2726      * will be the union of the specified ones, although <listsv>'s various
2727      * actions can intersect, etc. what <name> gives.  To avoid going out to
2728      * disk at all, <invlist> should specify completely what the swash should
2729      * have, and <listsv> should be &PL_sv_undef and <name> should be "".
2730      *
          * The only NULL return is the _CORE_SWASH_INIT_RETURN_IF_UNDEF case
          * described above; every other path returns <retval>.
          *
2731      * <invlist> is only valid for binary properties */
2732
2733     PMOP *old_PL_curpm= PL_curpm; /* save away the old PL_curpm */
2734
2735     SV* retval = &PL_sv_undef;
2736     HV* swash_hv = NULL;
          /* An inversion list no longer than this is returned on its own, without
           * a swash hash around it */
2737     const int invlist_swash_boundary =
2738         (flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST)
2739         ? 512    /* Based on some benchmarking, but not extensive, see commit
2740                     message */
2741         : -1;   /* Never return just an inversion list */
2742
2743     assert(listsv != &PL_sv_undef || strNE(name, "") || invlist);
2744     assert(! invlist || minbits == 1);
2745
2746     PL_curpm= NULL; /* reset PL_curpm so that we don't get confused between the regex
2747                        that triggered the swash init and the swash init perl logic itself.
2748                        See perl #122747 */
2749
2750     /* If data was passed in to go out to utf8_heavy to find the swash of, do
2751      * so */
2752     if (listsv != &PL_sv_undef || strNE(name, "")) {
2753         dSP;
2754         const size_t pkg_len = strlen(pkg);
2755         const size_t name_len = strlen(name);
2756         HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
2757         SV* errsv_save;
2758         GV *method;
2759
2760         PERL_ARGS_ASSERT__CORE_SWASH_INIT;
2761
2762         PUSHSTACKi(PERLSI_MAGIC);
2763         ENTER;
2764         SAVEHINTS();
2765         save_re_context();
2766         /* We might get here via a subroutine signature which uses a utf8
2767          * parameter name, at which point PL_subname will have been set
2768          * but not yet used. */
2769         save_item(PL_subname);
2770         if (PL_parser && PL_parser->error_count)
2771             SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
              /* See if SWASHNEW can already be found through <pkg>'s stash; if not,
               * load the module named <pkg> */
2772         method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
2773         if (!method) {  /* demand load UTF-8 */
2774             ENTER;
2775             if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
2776             GvSV(PL_errgv) = NULL;
2777 #ifndef NO_TAINT_SUPPORT
2778             /* It is assumed that callers of this routine are not passing in
2779              * any user derived data.  */
2780             /* Need to do this after save_re_context() as it will set
2781              * PL_tainted to 1 while saving $1 etc (see the code after getrx:
2782              * in Perl_magic_get).  Even the line that creates errsv_save can
2783              * turn on PL_tainted.  */
2784             SAVEBOOL(TAINT_get);
2785             TAINT_NOT;
2786 #endif
2787             Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
2788                              NULL);
2789             {
2790                 /* Not ERRSV, as there is no need to vivify a scalar we are
2791                    about to discard. */
2792                 SV * const errsv = GvSV(PL_errgv);
2793                 if (!SvTRUE(errsv)) {
2794                     GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
2795                     SvREFCNT_dec(errsv);
2796                 }
2797             }
2798             LEAVE;
2799         }
              /* Arguments for SWASHNEW, in order: package name, swash name, the
               * list, minbits, none */
2800         SPAGAIN;
2801         PUSHMARK(SP);
2802         EXTEND(SP,5);
2803         mPUSHp(pkg, pkg_len);
2804         mPUSHp(name, name_len);
2805         PUSHs(listsv);
2806         mPUSHi(minbits);
2807         mPUSHi(none);
2808         PUTBACK;
2809         if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
2810         GvSV(PL_errgv) = NULL;
2811         /* If we already have a pointer to the method, no need to use
2812          * call_method() to repeat the lookup.  */
2813         if (method
2814             ? call_sv(MUTABLE_SV(method), G_SCALAR)
2815             : call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR | G_METHOD))
2816         {
                  /* Take the returned value off the stack and hold our own
                   * reference to it */
2817             retval = *PL_stack_sp--;
2818             SvREFCNT_inc(retval);
2819         }
2820         {
2821             /* Not ERRSV.  See above. */
2822             SV * const errsv = GvSV(PL_errgv);
2823             if (!SvTRUE(errsv)) {
2824                 GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
2825                 SvREFCNT_dec(errsv);
2826             }
2827         }
2828         LEAVE;
2829         POPSTACK;
2830         if (IN_PERL_COMPILETIME) {
2831             CopHINTS_set(PL_curcop, PL_hints);
2832         }
              /* Anything other than a reference to a hash is not a swash.  A plain
               * string here is reported as the property that could not be found */
2833         if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
2834             if (SvPOK(retval)) {
2835
2836                 /* If caller wants to handle missing properties, let them */
2837                 if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) {
                          /* NOTE(review): <retval> had its reference count
                           * incremented above and nothing visible here releases it
                           * on this path -- confirm whether that is intended */
2838                     CORE_SWASH_INIT_RETURN(NULL);
2839                 }
2840                 Perl_croak(aTHX_
2841                            "Can't find Unicode property definition \"%"SVf"\"",
2842                            SVfARG(retval));
2843                 NOT_REACHED; /* NOTREACHED */
2844             }
2845         }
2846     } /* End of calling the module to find the swash */
2847
2848     /* If this operation fetched a swash, and we will need it later, get it */
2849     if (retval != &PL_sv_undef
2850         && (minbits == 1 || (flags_p
2851                             && ! (*flags_p
2852                                   & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY))))
2853     {
2854         swash_hv = MUTABLE_HV(SvRV(retval));
2855
2856         /* If we don't already know that there is a user-defined component to
2857          * this swash, and the user has indicated they wish to know if there is
2858          * one (by passing <flags_p>), find out */
2859         if (flags_p && ! (*flags_p & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)) {
2860             SV** user_defined = hv_fetchs(swash_hv, "USER_DEFINED", FALSE);
2861             if (user_defined && SvUV(*user_defined)) {
2862                 *flags_p |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
2863             }
2864         }
2865     }
2866
2867     /* Make sure there is an inversion list for binary properties */
2868     if (minbits == 1) {
2869         SV** swash_invlistsvp = NULL;
2870         SV* swash_invlist = NULL;
2871         bool invlist_in_swash_is_valid = FALSE;
2872         bool swash_invlist_unclaimed = FALSE; /* whether swash_invlist has
2873                                             an unclaimed reference count */
2874
2875         /* If this operation fetched a swash, get its already existing
2876          * inversion list, or create one for it */
2877
2878         if (swash_hv) {
                  /* The swash keeps its inversion list under the key "V" */
2879             swash_invlistsvp = hv_fetchs(swash_hv, "V", FALSE);
2880             if (swash_invlistsvp) {
2881                 swash_invlist = *swash_invlistsvp;
2882                 invlist_in_swash_is_valid = TRUE;
2883             }
2884             else {
2885                 swash_invlist = _swash_to_invlist(retval);
2886                 swash_invlist_unclaimed = TRUE;
2887             }
2888         }
2889
2890         /* If an inversion list was passed in, have to include it */
2891         if (invlist) {
2892
2893             /* Any fetched swash will by now have an inversion list in it;
2894              * otherwise <swash_invlist>  will be NULL, indicating that we
2895              * didn't fetch a swash */
2896             if (swash_invlist) {
2897
2898                 /* Add the passed-in inversion list, which invalidates the one
2899                  * already stored in the swash */
2900                 invlist_in_swash_is_valid = FALSE;
2901                 _invlist_union(invlist, swash_invlist, &swash_invlist);
2902             }
2903             else {
2904
2905                 /* Here, there is no swash already.  Set up a minimal one, if
2906                  * we are going to return a swash */
2907                 if ((int) _invlist_len(invlist) > invlist_swash_boundary) {
2908                     swash_hv = newHV();
2909                     retval = newRV_noinc(MUTABLE_SV(swash_hv));
2910                 }
2911                 swash_invlist = invlist;
2912             }
2913         }
2914
2915         /* Here, we have computed the union of all the passed-in data.  It may
2916          * be that there was an inversion list in the swash which didn't get
2917          * touched; otherwise save the computed one */
2918         if (! invlist_in_swash_is_valid
2919             && (int) _invlist_len(swash_invlist) > invlist_swash_boundary)
2920         {
2921             if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist))
2922             {
2923                 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
2924             }
2925             /* We just stole a reference count. */
2926             if (swash_invlist_unclaimed) swash_invlist_unclaimed = FALSE;
2927             else SvREFCNT_inc_simple_void_NN(swash_invlist);
2928         }
2929
2930         SvREADONLY_on(swash_invlist);
2931
2932         /* Use the inversion list stand-alone if small enough */
2933         if ((int) _invlist_len(swash_invlist) <= invlist_swash_boundary) {
2934             SvREFCNT_dec(retval);
                  /* The new reference created below needs a count of its own unless
                   * there is still an unclaimed one to hand over */
2935             if (!swash_invlist_unclaimed)
2936                 SvREFCNT_inc_simple_void_NN(swash_invlist);
2937             retval = newRV_noinc(swash_invlist);
2938         }
2939     }
2940
2941     CORE_SWASH_INIT_RETURN(retval);
2942 #undef CORE_SWASH_INIT_RETURN
2943 }
2944
2945
2946 /* This API is wrong for special case conversions since we may need to
2947  * return several Unicode characters for a single Unicode character
2948  * (see lib/unicore/SpecCase.txt).  The SWASHGET in lib/utf8_heavy.pl is
2949  * the lower-level routine, and it is similarly broken for returning
2950  * multiple values.  --jhi
2951  * For those, you should use S__to_utf8_case() instead */
2952 /* Now SWASHGET is recast into S_swatch_get in this file. */
2953
2954 /* Note:
2955  * Returns the value of property/mapping C<swash> for the first character
2956  * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
2957  * assumed to be in well-formed UTF-8. If C<do_utf8> is false, the string C<ptr>
2958  * is assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
2959  *
2960  * A "swash" is a hash which contains initially the keys/values set up by
2961  * SWASHNEW.  The purpose is to be able to completely represent a Unicode
2962  * property for all possible code points.  Things are stored in a compact form
2963  * (see utf8_heavy.pl) so that calculation is required to find the actual
2964  * property value for a given code point.  As code points are looked up, new
2965  * key/value pairs are added to the hash, so that the calculation doesn't have
2966  * to ever be re-done.  Further, each calculation is done, not just for the
2967  * desired one, but for a whole block of code points adjacent to that one.
2968  * For binary properties on ASCII machines, the block is usually for 64 code
2969  * points, starting with a code point evenly divisible by 64.  Thus if the
2970  * property value for code point 257 is requested, the code goes out and
2971  * calculates the property values for all 64 code points between 256 and 319,
2972  * and stores these as a single 64-bit long bit vector, called a "swatch",
2973  * under the key for code point 256.  The key is the UTF-8 encoding for code
2974  * point 256, minus the final byte.  Thus, if the length of the UTF-8 encoding
2975  * for a code point is 13 bytes, the key will be 12 bytes long.  If the value
2976  * for code point 258 is then requested, this code realizes that it would be
2977  * stored under the key for 256, and would find that value and extract the
2978  * relevant bit, offset from 256.
2979  *
2980  * Non-binary properties are stored in as many bits as necessary to represent
2981  * their values (32 currently, though the code is more general than that), not
2982  * as single bits, but the principle is the same: the value for each key is a
2983  * vector that encompasses the property values for all code points whose UTF-8
2984  * representations are represented by the key.  That is, for all code points
2985  * whose UTF-8 representations are length N bytes, and the key is the first N-1
2986  * bytes of that.
2987  */
2988 UV
2989 Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
2990 {
2991     HV *const hv = MUTABLE_HV(SvRV(swash));
2992     U32 klen;
2993     U32 off;
2994     STRLEN slen = 0;
2995     STRLEN needents;
2996     const U8 *tmps = NULL;
2997     SV *swatch;
2998     const U8 c = *ptr;
2999
3000     PERL_ARGS_ASSERT_SWASH_FETCH;
3001
3002     /* If it really isn't a hash, it isn't really swash; must be an inversion
3003      * list */
3004     if (SvTYPE(hv) != SVt_PVHV) {
3005         return _invlist_contains_cp((SV*)hv,
3006                                     (do_utf8)
3007                                      ? valid_utf8_to_uvchr(ptr, NULL)
3008                                      : c);
3009     }
3010
3011     /* We store the values in a "swatch" which is a vec() value in a swash
3012      * hash.  Code points 0-255 are a single vec() stored with key length
3013      * (klen) 0.  All other code points have a UTF-8 representation
3014      * 0xAA..0xYY,0xZZ.  A vec() is constructed containing all of them which
3015      * share 0xAA..0xYY, which is the key in the hash to that vec.  So the key
3016      * length for them is the length of the encoded char - 1.  ptr[klen] is the
3017      * final byte in the sequence representing the character */
3018     if (!do_utf8 || UTF8_IS_INVARIANT(c)) {
3019         klen = 0;
3020         needents = 256;
3021         off = c;
3022     }
3023     else if (UTF8_IS_DOWNGRADEABLE_START(c)) {
3024         klen = 0;
3025         needents = 256;
3026         off = EIGHT_BIT_UTF8_TO_NATIVE(c, *(ptr + 1));
3027     }
3028     else {
3029         klen = UTF8SKIP(ptr) - 1;
3030
3031         /* Each vec() stores 2**UTF_ACCUMULATION_SHIFT values.  The offset into
3032          * the vec is the final byte in the sequence.  (In EBCDIC this is
3033          * converted to I8 to get consecutive values.)  To help you visualize
3034          * all this:
3035          *                       Straight 1047   After final byte
3036          *             UTF-8      UTF-EBCDIC     I8 transform
3037          *  U+0400:  \xD0\x80    \xB8\x41\x41    \xB8\x41\xA0
3038          *  U+0401:  \xD0\x81    \xB8\x41\x42    \xB8\x41\xA1
3039          *    ...
3040          *  U+0409:  \xD0\x89    \xB8\x41\x4A    \xB8\x41\xA9
3041          *  U+040A:  \xD0\x8A    \xB8\x41\x51    \xB8\x41\xAA
3042          *    ...
3043          *  U+0412:  \xD0\x92    \xB8\x41\x59    \xB8\x41\xB2
3044          *  U+0413:  \xD0\x93    \xB8\x41\x62    \xB8\x41\xB3
3045          *    ...
3046          *  U+041B:  \xD0\x9B    \xB8\x41\x6A    \xB8\x41\xBB
3047          *  U+041C:  \xD0\x9C    \xB8\x41\x70    \xB8\x41\xBC
3048          *    ...
3049          *  U+041F:  \xD0\x9F    \xB8\x41\x73    \xB8\x41\xBF
3050          *  U+0420:  \xD0\xA0    \xB8\x42\x41    \xB8\x42\x41
3051          *
3052          * (There are no discontinuities in the elided (...) entries.)
3053          * The UTF-8 key for these 33 code points is '\xD0' (which also is the
3054          * key for the next 31, up through U+043F, whose UTF-8 final byte is
3055          * \xBF).  Thus in UTF-8, each key is for a vec() for 64 code points.
3056          * The final UTF-8 byte, which ranges between \x80 and \xBF, is an
3057          * index into the vec() swatch (after subtracting 0x80, which we
3058          * actually do with an '&').
3059          * In UTF-EBCDIC, each key is for a 32 code point vec().  The first 32
3060          * code points above have key '\xB8\x41'. The final UTF-EBCDIC byte has
3061          * dicontinuities which go away by transforming it into I8, and we
3062          * effectively subtract 0xA0 to get the index. */
3063         needents = (1 << UTF_ACCUMULATION_SHIFT);
3064         off      = NATIVE_UTF8_TO_I8(ptr[klen]) & UTF_CONTINUATION_MASK;
3065     }
3066
3067     /*
3068      * This single-entry cache saves about 1/3 of the UTF-8 overhead in test
3069      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
3070      * it's nothing to sniff at.)  Pity we usually come through at least
3071      * two function calls to get here...
3072      *
3073      * NB: this code assumes that swatches are never modified, once generated!
3074      */
3075
3076     if (hv   == PL_last_swash_hv &&
3077         klen == PL_last_swash_klen &&
3078         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
3079     {
3080         tmps = PL_last_swash_tmps;
3081         slen = PL_last_swash_slen;
3082     }
3083     else {
3084         /* Try our second-level swatch cache, kept in a hash. */
3085         SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
3086
3087         /* If not cached, generate it via swatch_get */
3088         if (!svp || !SvPOK(*svp)
3089                  || !(tmps = (const U8*)SvPV_const(*svp, slen)))
3090         {
3091             if (klen) {
3092                 const UV code_point = valid_utf8_to_uvchr(ptr, NULL);
3093                 swatch = swatch_get(swash,
3094                                     code_point & ~((UV)needents - 1),
3095                                     needents);
3096             }
3097             else {  /* For the first 256 code points, the swatch has a key of
3098                        length 0 */
3099                 swatch = swatch_get(swash, 0, needents);
3100             }
3101
3102             if (IN_PERL_COMPILETIME)
3103                 CopHINTS_set(PL_curcop, PL_hints);
3104
3105             svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
3106
3107             if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
3108                      || (slen << 3) < needents)
3109                 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch, "
3110                            "svp=%p, tmps=%p, slen=%"UVuf", needents=%"UVuf,
3111                            svp, tmps, (UV)slen, (UV)needents);
3112         }
3113
3114         PL_last_swash_hv = hv;
3115         assert(klen <= sizeof(PL_last_swash_key));
3116         PL_last_swash_klen = (U8)klen;
3117         /* FIXME change interpvar.h?  */
3118         PL_last_swash_tmps = (U8 *) tmps;
3119         PL_last_swash_slen = slen;
3120         if (klen)
3121             Copy(ptr, PL_last_swash_key, klen, U8);
3122     }
3123
3124     switch ((int)((slen << 3) / needents)) {
3125     case 1:
3126         return ((UV) tmps[off >> 3] & (1 << (off & 7))) != 0;
3127     case 8:
3128         return ((UV) tmps[off]);
3129     case 16:
3130         off <<= 1;
3131         return
3132             ((UV) tmps[off    ] << 8) +
3133             ((UV) tmps[off + 1]);
3134     case 32:
3135         off <<= 2;
3136         return
3137             ((UV) tmps[off    ] << 24) +
3138             ((UV) tmps[off + 1] << 16) +
3139             ((UV) tmps[off + 2] <<  8) +
3140             ((UV) tmps[off + 3]);
3141     }
3142     Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, "
3143                "slen=%"UVuf", needents=%"UVuf, (UV)slen, (UV)needents);
3144     NORETURN_FUNCTION_END;
3145 }
3146
3147 /* Read a single line of the main body of the swash input text.  These are of
3148  * the form:
3149  * 0053 0056    0073
3150  * where each number is hex.  The first two numbers form the minimum and
3151  * maximum of a range, and the third is the value associated with the range.
3152  * Not all swashes should have a third number
3153  *
3154  * On input: l    points to the beginning of the line to be examined; it points
3155  *                to somewhere in the string of the whole input text, and is
3156  *                terminated by a \n or the null string terminator.
3157  *           lend   points to the null terminator of that string
3158  *           wants_value    is non-zero if the swash expects a third number
3159  *           typestr is the name of the swash's mapping, like 'ToLower'
3160  * On output: *min, *max, and *val are set to the values read from the line.
3161  *            returns a pointer just beyond the line examined.  If there was no
3162  *            valid min number on the line, returns lend+1
3163  */
3164
3165 STATIC U8*
3166 S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
3167                              const bool wants_value, const U8* const typestr)
3168 {
3169     const int  typeto  = typestr[0] == 'T' && typestr[1] == 'o';
3170     STRLEN numlen;          /* Length of the number */
3171     I32 flags = PERL_SCAN_SILENT_ILLDIGIT
3172                 | PERL_SCAN_DISALLOW_PREFIX
3173                 | PERL_SCAN_SILENT_NON_PORTABLE;
3174
3175     /* nl points to the next \n in the scan */
3176     U8* const nl = (U8*)memchr(l, '\n', lend - l);
3177
3178     PERL_ARGS_ASSERT_SWASH_SCAN_LIST_LINE;
3179
3180     /* Get the first number on the line: the range minimum */
3181     numlen = lend - l;
3182     *min = grok_hex((char *)l, &numlen, &flags, NULL);
3183     *max = *min;    /* So can never return without setting max */
3184     if (numlen)     /* If found a hex number, position past it */
3185         l += numlen;
3186     else if (nl) {          /* Else, go handle next line, if any */
3187         return nl + 1;  /* 1 is length of "\n" */
3188     }
3189     else {              /* Else, no next line */
3190         return lend + 1;        /* to LIST's end at which \n is not found */
3191     }
3192
3193     /* The max range value follows, separated by a BLANK */
3194     if (isBLANK(*l)) {
3195         ++l;
3196         flags = PERL_SCAN_SILENT_ILLDIGIT
3197                 | PERL_SCAN_DISALLOW_PREFIX
3198                 | PERL_SCAN_SILENT_NON_PORTABLE;
3199         numlen = lend - l;
3200         *max = grok_hex((char *)l, &numlen, &flags, NULL);
3201         if (numlen)
3202             l += numlen;
3203         else    /* If no value here, it is a single element range */
3204             *max = *min;
3205
3206         /* Non-binary tables have a third entry: what the first element of the
3207          * range maps to.  The map for those currently read here is in hex */
3208         if (wants_value) {
3209             if (isBLANK(*l)) {
3210                 ++l;
3211                 flags = PERL_SCAN_SILENT_ILLDIGIT
3212                     | PERL_SCAN_DISALLOW_PREFIX
3213                     | PERL_SCAN_SILENT_NON_PORTABLE;
3214                 numlen = lend - l;
3215                 *val = grok_hex((char *)l, &numlen, &flags, NULL);
3216                 if (numlen)
3217                     l += numlen;
3218                 else
3219                     *val = 0;
3220             }
3221             else {
3222                 *val = 0;
3223                 if (typeto) {
3224                     /* diag_listed_as: To%s: illegal mapping '%s' */
3225                     Perl_croak(aTHX_ "%s: illegal mapping '%s'",
3226                                      typestr, l);
3227                 }
3228             }
3229         }
3230         else
3231             *val = 0; /* bits == 1, then any val should be ignored */
3232     }
3233     else { /* Nothing following range min, should be single element with no
3234               mapping expected */
3235         if (wants_value) {
3236             *val = 0;
3237             if (typeto) {
3238                 /* diag_listed_as: To%s: illegal mapping '%s' */
3239                 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
3240             }
3241         }
3242         else
3243             *val = 0; /* bits == 1, then val should be ignored */
3244     }
3245
3246     /* Position to next line if any, or EOF */
3247     if (nl)
3248         l = nl + 1;
3249     else
3250         l = lend;
3251
3252     return l;
3253 }
3254
3255 /* Note:
3256  * Returns a swatch (a bit vector string) for a code point sequence
3257  * that starts from the value C<start> and comprises the number C<span>.
3258  * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
3259  * Should be used via swash_fetch, which will cache the swatch in C<swash>.
3260  */
3261 STATIC SV*
3262 S_swatch_get(pTHX_ SV* swash, UV start, UV span)
3263 {
3264     SV *swatch;
3265     U8 *l, *lend, *x, *xend, *s, *send;
3266     STRLEN lcur, xcur, scur;
3267     HV *const hv = MUTABLE_HV(SvRV(swash));
3268     SV** const invlistsvp = hv_fetchs(hv, "V", FALSE);
3269
3270     SV** listsvp = NULL; /* The string containing the main body of the table */
3271     SV** extssvp = NULL;
3272     SV** invert_it_svp = NULL;
3273     U8* typestr = NULL;
3274     STRLEN bits;
3275     STRLEN octets; /* if bits == 1, then octets == 0 */
3276     UV  none;
3277     UV  end = start + span;
3278
3279     if (invlistsvp == NULL) {
3280         SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
3281         SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
3282         SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
3283         extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
3284         listsvp = hv_fetchs(hv, "LIST", FALSE);
3285         invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
3286
3287         bits  = SvUV(*bitssvp);
3288         none  = SvUV(*nonesvp);
3289         typestr = (U8*)SvPV_nolen(*typesvp);
3290     }
3291     else {
3292         bits = 1;
3293         none = 0;
3294     }
3295     octets = bits >> 3; /* if bits == 1, then octets == 0 */
3296
3297     PERL_ARGS_ASSERT_SWATCH_GET;
3298
3299     if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
3300         Perl_croak(aTHX_ "panic: swatch_get doesn't expect bits %"UVuf,
3301                                                  (UV)bits);
3302     }
3303
3304     /* If overflowed, use the max possible */
3305     if (end < start) {
3306         end = UV_MAX;
3307         span = end - start;
3308     }
3309
3310     /* create and initialize $swatch */
3311     scur   = octets ? (span * octets) : (span + 7) / 8;
3312     swatch = newSV(scur);
3313     SvPOK_on(swatch);
3314     s = (U8*)SvPVX(swatch);
3315     if (octets && none) {
3316         const U8* const e = s + scur;
3317         while (s < e) {
3318             if (bits == 8)
3319                 *s++ = (U8)(none & 0xff);
3320             else if (bits == 16) {
3321                 *s++ = (U8)((none >>  8) & 0xff);
3322                 *s++ = (U8)( none        & 0xff);
3323             }
3324             else if (bits == 32) {
3325                 *s++ = (U8)((none >> 24) & 0xff);
3326                 *s++ = (U8)((none >> 16) & 0xff);
3327                 *s++ = (U8)((none >>  8) & 0xff);
3328                 *s++ = (U8)( none        & 0xff);
3329             }
3330         }
3331         *s = '\0';
3332     }
3333     else {
3334         (void)memzero((U8*)s, scur + 1);
3335     }
3336     SvCUR_set(swatch, scur);
3337     s = (U8*)SvPVX(swatch);
3338
3339     if (invlistsvp) {   /* If has an inversion list set up use that */
3340         _invlist_populate_swatch(*invlistsvp, start, end, s);
3341         return swatch;
3342     }
3343
3344     /* read $swash->{LIST} */
3345     l = (U8*)SvPV(*listsvp, lcur);
3346     lend = l + lcur;
3347     while (l < lend) {
3348         UV min, max, val, upper;
3349         l = swash_scan_list_line(l, lend, &min, &max, &val,
3350                                                         cBOOL(octets), typestr);
3351         if (l > lend) {
3352             break;
3353         }
3354
3355         /* If looking for something beyond this range, go try the next one */
3356         if (max < start)
3357             continue;
3358
3359         /* <end> is generally 1 beyond where we want to set things, but at the
3360          * platform's infinity, where we can't go any higher, we want to
3361          * include the code point at <end> */
3362         upper = (max < end)
3363                 ? max
3364                 : (max != UV_MAX || end != UV_MAX)
3365                   ? end - 1
3366                   : end;
3367
3368         if (octets) {
3369             UV key;
3370             if (min < start) {
3371                 if (!none || val < none) {
3372                     val += start - min;
3373                 }
3374                 min = start;
3375             }
3376             for (key = min; key <= upper; key++) {
3377                 STRLEN offset;
3378                 /* offset must be non-negative (start <= min <= key < end) */
3379                 offset = octets * (key - start);
3380                 if (bits == 8)
3381                     s[offset] = (U8)(val & 0xff);
3382                 else if (bits == 16) {
3383                     s[offset    ] = (U8)((val >>  8) & 0xff);
3384                     s[offset + 1] = (U8)( val        & 0xff);
3385                 }
3386                 else if (bits == 32) {
3387                     s[offset    ] = (U8)((val >> 24) & 0xff);
3388                     s[offset + 1] = (U8)((val >> 16) & 0xff);
3389                     s[offset + 2] = (U8)((val >>  8) & 0xff);
3390                     s[offset + 3] = (U8)( val        & 0xff);
3391                 }
3392
3393                 if (!none || val < none)
3394                     ++val;
3395             }
3396         }
3397         else { /* bits == 1, then val should be ignored */
3398             UV key;
3399             if (min < start)
3400                 min = start;
3401
3402             for (key = min; key <= upper; key++) {
3403                 const STRLEN offset = (STRLEN)(key - start);
3404                 s[offset >> 3] |= 1 << (offset & 7);
3405             }
3406         }
3407     } /* while */
3408
3409     /* Invert if the data says it should be.  Assumes that bits == 1 */
3410     if (invert_it_svp && SvUV(*invert_it_svp)) {
3411
3412         /* Unicode properties should come with all bits above PERL_UNICODE_MAX
3413          * be 0, and their inversion should also be 0, as we don't succeed any
3414          * Unicode property matches for non-Unicode code points */
3415         if (start <= PERL_UNICODE_MAX) {
3416
3417             /* The code below assumes that we never cross the
3418              * Unicode/above-Unicode boundary in a range, as otherwise we would
3419              * have to figure out where to stop flipping the bits.  Since this
3420              * boundary is divisible by a large power of 2, and swatches comes
3421              * in small powers of 2, this should be a valid assumption */
3422             assert(start + span - 1 <= PERL_UNICODE_MAX);
3423
3424             send = s + scur;
3425             while (s < send) {
3426                 *s = ~(*s);
3427                 s++;
3428             }
3429         }
3430     }
3431
3432     /* read $swash->{EXTRAS}
3433      * This code also copied to swash_to_invlist() below */
3434     x = (U8*)SvPV(*extssvp, xcur);
3435     xend = x + xcur;
3436     while (x < xend) {
3437         STRLEN namelen;
3438         U8 *namestr;
3439         SV** othersvp;
3440         HV* otherhv;
3441         STRLEN otherbits;
3442         SV **otherbitssvp, *other;
3443         U8 *s, *o, *nl;
3444         STRLEN slen, olen;
3445
3446         const U8 opc = *x++;
3447         if (opc == '\n')
3448             continue;
3449
3450         nl = (U8*)memchr(x, '\n', xend - x);
3451
3452         if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
3453             if (nl) {
3454                 x = nl + 1; /* 1 is length of "\n" */
3455                 continue;
3456             }
3457             else {
3458                 x = xend; /* to EXTRAS' end at which \n is not found */
3459                 break;
3460             }
3461         }
3462
3463         namestr = x;
3464         if (nl) {
3465             namelen = nl - namestr;
3466             x = nl + 1;
3467         }
3468         else {
3469             namelen = xend - namestr;
3470             x = xend;
3471         }
3472
3473         othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
3474         otherhv = MUTABLE_HV(SvRV(*othersvp));
3475         otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
3476         otherbits = (STRLEN)SvUV(*otherbitssvp);
3477         if (bits < otherbits)
3478             Perl_croak(aTHX_ "panic: swatch_get found swatch size mismatch, "
3479                        "bits=%"UVuf", otherbits=%"UVuf, (UV)bits, (UV)otherbits);
3480
3481         /* The "other" swatch must be destroyed after. */
3482         other = swatch_get(*othersvp, start, span);
3483         o = (U8*)SvPV(other, olen);
3484
3485         if (!olen)
3486             Perl_croak(aTHX_ "panic: swatch_get got improper swatch");
3487
3488         s = (U8*)SvPV(swatch, slen);
3489         if (bits == 1 && otherbits == 1) {
3490             if (slen != olen)
3491                 Perl_croak(aTHX_ "panic: swatch_get found swatch length "
3492                            "mismatch, slen=%"UVuf", olen=%"UVuf,
3493                            (UV)slen, (UV)olen);
3494
3495             switch (opc) {
3496             case '+':
3497                 while (slen--)
3498                     *s++ |= *o++;
3499                 break;
3500             case '!':
3501                 while (slen--)
3502                     *s++ |= ~*o++;
3503                 break;
3504             case '-':
3505                 while (slen--)
3506                     *s++ &= ~*o++;
3507                 break;
3508             case '&':
3509                 while (slen--)
3510                     *s++ &= *o++;
3511                 break;
3512             default:
3513                 break;
3514             }
3515         }
3516         else {
3517             STRLEN otheroctets = otherbits >> 3;
3518             STRLEN offset = 0;
3519             U8* const send = s + slen;
3520
3521             while (s < send) {
3522                 UV otherval = 0;
3523
3524                 if (otherbits == 1) {
3525                     otherval = (o[offset >> 3] >> (offset & 7)) & 1;
3526                     ++offset;
3527                 }
3528                 else {
3529                     STRLEN vlen = otheroctets;
3530                     otherval = *o++;
3531                     while (--vlen) {
3532                         otherval <<= 8;
3533                         otherval |= *o++;
3534                     }
3535                 }
3536
3537                 if (opc == '+' && otherval)
3538                     NOOP;   /* replace with otherval */
3539                 else if (opc == '!' && !otherval)
3540                     otherval = 1;
3541                 else if (opc == '-' && otherval)
3542                     otherval = 0;
3543                 else if (opc == '&' && !otherval)
3544                     otherval = 0;
3545                 else {
3546                     s += octets; /* no replacement */
3547                     continue;
3548                 }
3549
3550                 if (bits == 8)
3551                     *s++ = (U8)( otherval & 0xff);
3552                 else if (bits == 16) {
3553                     *s++ = (U8)((otherval >>  8) & 0xff);
3554                     *s++ = (U8)( otherval        & 0xff);
3555                 }
3556                 else if (bits == 32) {
3557                     *s++ = (U8)((otherval >> 24) & 0xff);
3558                     *s++ = (U8)((otherval >> 16) & 0xff);
3559                     *s++ = (U8)((otherval >>  8) & 0xff);
3560                     *s++ = (U8)( otherval        & 0xff);
3561                 }
3562             }
3563         }
3564         sv_free(other); /* through with it! */
3565     } /* while */
3566     return swatch;
3567 }
3568
3569 HV*
3570 Perl__swash_inversion_hash(pTHX_ SV* const swash)
3571 {
3572
3573    /* Subject to change or removal.  For use only in regcomp.c and regexec.c
3574     * Can't be used on a property that is subject to user override, as it
3575     * relies on the value of SPECIALS in the swash which would be set by
3576     * utf8_heavy.pl to the hash in the non-overriden file, and hence is not set
3577     * for overridden properties
3578     *
3579     * Returns a hash which is the inversion and closure of a swash mapping.
3580     * For example, consider the input lines:
3581     * 004B              006B
3582     * 004C              006C
3583     * 212A              006B
3584     *
3585     * The returned hash would have two keys, the UTF-8 for 006B and the UTF-8 for
3586     * 006C.  The value for each key is an array.  For 006C, the array would
3587     * have two elements, the UTF-8 for itself, and for 004C.  For 006B, there
3588     * would be three elements in its array, the UTF-8 for 006B, 004B and 212A.
3589     *
3590     * Note that there are no elements in the hash for 004B, 004C, 212A.  The
3591     * keys are only code points that are folded-to, so it isn't a full closure.
3592     *
3593     * Essentially, for any code point, it gives all the code points that map to
3594     * it, or the list of 'froms' for that point.
3595     *
3596     * Currently it ignores any additions or deletions from other swashes,
3597     * looking at just the main body of the swash, and if there are SPECIALS
3598     * in the swash, at that hash
3599     *
3600     * The specials hash can be extra code points, and most likely consists of
3601     * maps from single code points to multiple ones (each expressed as a string
3602     * of UTF-8 characters).   This function currently returns only 1-1 mappings.
3603     * However consider this possible input in the specials hash:
3604     * "\xEF\xAC\x85" => "\x{0073}\x{0074}",         # U+FB05 => 0073 0074
3605     * "\xEF\xAC\x86" => "\x{0073}\x{0074}",         # U+FB06 => 0073 0074
3606     *
3607     * Both FB05 and FB06 map to the same multi-char sequence, which we don't
3608     * currently handle.  But it also means that FB05 and FB06 are equivalent in
3609     * a 1-1 mapping which we should handle, and this relationship may not be in
3610     * the main table.  Therefore this function examines all the multi-char
3611     * sequences and adds the 1-1 mappings that come out of that.
3612     *
3613     * XXX This function was originally intended to be multipurpose, but its
3614     * only use is quite likely to remain for constructing the inversion of
3615     * the CaseFolding (//i) property.  If it were more general purpose for
3616     * regex patterns, it would have to do the FB05/FB06 game for simple folds,
3617     * because certain folds are prohibited under /iaa and /il.  As an example,
3618     * in Unicode 3.0.1 both U+0130 and U+0131 fold to 'i', and hence are both
3619     * equivalent under /i.  But under /iaa and /il, the folds to 'i' are
3620     * prohibited, so we would not figure out that they fold to each other.
3621     * Code could be written to automatically figure this out, similar to the
3622     * code that does this for multi-character folds, but this is the only case
3623     * where something like this is ever likely to happen, as all the single
3624     * char folds to the 0-255 range are now quite settled.  Instead there is a
3625     * little special code that is compiled only for this Unicode version.  This
3626     * is smaller and didn't require much coding time to do.  But this makes
3627     * this routine strongly tied to being used just for CaseFolding.  If ever
3628     * it should be generalized, this would have to be fixed */
3629
3630     U8 *l, *lend;
3631     STRLEN lcur;
3632     HV *const hv = MUTABLE_HV(SvRV(swash));
3633
3634     /* The string containing the main body of the table.  This will have its
3635      * assertion fail if the swash has been converted to its inversion list */
3636     SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
3637
3638     SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
3639     SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
3640     SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
3641     /*SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);*/
3642     const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
3643     const STRLEN bits  = SvUV(*bitssvp);
3644     const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
3645     const UV     none  = SvUV(*nonesvp);
3646     SV **specials_p = hv_fetchs(hv, "SPECIALS", 0);
3647
3648     HV* ret = newHV();
3649
3650     PERL_ARGS_ASSERT__SWASH_INVERSION_HASH;
3651
3652     /* Must have at least 8 bits to get the mappings */
3653     if (bits != 8 && bits != 16 && bits != 32) {
3654         Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %"UVuf,
3655                                                  (UV)bits);
3656     }
3657
3658     if (specials_p) { /* It might be "special" (sometimes, but not always, a
3659                         mapping to more than one character */
3660
3661         /* Construct an inverse mapping hash for the specials */
3662         HV * const specials_hv = MUTABLE_HV(SvRV(*specials_p));
3663         HV * specials_inverse = newHV();
3664         char *char_from; /* the lhs of the map */
3665         I32 from_len;   /* its byte length */
3666         char *char_to;  /* the rhs of the map */
3667         I32 to_len;     /* its byte length */
3668         SV *sv_to;      /* and in a sv */
3669         AV* from_list;  /* list of things that map to each 'to' */
3670
3671         hv_iterinit(specials_hv);
3672
3673         /* The keys are the characters (in UTF-8) that map to the corresponding
3674          * UTF-8 string value.  Iterate through the list creating the inverse
3675          * list. */
3676         while ((sv_to = hv_iternextsv(specials_hv, &char_from, &from_len))) {
3677             SV** listp;
3678             if (! SvPOK(sv_to)) {
3679                 Perl_croak(aTHX_ "panic: value returned from hv_iternextsv() "
3680                            "unexpectedly is not a string, flags=%lu",
3681                            (unsigned long)SvFLAGS(sv_to));
3682             }
3683             /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %"UVXf", First char of to is %"UVXf"\n", valid_utf8_to_uvchr((U8*) char_from, 0), valid_utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/
3684
3685             /* Each key in the inverse list is a mapped-to value, and the key's
3686              * hash value is a list of the strings (each in UTF-8) that map to
3687              * it.  Those strings are all one character long */
3688             if ((listp = hv_fetch(specials_inverse,
3689                                     SvPVX(sv_to),
3690                                     SvCUR(sv_to), 0)))
3691             {
3692                 from_list = (AV*) *listp;
3693             }
3694             else { /* No entry yet for it: create one */
3695                 from_list = newAV();
3696                 if (! hv_store(specials_inverse,
3697                                 SvPVX(sv_to),
3698                                 SvCUR(sv_to),
3699                                 (SV*) from_list, 0))
3700                 {
3701                     Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3702                 }
3703             }
3704
3705             /* Here have the list associated with this 'to' (perhaps newly
3706              * created and empty).  Just add to it.  Note that we ASSUME that
3707              * the input is guaranteed to not have duplications, so we don't
3708              * check for that.  Duplications just slow down execution time. */
3709             av_push(from_list, newSVpvn_utf8(char_from, from_len, TRUE));
3710         }
3711
3712         /* Here, 'specials_inverse' contains the inverse mapping.  Go through
3713          * it looking for cases like the FB05/FB06 examples above.  There would
3714          * be an entry in the hash like
3715         *       'st' => [ FB05, FB06 ]
3716         * In this example we will create two lists that get stored in the
3717         * returned hash, 'ret':
3718         *       FB05 => [ FB05, FB06 ]
3719         *       FB06 => [ FB05, FB06 ]
3720         *
3721         * Note that there is nothing to do if the array only has one element.
3722         * (In the normal 1-1 case handled below, we don't have to worry about
3723         * two lists, as everything gets tied to the single list that is
3724         * generated for the single character 'to'.  But here, we are omitting
3725         * that list, ('st' in the example), so must have multiple lists.) */
3726         while ((from_list = (AV *) hv_iternextsv(specials_inverse,
3727                                                  &char_to, &to_len)))
3728         {
3729             if (av_tindex_nomg(from_list) > 0) {
3730                 SSize_t i;
3731
3732                 /* We iterate over all combinations of i,j to place each code
3733                  * point on each list */
3734                 for (i = 0; i <= av_tindex_nomg(from_list); i++) {
3735                     SSize_t j;
3736                     AV* i_list = newAV();
3737                     SV** entryp = av_fetch(from_list, i, FALSE);
3738                     if (entryp == NULL) {
3739                         Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
3740                     }
3741                     if (hv_fetch(ret, SvPVX(*entryp), SvCUR(*entryp), FALSE)) {
3742                         Perl_croak(aTHX_ "panic: unexpected entry for %s", SvPVX(*entryp));
3743                     }
3744                     if (! hv_store(ret, SvPVX(*entryp), SvCUR(*entryp),
3745                                    (SV*) i_list, FALSE))
3746                     {
3747                         Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3748                     }
3749
3750                     /* For DEBUG_U: UV u = valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/
3751                     for (j = 0; j <= av_tindex_nomg(from_list); j++) {
3752                         entryp = av_fetch(from_list, j, FALSE);
3753                         if (entryp == NULL) {
3754                             Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
3755                         }
3756
3757                         /* When i==j this adds itself to the list */
3758                         av_push(i_list, newSVuv(utf8_to_uvchr_buf(
3759                                         (U8*) SvPVX(*entryp),
3760                                         (U8*) SvPVX(*entryp) + SvCUR(*entryp),
3761                                         0)));
3762                         /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0), u));*/
3763                     }
3764                 }
3765             }
3766         }
3767         SvREFCNT_dec(specials_inverse); /* done with it */
3768     } /* End of specials */
3769
3770     /* read $swash->{LIST} */
3771
3772 #if    UNICODE_MAJOR_VERSION   == 3         \
3773     && UNICODE_DOT_VERSION     == 0         \
3774     && UNICODE_DOT_DOT_VERSION == 1
3775
3776     /* For this version only U+130 and U+131 are equivalent under qr//i.  Add a
3777      * rule so that things work under /iaa and /il */
3778
3779     SV * mod_listsv = sv_mortalcopy(*listsvp);
3780     sv_catpv(mod_listsv, "130\t130\t131\n");
3781     l = (U8*)SvPV(mod_listsv, lcur);
3782
3783 #else
3784
3785     l = (U8*)SvPV(*listsvp, lcur);
3786
3787 #endif
3788
3789     lend = l + lcur;
3790
3791     /* Go through each input line */
3792     while (l < lend) {
3793         UV min, max, val;
3794         UV inverse;
3795         l = swash_scan_list_line(l, lend, &min, &max, &val,
3796                                                      cBOOL(octets), typestr);
3797         if (l > lend) {
3798             break;
3799         }
3800
3801         /* Each element in the range is to be inverted */
3802         for (inverse = min; inverse <= max; inverse++) {
3803             AV* list;
3804             SV** listp;
3805             IV i;
3806             bool found_key = FALSE;
3807             bool found_inverse = FALSE;
3808
3809             /* The key is the inverse mapping */
3810             char key[UTF8_MAXBYTES+1];
3811             char* key_end = (char *) uvchr_to_utf8((U8*) key, val);
3812             STRLEN key_len = key_end - key;
3813
3814             /* Get the list for the map */
3815             if ((listp = hv_fetch(ret, key, key_len, FALSE))) {
3816                 list = (AV*) *listp;
3817             }
3818             else { /* No entry yet for it: create one */
3819                 list = newAV();
3820                 if (! hv_store(ret, key, key_len, (SV*) list, FALSE)) {
3821                     Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3822                 }
3823             }
3824
3825             /* Look through list to see if this inverse mapping already is
3826              * listed, or if there is a mapping to itself already */
3827             for (i = 0; i <= av_tindex_nomg(list); i++) {
3828                 SV** entryp = av_fetch(list, i, FALSE);
3829                 SV* entry;
3830                 UV uv;
3831                 if (entryp == NULL) {
3832                     Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
3833                 }
3834                 entry = *entryp;
3835                 uv = SvUV(entry);
3836                 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "list for %"UVXf" contains %"UVXf"\n", val, uv));*/
3837                 if (uv == val) {
3838                     found_key = TRUE;
3839                 }
3840                 if (uv == inverse) {
3841                     found_inverse = TRUE;
3842                 }
3843
3844                 /* No need to continue searching if found everything we are
3845                  * looking for */
3846                 if (found_key && found_inverse) {
3847                     break;
3848                 }
3849             }
3850
3851             /* Make sure there is a mapping to itself on the list */
3852             if (! found_key) {
3853                 av_push(list, newSVuv(val));
3854                 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, val, val));*/
3855             }
3856
3857
3858             /* Simply add the value to the list */
3859             if (! found_inverse) {
3860                 av_push(list, newSVuv(inverse));
3861                 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, inverse, val));*/
3862             }
3863
3864             /* swatch_get() increments the value of val for each element in the
3865              * range.  That makes more compact tables possible.  You can
3866              * express the capitalization, for example, of all consecutive
3867              * letters with a single line: 0061\t007A\t0041 This maps 0061 to
3868              * 0041, 0062 to 0042, etc.  I (khw) have never understood 'none',
3869              * and it's not documented; it appears to be used only in
3870              * implementing tr//; I copied the semantics from swatch_get(), just
3871              * in case */
3872             if (!none || val < none) {
3873                 ++val;
3874             }
3875         }
3876     }
3877
3878     return ret;
3879 }
3880
3881 SV*
3882 Perl__swash_to_invlist(pTHX_ SV* const swash)
3883 {
3884
3885    /* Subject to change or removal.  For use only in one place in regcomp.c.
3886     * Ownership is given to one reference count in the returned SV* */
3887
3888     U8 *l, *lend;
3889     char *loc;
3890     STRLEN lcur;
3891     HV *const hv = MUTABLE_HV(SvRV(swash));
3892     UV elements = 0;    /* Number of elements in the inversion list */
3893     U8 empty[] = "";
3894     SV** listsvp;
3895     SV** typesvp;
3896     SV** bitssvp;
3897     SV** extssvp;
3898     SV** invert_it_svp;
3899
3900     U8* typestr;
3901     STRLEN bits;
3902     STRLEN octets; /* if bits == 1, then octets == 0 */
3903     U8 *x, *xend;
3904     STRLEN xcur;
3905
3906     SV* invlist;
3907
3908     PERL_ARGS_ASSERT__SWASH_TO_INVLIST;
3909
3910     /* If not a hash, it must be the swash's inversion list instead */
3911     if (SvTYPE(hv) != SVt_PVHV) {
3912         return SvREFCNT_inc_simple_NN((SV*) hv);
3913     }
3914
3915     /* The string containing the main body of the table */
3916     listsvp = hv_fetchs(hv, "LIST", FALSE);
3917     typesvp = hv_fetchs(hv, "TYPE", FALSE);
3918     bitssvp = hv_fetchs(hv, "BITS", FALSE);
3919     extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
3920     invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
3921
3922     typestr = (U8*)SvPV_nolen(*typesvp);
3923     bits  = SvUV(*bitssvp);
3924     octets = bits >> 3; /* if bits == 1, then octets == 0 */
3925
3926     /* read $swash->{LIST} */
3927     if (SvPOK(*listsvp)) {
3928         l = (U8*)SvPV(*listsvp, lcur);
3929     }
3930     else {
3931         /* LIST legitimately doesn't contain a string during compilation phases
3932          * of Perl itself, before the Unicode tables are generated.  In this
3933          * case, just fake things up by creating an empty list */
3934         l = empty;
3935         lcur = 0;
3936     }
3937     loc = (char *) l;
3938     lend = l + lcur;
3939
3940     if (*l == 'V') {    /*  Inversion list format */
3941         const char *after_atou = (char *) lend;
3942         UV element0;
3943         UV* other_elements_ptr;
3944
3945         /* The first number is a count of the rest */
3946         l++;
3947         if (!grok_atoUV((const char *)l, &elements, &after_atou)) {
3948             Perl_croak(aTHX_ "panic: Expecting a valid count of elements at start of inversion list");
3949         }
3950         if (elements == 0) {
3951             invlist = _new_invlist(0);
3952         }
3953         else {
3954             while (isSPACE(*l)) l++;
3955             l = (U8 *) after_atou;
3956
3957             /* Get the 0th element, which is needed to setup the inversion list */
3958             while (isSPACE(*l)) l++;
3959             if (!grok_atoUV((const char *)l, &element0, &after_atou)) {
3960                 Perl_croak(aTHX_ "panic: Expecting a valid 0th element for inversion list");
3961             }
3962             l = (U8 *) after_atou;
3963             invlist = _setup_canned_invlist(elements, element0, &other_elements_ptr);
3964             elements--;
3965
3966             /* Then just populate the rest of the input */
3967             while (elements-- > 0) {
3968                 if (l > lend) {
3969                     Perl_croak(aTHX_ "panic: Expecting %"UVuf" more elements than available", elements);
3970                 }
3971                 while (isSPACE(*l)) l++;
3972                 if (!grok_atoUV((const char *)l, other_elements_ptr++, &after_atou)) {
3973                     Perl_croak(aTHX_ "panic: Expecting a valid element in inversion list");
3974                 }
3975                 l = (U8 *) after_atou;
3976             }
3977         }
3978     }
3979     else {
3980
3981         /* Scan the input to count the number of lines to preallocate array
3982          * size based on worst possible case, which is each line in the input
3983          * creates 2 elements in the inversion list: 1) the beginning of a
3984          * range in the list; 2) the beginning of a range not in the list.  */
3985         while ((loc = (strchr(loc, '\n'))) != NULL) {
3986             elements += 2;
3987             loc++;
3988         }
3989
3990         /* If the ending is somehow corrupt and isn't a new line, add another
3991          * element for the final range that isn't in the inversion list */
3992         if (! (*lend == '\n'
3993             || (*lend == '\0' && (lcur == 0 || *(lend - 1) == '\n'))))
3994         {
3995             elements++;
3996         }
3997
3998         invlist = _new_invlist(elements);
3999
4000         /* Now go through the input again, adding each range to the list */
4001         while (l < lend) {
4002             UV start, end;
4003             UV val;             /* Not used by this function */
4004
4005             l = swash_scan_list_line(l, lend, &start, &end, &val,
4006                                                         cBOOL(octets), typestr);
4007
4008             if (l > lend) {
4009                 break;
4010             }
4011
4012             invlist = _add_range_to_invlist(invlist, start, end);
4013         }
4014     }
4015
4016     /* Invert if the data says it should be */
4017     if (invert_it_svp && SvUV(*invert_it_svp)) {
4018         _invlist_invert(invlist);
4019     }
4020
4021     /* This code is copied from swatch_get()
4022      * read $swash->{EXTRAS} */
4023     x = (U8*)SvPV(*extssvp, xcur);
4024     xend = x + xcur;
4025     while (x < xend) {
4026         STRLEN namelen;
4027         U8 *namestr;
4028         SV** othersvp;
4029         HV* otherhv;
4030         STRLEN otherbits;
4031         SV **otherbitssvp, *other;
4032         U8 *nl;
4033
4034         const U8 opc = *x++;
4035         if (opc == '\n')
4036             continue;
4037
4038         nl = (U8*)memchr(x, '\n', xend - x);
4039
4040         if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
4041             if (nl) {
4042                 x = nl + 1; /* 1 is length of "\n" */
4043                 continue;
4044             }
4045             else {
4046                 x = xend; /* to EXTRAS' end at which \n is not found */
4047                 break;
4048             }
4049         }
4050
4051         namestr = x;
4052         if (nl) {
4053             namelen = nl - namestr;
4054             x = nl + 1;
4055         }
4056         else {
4057             namelen = xend - namestr;
4058             x = xend;
4059         }
4060
4061         othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
4062         otherhv = MUTABLE_HV(SvRV(*othersvp));
4063         otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
4064         otherbits = (STRLEN)SvUV(*otherbitssvp);
4065
4066         if (bits != otherbits || bits != 1) {
4067             Perl_croak(aTHX_ "panic: _swash_to_invlist only operates on boolean "
4068                        "properties, bits=%"UVuf", otherbits=%"UVuf,
4069                        (UV)bits, (UV)otherbits);
4070         }
4071
4072         /* The "other" swatch must be destroyed after. */
4073         other = _swash_to_invlist((SV *)*othersvp);
4074
4075         /* End of code copied from swatch_get() */
4076         switch (opc) {
4077         case '+':
4078             _invlist_union(invlist, other, &invlist);
4079             break;
4080         case '!':
4081             _invlist_union_maybe_complement_2nd(invlist, other, TRUE, &invlist);
4082             break;
4083         case '-':
4084             _invlist_subtract(invlist, other, &invlist);
4085             break;
4086         case '&':
4087             _invlist_intersection(invlist, other, &invlist);
4088             break;
4089         default:
4090             break;
4091         }
4092         sv_free(other); /* through with it! */
4093     }
4094
4095     SvREADONLY_on(invlist);
4096     return invlist;
4097 }
4098
4099 SV*
4100 Perl__get_swash_invlist(pTHX_ SV* const swash)
4101 {
4102     SV** ptr;
4103
4104     PERL_ARGS_ASSERT__GET_SWASH_INVLIST;
4105
4106     if (! SvROK(swash)) {
4107         return NULL;
4108     }
4109
4110     /* If it really isn't a hash, it isn't really swash; must be an inversion
4111      * list */
4112     if (SvTYPE(SvRV(swash)) != SVt_PVHV) {
4113         return SvRV(swash);
4114     }
4115
4116     ptr = hv_fetchs(MUTABLE_HV(SvRV(swash)), "V", FALSE);
4117     if (! ptr) {
4118         return NULL;
4119     }
4120
4121     return *ptr;
4122 }
4123
4124 bool
4125 Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
4126 {
4127     /* May change: warns if surrogates, non-character code points, or
4128      * non-Unicode code points are in s which has length len bytes.  Returns
4129      * TRUE if none found; FALSE otherwise.  The only other validity check is
4130      * to make sure that this won't exceed the string's length.
4131      *
4132      * Code points above the platform's C<IV_MAX> will raise a deprecation
4133      * warning, unless those are turned off.  */
4134
4135     const U8* const e = s + len;
4136     bool ok = TRUE;
4137
4138     PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
4139
4140     while (s < e) {
4141         if (UTF8SKIP(s) > len) {
4142             Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
4143                            "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
4144             return FALSE;
4145         }
4146         if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) {
4147             STRLEN char_len;
4148             if (UNLIKELY(UTF8_IS_SUPER(s, e))) {
4149                 if (   ckWARN_d(WARN_NON_UNICODE)
4150                     || (   ckWARN_d(WARN_DEPRECATED)
4151 #ifndef UV_IS_QUAD
4152                         && UNLIKELY(is_utf8_cp_above_31_bits(s, e))
4153 #else   /* Below is 64-bit words */
4154                         /* 2**63 and up meet these conditions provided we have
4155                          * a 64-bit word. */
4156 #   ifdef EBCDIC
4157                         && *s == 0xFE
4158                         && NATIVE_UTF8_TO_I8(s[1]) >= 0xA8
4159 #   else
4160                         && *s == 0xFF
4161                            /* s[1] being above 0x80 overflows */
4162                         && s[2] >= 0x88
4163 #   endif
4164 #endif
4165                 )) {
4166                     /* A side effect of this function will be to warn */
4167                     (void) utf8n_to_uvchr(s, e - s, &char_len, UTF8_WARN_SUPER);
4168                     ok = FALSE;
4169                 }
4170             }
4171             else if (UNLIKELY(UTF8_IS_SURROGATE(s, e))) {
4172                 if (ckWARN_d(WARN_SURROGATE)) {
4173                     /* This has a different warning than the one the called
4174                      * function would output, so can't just call it, unlike we
4175                      * do for the non-chars and above-unicodes */
4176                     UV uv = utf8_to_uvchr_buf(s, e, &char_len);
4177                     Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
4178                         "Unicode surrogate U+%04"UVXf" is illegal in UTF-8", uv);
4179                     ok = FALSE;
4180                 }
4181             }
4182             else if (UNLIKELY(UTF8_IS_NONCHAR(s, e)) && (ckWARN_d(WARN_NONCHAR))) {
4183                 /* A side effect of this function will be to warn */
4184                 (void) utf8n_to_uvchr(s, e - s, &char_len, UTF8_WARN_NONCHAR);
4185                 ok = FALSE;
4186             }
4187         }
4188         s += UTF8SKIP(s);
4189     }
4190
4191     return ok;
4192 }
4193
4194 /*
4195 =for apidoc pv_uni_display
4196
4197 Build to the scalar C<dsv> a displayable version of the string C<spv>,
4198 length C<len>, the displayable version being at most C<pvlim> bytes long
4199 (if longer, the rest is truncated and C<"..."> will be appended).
4200
4201 The C<flags> argument can have C<UNI_DISPLAY_ISPRINT> set to display
4202 C<isPRINT()>able characters as themselves, C<UNI_DISPLAY_BACKSLASH>
4203 to display the C<\\[nrfta\\]> as the backslashed versions (like C<"\n">)
4204 (C<UNI_DISPLAY_BACKSLASH> is preferred over C<UNI_DISPLAY_ISPRINT> for C<"\\">).
4205 C<UNI_DISPLAY_QQ> (and its alias C<UNI_DISPLAY_REGEX>) have both
4206 C<UNI_DISPLAY_BACKSLASH> and C<UNI_DISPLAY_ISPRINT> turned on.
4207
4208 The pointer to the PV of the C<dsv> is returned.
4209
4210 See also L</sv_uni_display>.
4211
4212 =cut */
4213 char *
4214 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
4215 {
4216     int truncated = 0;
4217     const char *s, *e;
4218
4219     PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
4220
4221     sv_setpvs(dsv, "");
4222     SvUTF8_off(dsv);
4223     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
4224          UV u;
4225           /* This serves double duty as a flag and a character to print after
4226              a \ when flags & UNI_DISPLAY_BACKSLASH is true.
4227           */
4228          char ok = 0;
4229
4230          if (pvlim && SvCUR(dsv) >= pvlim) {
4231               truncated++;
4232               break;
4233          }
4234          u = utf8_to_uvchr_buf((U8*)s, (U8*)e, 0);
4235          if (u < 256) {
4236              const unsigned char c = (unsigned char)u & 0xFF;
4237              if (flags & UNI_DISPLAY_BACKSLASH) {
4238                  switch (c) {
4239                  case '\n':
4240                      ok = 'n'; break;
4241                  case '\r':
4242                      ok = 'r'; break;
4243                  case '\t':
4244                      ok = 't'; break;
4245                  case '\f':
4246                      ok = 'f'; break;
4247                  case '\a':
4248                      ok = 'a'; break;
4249                  case '\\':
4250                      ok = '\\'; break;
4251                  default: break;
4252                  }
4253                  if (ok) {
4254                      const char string = ok;
4255                      sv_catpvs(dsv, "\\");
4256                      sv_catpvn(dsv, &string, 1);
4257                  }
4258              }
4259              /* isPRINT() is the locale-blind version. */
4260              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
4261                  const char string = c;
4262                  sv_catpvn(dsv, &string, 1);
4263                  ok = 1;
4264              }
4265          }
4266          if (!ok)
4267              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
4268     }
4269     if (truncated)
4270          sv_catpvs(dsv, "...");
4271
4272     return SvPVX(dsv);
4273 }
4274
4275 /*
4276 =for apidoc sv_uni_display
4277
4278 Build to the scalar C<dsv> a displayable version of the scalar C<sv>,
4279 the displayable version being at most C<pvlim> bytes long
4280 (if longer, the rest is truncated and "..." will be appended).
4281
4282 The C<flags> argument is as in L</pv_uni_display>().
4283
4284 The pointer to the PV of the C<dsv> is returned.
4285
4286 =cut
4287 */
4288 char *
4289 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
4290 {
4291     const char * const ptr =
4292         isREGEXP(ssv) ? RX_WRAPPED((REGEXP*)ssv) : SvPVX_const(ssv);
4293
4294     PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
4295
4296     return Perl_pv_uni_display(aTHX_ dsv, (const U8*)ptr,
4297                                 SvCUR(ssv), pvlim, flags);
4298 }
4299
4300 /*
4301 =for apidoc foldEQ_utf8
4302
4303 Returns true if the leading portions of the strings C<s1> and C<s2> (either or both
4304 of which may be in UTF-8) are the same case-insensitively; false otherwise.
4305 How far into the strings to compare is determined by other input parameters.
4306
4307 If C<u1> is true, the string C<s1> is assumed to be in UTF-8-encoded Unicode;
4308 otherwise it is assumed to be in native 8-bit encoding.  Correspondingly for C<u2>
4309 with respect to C<s2>.
4310
4311 If the byte length C<l1> is non-zero, it says how far into C<s1> to check for fold
4312 equality.  In other words, C<s1>+C<l1> will be used as a goal to reach.  The
4313 scan will not be considered to be a match unless the goal is reached, and
4314 scanning won't continue past that goal.  Correspondingly for C<l2> with respect to
4315 C<s2>.
4316
4317 If C<pe1> is non-C<NULL> and the pointer it points to is not C<NULL>, that pointer is
4318 considered an end pointer to the position 1 byte past the maximum point
4319 in C<s1> beyond which scanning will not continue under any circumstances.
4320 (This routine assumes that UTF-8 encoded input strings are not malformed;
4321 malformed input can cause it to read past C<pe1>).
4322 This means that if both C<l1> and C<pe1> are specified, and C<pe1>
4323 is less than C<s1>+C<l1>, the match will never be successful because it can
4324 never
4325 get as far as its goal (and in fact is asserted against).  Correspondingly for
4326 C<pe2> with respect to C<s2>.
4327
4328 At least one of C<s1> and C<s2> must have a goal (at least one of C<l1> and
4329 C<l2> must be non-zero), and if both do, both have to be
4330 reached for a successful match.   Also, if the fold of a character is multiple
4331 characters, all of them must be matched (see tr21 reference below for
4332 'folding').
4333
4334 Upon a successful match, if C<pe1> is non-C<NULL>,
4335 it will be set to point to the beginning of the I<next> character of C<s1>
4336 beyond what was matched.  Correspondingly for C<pe2> and C<s2>.
4337
4338 For case-insensitiveness, the "casefolding" of Unicode is used
4339 instead of upper/lowercasing both the characters, see
4340 L<http://www.unicode.org/unicode/reports/tr21/> (Case Mappings).
4341
4342 =cut */
4343
4344 /* A flags parameter has been added which may change, and hence isn't
4345  * externally documented.  Currently it is:
4346  *  0 for as-documented above
4347  *  FOLDEQ_UTF8_NOMIX_ASCII meaning that if a non-ASCII character folds to an
4348                             ASCII one, to not match
4349  *  FOLDEQ_LOCALE           is set iff the rules from the current underlying
4350  *                          locale are to be used.
4351  *  FOLDEQ_S1_ALREADY_FOLDED  s1 has already been folded before calling this
4352  *                          routine.  This allows that step to be skipped.
4353  *                          Currently, this requires s1 to be encoded as UTF-8
4354  *                          (u1 must be true), which is asserted for.
4355  *  FOLDEQ_S1_FOLDS_SANE    With either NOMIX_ASCII or LOCALE, no folds may
4356  *                          cross certain boundaries.  Hence, the caller should
4357  *                          let this function do the folding instead of
4358  *                          pre-folding.  This code contains an assertion to
4359  *                          that effect.  However, if the caller knows what
4360  *                          it's doing, it can pass this flag to indicate that,
4361  *                          and the assertion is skipped.
4362  *  FOLDEQ_S2_ALREADY_FOLDED  Similarly.
4363  *  FOLDEQ_S2_FOLDS_SANE
4364  */
4365 I32
4366 Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const char *s2, char **pe2, UV l2, bool u2, U32 flags)
4367 {
4368     const U8 *p1  = (const U8*)s1; /* Point to current char */
4369     const U8 *p2  = (const U8*)s2;
4370     const U8 *g1 = NULL;       /* goal for s1 */
4371     const U8 *g2 = NULL;
4372     const U8 *e1 = NULL;       /* Don't scan s1 past this */
4373     U8 *f1 = NULL;             /* Point to current folded */
4374     const U8 *e2 = NULL;
4375     U8 *f2 = NULL;
4376     STRLEN n1 = 0, n2 = 0;              /* Number of bytes in current char */
4377     U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
4378     U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
4379     U8 flags_for_folder = FOLD_FLAGS_FULL;
4380
4381     PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
4382
4383     assert( ! ((flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE))
4384                && (((flags & FOLDEQ_S1_ALREADY_FOLDED)
4385                      && !(flags & FOLDEQ_S1_FOLDS_SANE))
4386                    || ((flags & FOLDEQ_S2_ALREADY_FOLDED)
4387                        && !(flags & FOLDEQ_S2_FOLDS_SANE)))));
4388     /* The algorithm is to trial the folds without regard to the flags on
4389      * the first line of the above assert(), and then see if the result
4390      * violates them.  This means that the inputs can't be pre-folded to a
4391      * violating result, hence the assert.  This could be changed, with the
4392      * addition of extra tests here for the already-folded case, which would
4393      * slow it down.  That cost is more than any possible gain for when these
4394      * flags are specified, as the flags indicate /il or /iaa matching which
4395      * is less common than /iu, and I (khw) also believe that real-world /il
4396      * and /iaa matches are most likely to involve code points 0-255, and this
4397      * function only under rare conditions gets called for 0-255. */
4398
4399     if (flags & FOLDEQ_LOCALE) {
4400         if (IN_UTF8_CTYPE_LOCALE) {
4401             flags &= ~FOLDEQ_LOCALE;
4402         }
4403         else {
4404             flags_for_folder |= FOLD_FLAGS_LOCALE;
4405         }
4406     }
4407
4408     if (pe1) {
4409         e1 = *(U8**)pe1;
4410     }
4411
4412     if (l1) {
4413         g1 = (const U8*)s1 + l1;
4414     }
4415
4416     if (pe2) {
4417         e2 = *(U8**)pe2;
4418     }
4419
4420     if (l2) {
4421         g2 = (const U8*)s2 + l2;
4422     }
4423
4424     /* Must have at least one goal */
4425     assert(g1 || g2);
4426
4427     if (g1) {
4428
4429         /* Will never match if goal is out-of-bounds */
4430         assert(! e1  || e1 >= g1);
4431
4432         /* Here, there isn't an end pointer, or it is beyond the goal.  We
4433         * only go as far as the goal */
4434         e1 = g1;
4435     }
4436     else {
4437         assert(e1);    /* Must have an end for looking at s1 */
4438     }
4439
4440     /* Same for goal for s2 */
4441     if (g2) {
4442         assert(! e2  || e2 >= g2);
4443         e2 = g2;
4444     }
4445     else {
4446         assert(e2);
4447     }
4448
4449     /* If both operands are already folded, we could just do a memEQ on the
4450      * whole strings at once, but it would be better if the caller realized
4451      * this and didn't even call us */
4452
4453     /* Look through both strings, a character at a time */
4454     while (p1 < e1 && p2 < e2) {
4455
4456         /* If at the beginning of a new character in s1, get its fold to use
4457          * and the length of the fold. */
4458         if (n1 == 0) {
4459             if (flags & FOLDEQ_S1_ALREADY_FOLDED) {
4460                 f1 = (U8 *) p1;
4461                 assert(u1);
4462                 n1 = UTF8SKIP(f1);
4463             }
4464             else {
4465                 if (isASCII(*p1) && ! (flags & FOLDEQ_LOCALE)) {
4466
4467                     /* We have to forbid mixing ASCII with non-ASCII if the
4468                      * flags so indicate.  And, we can short circuit having to
4469                      * call the general functions for this common ASCII case,
4470                      * all of whose non-locale folds are also ASCII, and hence
4471                      * UTF-8 invariants, so the UTF8ness of the strings is not
4472                      * relevant. */
4473                     if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) {
4474                         return 0;
4475                     }
4476                     n1 = 1;
4477                     *foldbuf1 = toFOLD(*p1);
4478                 }
4479                 else if (u1) {
4480                     _to_utf8_fold_flags(p1, foldbuf1, &n1, flags_for_folder);
4481                 }
4482                 else {  /* Not UTF-8, get UTF-8 fold */
4483                     _to_uni_fold_flags(*p1, foldbuf1, &n1, flags_for_folder);
4484                 }
4485                 f1 = foldbuf1;
4486             }
4487         }
4488
4489         if (n2 == 0) {    /* Same for s2 */
4490             if (flags & FOLDEQ_S2_ALREADY_FOLDED) {
4491                 f2 = (U8 *) p2;
4492                 assert(u2);
4493                 n2 = UTF8SKIP(f2);
4494             }
4495             else {
4496                 if (isASCII(*p2) && ! (flags & FOLDEQ_LOCALE)) {
4497                     if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p1)) {
4498                         return 0;
4499                     }
4500                     n2 = 1;
4501                     *foldbuf2 = toFOLD(*p2);
4502                 }
4503                 else if (u2) {
4504                     _to_utf8_fold_flags(p2, foldbuf2, &n2, flags_for_folder);
4505                 }
4506                 else {
4507                     _to_uni_fold_flags(*p2, foldbuf2, &n2, flags_for_folder);
4508                 }
4509                 f2 = foldbuf2;
4510             }
4511         }
4512
4513         /* Here f1 and f2 point to the beginning of the strings to compare.
4514          * These strings are the folds of the next character from each input
4515          * string, stored in UTF-8. */
4516
4517         /* While there is more to look for in both folds, see if they
4518         * continue to match */
4519         while (n1 && n2) {
4520             U8 fold_length = UTF8SKIP(f1);
4521             if (fold_length != UTF8SKIP(f2)
4522                 || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE
4523                                                        function call for single
4524                                                        byte */
4525                 || memNE((char*)f1, (char*)f2, fold_length))
4526             {
4527                 return 0; /* mismatch */
4528             }
4529
4530             /* Here, they matched, advance past them */
4531             n1 -= fold_length;
4532             f1 += fold_length;
4533             n2 -= fold_length;
4534             f2 += fold_length;
4535         }
4536
4537         /* When reach the end of any fold, advance the input past it */
4538         if (n1 == 0) {
4539             p1 += u1 ? UTF8SKIP(p1) : 1;
4540         }
4541         if (n2 == 0) {
4542             p2 += u2 ? UTF8SKIP(p2) : 1;
4543         }
4544     } /* End of loop through both strings */
4545
4546     /* A match is defined by each scan that specified an explicit length
4547     * reaching its final goal, and the other not having matched a partial
4548     * character (which can happen when the fold of a character is more than one
4549     * character). */
4550     if (! ((g1 == 0 || p1 == g1) && (g2 == 0 || p2 == g2)) || n1 || n2) {
4551         return 0;
4552     }
4553
4554     /* Successful match.  Set output pointers */
4555     if (pe1) {
4556         *pe1 = (char*)p1;
4557     }
4558     if (pe2) {
4559         *pe2 = (char*)p2;
4560     }
4561     return 1;
4562 }
4563
4564 /* XXX The next two functions should likely be moved to mathoms.c once all
4565  * occurrences of them are removed from the core; some cpan-upstream modules
4566  * still use them */
4567
4568 U8 *
4569 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
4570 {
4571     PERL_ARGS_ASSERT_UVUNI_TO_UTF8;
4572
4573     return Perl_uvoffuni_to_utf8_flags(aTHX_ d, uv, 0);
4574 }
4575
4576 /*
4577 =for apidoc utf8n_to_uvuni
4578
4579 Instead use L</utf8_to_uvchr_buf>, or rarely, L</utf8n_to_uvchr>.
4580
4581 This function was useful for code that wanted to handle both EBCDIC and
4582 ASCII platforms with Unicode properties, but starting in Perl v5.20, the
4583 distinctions between the platforms have mostly been made invisible to most
4584 code, so this function is quite unlikely to be what you want.  If you do need
4585 this precise functionality, use instead
4586 C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>>
4587 or C<L<NATIVE_TO_UNI(utf8n_to_uvchr(...))|/utf8n_to_uvchr>>.
4588
4589 =cut
4590 */
4591
4592 UV
4593 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
4594 {
4595     PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
4596
4597     return NATIVE_TO_UNI(utf8n_to_uvchr(s, curlen, retlen, flags));
4598 }
4599
4600 /*
4601 =for apidoc uvuni_to_utf8_flags
4602
4603 Instead you almost certainly want to use L</uvchr_to_utf8> or
4604 L</uvchr_to_utf8_flags>.
4605
4606 This function is a deprecated synonym for L</uvoffuni_to_utf8_flags>,
4607 which itself, while not deprecated, should be used only in isolated
4608 circumstances.  These functions were useful for code that wanted to handle
4609 both EBCDIC and ASCII platforms with Unicode properties, but starting in Perl
4610 v5.20, the distinctions between the platforms have mostly been made invisible
4611 to most code, so this function is quite unlikely to be what you want.
4612
4613 =cut
4614 */
4615
4616 U8 *
4617 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
4618 {
4619     PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
4620
4621     return uvoffuni_to_utf8_flags(d, uv, flags);
4622 }
4623
4624 /*
4625  * ex: set ts=8 sts=4 sw=4 et:
4626  */