utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
   4  *    by Larry Wall and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  * 'What a fix!' said Sam.  'That's the one place in all the lands we've ever
  13  *  heard of that we don't want to see any closer; and that's the one place
  14  *  we're trying to get to!  And that's just where we can't get, nohow.'
  15  *
  16  *     [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
  17  *
  18  * 'Well do I understand your speech,' he answered in the same language;
  19  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  20  *  as is the custom in the West, if you wish to be answered?'
  21  *                           --Gandalf, addressing Théoden's door wardens
  22  *
  23  *     [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
  24  *
  25  * ...the travellers perceived that the floor was paved with stones of many
  26  * hues; branching runes and strange devices intertwined beneath their feet.
  27  *
  28  *     [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
  29  */
  30
  31 #include "EXTERN.h"
  32 #define PERL_IN_UTF8_C
  33 #include "perl.h"
  34 #include "inline_invlist.c"
  35 #include "charclass_invlists.h"
  36
  37 static const char unees[] =
  38     "Malformed UTF-8 character (unexpected end of string)";
  39
  40 /*
  41 =head1 Unicode Support
  42 These are various utility functions for manipulating UTF8-encoded
  43 strings.  For the uninitiated, this is a method of representing arbitrary
  44 Unicode characters as a variable number of bytes, in such a way that
  45 characters in the ASCII range are unmodified, and a zero byte never appears
  46 within non-zero characters.
  47
  48 =cut
  49 */
  50
  51 /*
  52 =for apidoc is_ascii_string
  53
  54 Returns true if the first C<len> bytes of the string C<s> are the same whether
  55 or not the string is encoded in UTF-8 (or UTF-EBCDIC on EBCDIC machines).  That
  56 is, if they are invariant.  On ASCII-ish machines, only ASCII characters
  57 fit this definition, hence the function's name.
  58
  59 If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
  60 use this option, that C<s> can't have embedded C<NUL> characters and has to
  61 have a terminating C<NUL> byte).
  62
  63 See also L</is_utf8_string>(), L</is_utf8_string_loclen>(), and L</is_utf8_string_loc>().
  64
  65 =cut
  66 */
  67
  68 bool
  69 Perl_is_ascii_string(const U8 *s, STRLEN len)
  70 {
  71     const U8* const send = s + (len ? len : strlen((const char *)s));
  72     const U8* x = s;
  73
  74     PERL_ARGS_ASSERT_IS_ASCII_STRING;
  75
  76     for (; x < send; ++x) {
  77         if (!UTF8_IS_INVARIANT(*x))
  78             break;
  79     }
  80
  81     return x == send;
  82 }
  83
  84 /*
  85 =for apidoc uvoffuni_to_utf8_flags
  86
  87 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
  88 Instead, B<Almost all code should use L</uvchr_to_utf8> or
  89 L</uvchr_to_utf8_flags>>.
  90
  91 This function is like them, but the input is a strict Unicode
  92 (as opposed to native) code point.  Only in very rare circumstances should code
  93 not be using the native code point.
  94
  95 For details, see the description for L</uvchr_to_utf8_flags>>.
  96
  97 =cut
  98 */
  99
 100 U8 *
 101 Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
 102 {
 103     PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
 104
 105     if (UNI_IS_INVARIANT(uv)) {
 106         *d++ = (U8) LATIN1_TO_NATIVE(uv);
 107         return d;
 108     }
 109
 110     /* The first problematic code point is the first surrogate */
 111     if (uv >= UNICODE_SURROGATE_FIRST
 112         && ckWARN3_d(WARN_SURROGATE, WARN_NON_UNICODE, WARN_NONCHAR))
 113     {
 114         if (UNICODE_IS_SURROGATE(uv)) {
 115             if (flags & UNICODE_WARN_SURROGATE) {
 116                 Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE),
 117                                             "UTF-16 surrogate U+%04"UVXf, uv);
 118             }
 119             if (flags & UNICODE_DISALLOW_SURROGATE) {
 120                 return NULL;
 121             }
 122         }
 123         else if (UNICODE_IS_SUPER(uv)) {
 124             if (flags & UNICODE_WARN_SUPER
 125                 || (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_WARN_FE_FF)))
 126             {
 127                 Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
 128                           "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
 129             }
 130             if (flags & UNICODE_DISALLOW_SUPER
 131                 || (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_DISALLOW_FE_FF)))
 132             {
 133                 return NULL;
 134             }
 135         }
 136         else if (UNICODE_IS_NONCHAR(uv)) {
 137             if (flags & UNICODE_WARN_NONCHAR) {
 138                 Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR),
 139                  "Unicode non-character U+%04"UVXf" is illegal for open interchange",
 140                  uv);
 141             }
 142             if (flags & UNICODE_DISALLOW_NONCHAR) {
 143                 return NULL;
 144             }
 145         }
 146     }
 147
 148 #if defined(EBCDIC)
 149     {
 150         STRLEN len  = OFFUNISKIP(uv);
 151         U8 *p = d+len-1;
 152         while (p > d) {
 153             *p-- = (U8) I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
 154             uv >>= UTF_ACCUMULATION_SHIFT;
 155         }
 156         *p = (U8) I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
 157         return d+len;
 158     }
 159 #else /* Non loop style */
 160     if (uv < 0x800) {
 161         *d++ = (U8)(( uv >>  6)         | 0xc0);
 162         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 163         return d;
 164     }
 165     if (uv < 0x10000) {
 166         *d++ = (U8)(( uv >> 12)         | 0xe0);
 167         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 168         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 169         return d;
 170     }
 171     if (uv < 0x200000) {
 172         *d++ = (U8)(( uv >> 18)         | 0xf0);
 173         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 174         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 175         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 176         return d;
 177     }
 178     if (uv < 0x4000000) {
 179         *d++ = (U8)(( uv >> 24)         | 0xf8);
 180         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 181         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 182         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 183         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 184         return d;
 185     }
 186     if (uv < 0x80000000) {
 187         *d++ = (U8)(( uv >> 30)         | 0xfc);
 188         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 189         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 190         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 191         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 192         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 193         return d;
 194     }
 195 #ifdef UTF8_QUAD_MAX
 196     if (uv < UTF8_QUAD_MAX)
 197 #endif
 198     {
 199         *d++ =                            0xfe; /* Can't match U+FEFF! */
 200         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 201         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 202         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 203         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 204         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 205         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 206         return d;
 207     }
 208 #ifdef UTF8_QUAD_MAX
 209     {
 210         *d++ =                            0xff;         /* Can't match U+FFFE! */
 211         *d++ =                            0x80;         /* 6 Reserved bits */
 212         *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);        /* 2 Reserved bits */
 213         *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
 214         *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
 215         *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
 216         *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
 217         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 218         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 219         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 220         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 221         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 222         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 223         return d;
 224     }
 225 #endif
 226 #endif /* Non loop style */
 227 }
 228 /*
 229 =for apidoc uvchr_to_utf8
 230
 231 Adds the UTF-8 representation of the native code point C<uv> to the end
 232 of the string C<d>; C<d> should have at least C<UNISKIP(uv)+1> (up to
 233 C<UTF8_MAXBYTES+1>) free bytes available.  The return value is the pointer to
 234 the byte after the end of the new character.  In other words,
 235
 236     d = uvchr_to_utf8(d, uv);
 237
 238 is the recommended wide native character-aware way of saying
 239
 240     *(d++) = uv;
 241
 242 This function accepts any UV as input.  To forbid or warn on non-Unicode code
 243 points, or those that may be problematic, see L</uvchr_to_utf8_flags>.
 244
 245 =cut
 246 */
 247
 248 /* This is also a macro */
 249 PERL_CALLCONV U8*       Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
 250
 251 U8 *
 252 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
 253 {
 254     return uvchr_to_utf8(d, uv);
 255 }
 256
 257 /*
 258 =for apidoc uvchr_to_utf8_flags
 259
 260 Adds the UTF-8 representation of the native code point C<uv> to the end
 261 of the string C<d>; C<d> should have at least C<UNISKIP(uv)+1> (up to
 262 C<UTF8_MAXBYTES+1>) free bytes available.  The return value is the pointer to
 263 the byte after the end of the new character.  In other words,
 264
 265     d = uvchr_to_utf8_flags(d, uv, flags);
 266
 267 or, in most cases,
 268
 269     d = uvchr_to_utf8_flags(d, uv, 0);
 270
 271 This is the Unicode-aware way of saying
 272
 273     *(d++) = uv;
 274
 275 This function will convert to UTF-8 (and not warn) even code points that aren't
 276 legal Unicode or are problematic, unless C<flags> contains one or more of the
 277 following flags:
 278
 279 If C<uv> is a Unicode surrogate code point and UNICODE_WARN_SURROGATE is set,
 280 the function will raise a warning, provided UTF8 warnings are enabled.  If instead
 281 UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL.
 282 If both flags are set, the function will both warn and return NULL.
 283
 284 The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags
 285 affect how the function handles a Unicode non-character.  And likewise, the
 286 UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags affect the handling of
 287 code points that are
 288 above the Unicode maximum of 0x10FFFF.  Code points above 0x7FFF_FFFF (which are
 289 even less portable) can be warned and/or disallowed even if other above-Unicode
 290 code points are accepted, by the UNICODE_WARN_FE_FF and UNICODE_DISALLOW_FE_FF
 291 flags.
 292
 293 And finally, the flag UNICODE_WARN_ILLEGAL_INTERCHANGE selects all four of the
 294 above WARN flags; and UNICODE_DISALLOW_ILLEGAL_INTERCHANGE selects all four
 295 DISALLOW flags.
 296
 297 =cut
 298 */
 299
 300 /* This is also a macro */
 301 PERL_CALLCONV U8*       Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
 302
 303 U8 *
 304 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
 305 {
 306     return uvchr_to_utf8_flags(d, uv, flags);
 307 }
 308
 309 /*
 310 =for apidoc is_utf8_char_buf
 311
 312 This is identical to the macro L</isUTF8_CHAR>.
 313
 314 =cut */
 315
 316 STRLEN
 317 Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end)
 318 {
 319
 320     PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF;
 321
 322     return isUTF8_CHAR(buf, buf_end);
 323 }
 324
 325 /*
 326 =for apidoc is_utf8_string
 327
 328 Returns true if the first C<len> bytes of string C<s> form a valid
 329 UTF-8 string, false otherwise.  If C<len> is 0, it will be calculated
 330 using C<strlen(s)> (which means if you use this option, that C<s> can't have
 331 embedded C<NUL> characters and has to have a terminating C<NUL> byte).  Note
 332 that all characters being ASCII constitute 'a valid UTF-8 string'.
 333
 334 See also L</is_ascii_string>(), L</is_utf8_string_loclen>(), and L</is_utf8_string_loc>().
 335
 336 =cut
 337 */
 338
 339 bool
 340 Perl_is_utf8_string(const U8 *s, STRLEN len)
 341 {
 342     const U8* const send = s + (len ? len : strlen((const char *)s));
 343     const U8* x = s;
 344
 345     PERL_ARGS_ASSERT_IS_UTF8_STRING;
 346
 347     while (x < send) {
 348         STRLEN len = isUTF8_CHAR(x, send);
 349         if (UNLIKELY(! len)) {
 350             return FALSE;
 351         }
 352         x += len;
 353     }
 354
 355     return TRUE;
 356 }
 357
 358 /*
 359 Implemented as a macro in utf8.h
 360
 361 =for apidoc is_utf8_string_loc
 362
 363 Like L</is_utf8_string> but stores the location of the failure (in the
 364 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
 365 "utf8ness success") in the C<ep>.
 366
 367 See also L</is_utf8_string_loclen>() and L</is_utf8_string>().
 368
 369 =for apidoc is_utf8_string_loclen
 370
 371 Like L</is_utf8_string>() but stores the location of the failure (in the
 372 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
 373 "utf8ness success") in the C<ep>, and the number of UTF-8
 374 encoded characters in the C<el>.
 375
 376 See also L</is_utf8_string_loc>() and L</is_utf8_string>().
 377
 378 =cut
 379 */
 380
 381 bool
 382 Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
 383 {
 384     const U8* const send = s + (len ? len : strlen((const char *)s));
 385     const U8* x = s;
 386     STRLEN outlen = 0;
 387
 388     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
 389
 390     while (x < send) {
 391         STRLEN len = isUTF8_CHAR(x, send);
 392         if (UNLIKELY(! len)) {
 393             goto out;
 394         }
 395         x += len;
 396         outlen++;
 397     }
 398
 399  out:
 400     if (el)
 401         *el = outlen;
 402
 403     if (ep)
 404         *ep = x;
 405     return (x == send);
 406 }
 407
 408 /*
 409
 410 =for apidoc utf8n_to_uvchr
 411
 412 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
 413 Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
 414
 415 Bottom level UTF-8 decode routine.
 416 Returns the native code point value of the first character in the string C<s>,
 417 which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
 418 C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
 419 the length, in bytes, of that character.
 420
 421 The value of C<flags> determines the behavior when C<s> does not point to a
 422 well-formed UTF-8 character.  If C<flags> is 0, when a malformation is found,
 423 zero is returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>) is the
 424 next possible position in C<s> that could begin a non-malformed character.
 425 Also, if UTF-8 warnings haven't been lexically disabled, a warning is raised.
 426
 427 Various ALLOW flags can be set in C<flags> to allow (and not warn on)
 428 individual types of malformations, such as the sequence being overlong (that
 429 is, when there is a shorter sequence that can express the same code point;
 430 overlong sequences are expressly forbidden in the UTF-8 standard due to
 431 potential security issues).  Another malformation example is the first byte of
 432 a character not being a legal first byte.  See F<utf8.h> for the list of such
 433 flags.  For allowed 0 length strings, this function returns 0; for allowed
 434 overlong sequences, the computed code point is returned; for all other allowed
 435 malformations, the Unicode REPLACEMENT CHARACTER is returned, as these have no
 436 determinable reasonable value.
 437
 438 The UTF8_CHECK_ONLY flag overrides the behavior when a non-allowed (by other
 439 flags) malformation is found.  If this flag is set, the routine assumes that
 440 the caller will raise a warning, and this function will silently just set
 441 C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
 442
 443 Note that this API requires disambiguation between successful decoding a C<NUL>
 444 character, and an error return (unless the UTF8_CHECK_ONLY flag is set), as
 445 in both cases, 0 is returned.  To disambiguate, upon a zero return, see if the
 446 first byte of C<s> is 0 as well.  If so, the input was a C<NUL>; if not, the
 447 input had an error.
 448
 449 Certain code points are considered problematic.  These are Unicode surrogates,
 450 Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
 451 By default these are considered regular code points, but certain situations
 452 warrant special handling for them.  If C<flags> contains
 453 UTF8_DISALLOW_ILLEGAL_INTERCHANGE, all three classes are treated as
 454 malformations and handled as such.  The flags UTF8_DISALLOW_SURROGATE,
 455 UTF8_DISALLOW_NONCHAR, and UTF8_DISALLOW_SUPER (meaning above the legal Unicode
 456 maximum) can be set to disallow these categories individually.
 457
 458 The flags UTF8_WARN_ILLEGAL_INTERCHANGE, UTF8_WARN_SURROGATE,
 459 UTF8_WARN_NONCHAR, and UTF8_WARN_SUPER will cause warning messages to be raised
 460 for their respective categories, but otherwise the code points are considered
 461 valid (not malformations).  To get a category to both be treated as a
 462 malformation and raise a warning, specify both the WARN and DISALLOW flags.
 463 (But note that warnings are not raised if lexically disabled nor if
 464 UTF8_CHECK_ONLY is also specified.)
 465
 466 Very large code points (above 0x7FFF_FFFF) are considered more problematic than
 467 the others that are above the Unicode legal maximum.  There are several
 468 reasons: they requre at least 32 bits to represent them on ASCII platforms, are
 469 not representable at all on EBCDIC platforms, and the original UTF-8
 470 specification never went above this number (the current 0x10FFFF limit was
 471 imposed later).  (The smaller ones, those that fit into 32 bits, are
 472 representable by a UV on ASCII platforms, but not by an IV, which means that
 473 the number of operations that can be performed on them is quite restricted.)
 474 The UTF-8 encoding on ASCII platforms for these large code points begins with a
 475 byte containing 0xFE or 0xFF.  The UTF8_DISALLOW_FE_FF flag will cause them to
 476 be treated as malformations, while allowing smaller above-Unicode code points.
 477 (Of course UTF8_DISALLOW_SUPER will treat all above-Unicode code points,
 478 including these, as malformations.)
 479 Similarly, UTF8_WARN_FE_FF acts just like
 480 the other WARN flags, but applies just to these code points.
 481
 482 All other code points corresponding to Unicode characters, including private
 483 use and those yet to be assigned, are never considered malformed and never
 484 warn.
 485
 486 =cut
 487 */
 488
 489 UV
 490 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 491 {
 492     dVAR;
 493     const U8 * const s0 = s;
 494     U8 overflow_byte = '\0';    /* Save byte in case of overflow */
 495     U8 * send;
 496     UV uv = *s;
 497     STRLEN expectlen;
 498     SV* sv = NULL;
 499     UV outlier_ret = 0; /* return value when input is in error or problematic
 500                          */
 501     UV pack_warn = 0;   /* Save result of packWARN() for later */
 502     bool unexpected_non_continuation = FALSE;
 503     bool overflowed = FALSE;
 504     bool do_overlong_test = TRUE;   /* May have to skip this test */
 505
 506     const char* const malformed_text = "Malformed UTF-8 character";
 507
 508     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
 509
 510     /* The order of malformation tests here is important.  We should consume as
 511      * few bytes as possible in order to not skip any valid character.  This is
 512      * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
 513      * http://unicode.org/reports/tr36 for more discussion as to why.  For
 514      * example, once we've done a UTF8SKIP, we can tell the expected number of
 515      * bytes, and could fail right off the bat if the input parameters indicate
 516      * that there are too few available.  But it could be that just that first
 517      * byte is garbled, and the intended character occupies fewer bytes.  If we
 518      * blindly assumed that the first byte is correct, and skipped based on
 519      * that number, we could skip over a valid input character.  So instead, we
 520      * always examine the sequence byte-by-byte.
 521      *
 522      * We also should not consume too few bytes, otherwise someone could inject
 523      * things.  For example, an input could be deliberately designed to
 524      * overflow, and if this code bailed out immediately upon discovering that,
 525      * returning to the caller C<*retlen> pointing to the very next byte (one
 526      * which is actually part of of the overflowing sequence), that could look
 527      * legitimate to the caller, which could discard the initial partial
 528      * sequence and process the rest, inappropriately */
 529
 530     /* Zero length strings, if allowed, of necessity are zero */
 531     if (UNLIKELY(curlen == 0)) {
 532         if (retlen) {
 533             *retlen = 0;
 534         }
 535
 536         if (flags & UTF8_ALLOW_EMPTY) {
 537             return 0;
 538         }
 539         if (! (flags & UTF8_CHECK_ONLY)) {
 540             sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (empty string)", malformed_text));
 541         }
 542         goto malformed;
 543     }
 544
 545     expectlen = UTF8SKIP(s);
 546
 547     /* A well-formed UTF-8 character, as the vast majority of calls to this
 548      * function will be for, has this expected length.  For efficiency, set
 549      * things up here to return it.  It will be overriden only in those rare
 550      * cases where a malformation is found */
 551     if (retlen) {
 552         *retlen = expectlen;
 553     }
 554
 555     /* An invariant is trivially well-formed */
 556     if (UTF8_IS_INVARIANT(uv)) {
 557         return uv;
 558     }
 559
 560     /* A continuation character can't start a valid sequence */
 561     if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
 562         if (flags & UTF8_ALLOW_CONTINUATION) {
 563             if (retlen) {
 564                 *retlen = 1;
 565             }
 566             return UNICODE_REPLACEMENT;
 567         }
 568
 569         if (! (flags & UTF8_CHECK_ONLY)) {
 570             sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected continuation byte 0x%02x, with no preceding start byte)", malformed_text, *s0));
 571         }
 572         curlen = 1;
 573         goto malformed;
 574     }
 575
 576     /* Here is not a continuation byte, nor an invariant.  The only thing left
 577      * is a start byte (possibly for an overlong) */
 578
 579 #ifdef EBCDIC
 580     uv = NATIVE_UTF8_TO_I8(uv);
 581 #endif
 582
 583     /* Remove the leading bits that indicate the number of bytes in the
 584      * character's whole UTF-8 sequence, leaving just the bits that are part of
 585      * the value */
 586     uv &= UTF_START_MASK(expectlen);
 587
 588     /* Now, loop through the remaining bytes in the character's sequence,
 589      * accumulating each into the working value as we go.  Be sure to not look
 590      * past the end of the input string */
 591     send =  (U8*) s0 + ((expectlen <= curlen) ? expectlen : curlen);
 592
 593     for (s = s0 + 1; s < send; s++) {
 594         if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
 595 #ifndef EBCDIC  /* Can't overflow in EBCDIC */
 596             if (uv & UTF_ACCUMULATION_OVERFLOW_MASK) {
 597
 598                 /* The original implementors viewed this malformation as more
 599                  * serious than the others (though I, khw, don't understand
 600                  * why, since other malformations also give very very wrong
 601                  * results), so there is no way to turn off checking for it.
 602                  * Set a flag, but keep going in the loop, so that we absorb
 603                  * the rest of the bytes that comprise the character. */
 604                 overflowed = TRUE;
 605                 overflow_byte = *s; /* Save for warning message's use */
 606             }
 607 #endif
 608             uv = UTF8_ACCUMULATE(uv, *s);
 609         }
 610         else {
 611             /* Here, found a non-continuation before processing all expected
 612              * bytes.  This byte begins a new character, so quit, even if
 613              * allowing this malformation. */
 614             unexpected_non_continuation = TRUE;
 615             break;
 616         }
 617     } /* End of loop through the character's bytes */
 618
 619     /* Save how many bytes were actually in the character */
 620     curlen = s - s0;
 621
 622     /* The loop above finds two types of malformations: non-continuation and/or
 623      * overflow.  The non-continuation malformation is really a too-short
 624      * malformation, as it means that the current character ended before it was
 625      * expected to (being terminated prematurely by the beginning of the next
 626      * character, whereas in the too-short malformation there just are too few
 627      * bytes available to hold the character.  In both cases, the check below
 628      * that we have found the expected number of bytes would fail if executed.)
 629      * Thus the non-continuation malformation is really unnecessary, being a
 630      * subset of the too-short malformation.  But there may be existing
 631      * applications that are expecting the non-continuation type, so we retain
 632      * it, and return it in preference to the too-short malformation.  (If this
 633      * code were being written from scratch, the two types might be collapsed
 634      * into one.)  I, khw, am also giving priority to returning the
 635      * non-continuation and too-short malformations over overflow when multiple
 636      * ones are present.  I don't know of any real reason to prefer one over
 637      * the other, except that it seems to me that multiple-byte errors trumps
 638      * errors from a single byte */
 639     if (UNLIKELY(unexpected_non_continuation)) {
 640         if (!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 641             if (! (flags & UTF8_CHECK_ONLY)) {
 642                 if (curlen == 1) {
 643                     sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, immediately after start byte 0x%02x)", malformed_text, *s, *s0));
 644                 }
 645                 else {
 646                     sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, %d bytes after start byte 0x%02x, expected %d bytes)", malformed_text, *s, (int) curlen, *s0, (int)expectlen));
 647                 }
 648             }
 649             goto malformed;
 650         }
 651         uv = UNICODE_REPLACEMENT;
 652
 653         /* Skip testing for overlongs, as the REPLACEMENT may not be the same
 654          * as what the original expectations were. */
 655         do_overlong_test = FALSE;
 656         if (retlen) {
 657             *retlen = curlen;
 658         }
 659     }
 660     else if (UNLIKELY(curlen < expectlen)) {
 661         if (! (flags & UTF8_ALLOW_SHORT)) {
 662             if (! (flags & UTF8_CHECK_ONLY)) {
 663                 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, *s0));
 664             }
 665             goto malformed;
 666         }
 667         uv = UNICODE_REPLACEMENT;
 668         do_overlong_test = FALSE;
 669         if (retlen) {
 670             *retlen = curlen;
 671         }
 672     }
 673
 674 #ifndef EBCDIC  /* EBCDIC can't overflow */
 675     if (UNLIKELY(overflowed)) {
 676         sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (overflow at byte 0x%02x, after start byte 0x%02x)", malformed_text, overflow_byte, *s0));
 677         goto malformed;
 678     }
 679 #endif
 680
 681     if (do_overlong_test
 682         && expectlen > (STRLEN) OFFUNISKIP(uv)
 683         && ! (flags & UTF8_ALLOW_LONG))
 684     {
 685         /* The overlong malformation has lower precedence than the others.
 686          * Note that if this malformation is allowed, we return the actual
 687          * value, instead of the replacement character.  This is because this
 688          * value is actually well-defined. */
 689         if (! (flags & UTF8_CHECK_ONLY)) {
 690             sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)expectlen, expectlen == 1 ? "": "s", OFFUNISKIP(uv), *s0));
 691         }
 692         goto malformed;
 693     }
 694
 695     /* Here, the input is considered to be well-formed, but it still could be a
 696      * problematic code point that is not allowed by the input parameters. */
 697     if (uv >= UNICODE_SURROGATE_FIRST /* isn't problematic if < this */
 698         && (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE
 699                      |UTF8_WARN_ILLEGAL_INTERCHANGE)))
 700     {
 701         if (UNICODE_IS_SURROGATE(uv)) {
 702
 703             /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
 704              * generation of the sv, since no warnings are raised under CHECK */
 705             if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE
 706                 && ckWARN_d(WARN_SURROGATE))
 707             {
 708                 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv));
 709                 pack_warn = packWARN(WARN_SURROGATE);
 710             }
 711             if (flags & UTF8_DISALLOW_SURROGATE) {
 712                 goto disallowed;
 713             }
 714         }
 715         else if ((uv > PERL_UNICODE_MAX)) {
 716             if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER
 717                 && ckWARN_d(WARN_NON_UNICODE))
 718             {
 719                 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv));
 720                 pack_warn = packWARN(WARN_NON_UNICODE);
 721             }
 722 #ifndef EBCDIC  /* EBCDIC always allows FE, FF */
 723
 724             /* The first byte being 0xFE or 0xFF is a subset of the SUPER code
 725              * points.  We test for these after the regular SUPER ones, and
 726              * before possibly bailing out, so that the more dire warning
 727              * overrides the regular one, if applicable */
 728             if ((*s0 & 0xFE) == 0xFE    /* matches both FE, FF */
 729                 && (flags & (UTF8_WARN_FE_FF|UTF8_DISALLOW_FE_FF)))
 730             {
 731                 if ((flags & (UTF8_WARN_FE_FF|UTF8_CHECK_ONLY))
 732                                                             == UTF8_WARN_FE_FF
 733                     && ckWARN_d(WARN_UTF8))
 734                 {
 735                     sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%"UVXf" is not Unicode, and not portable", uv));
 736                     pack_warn = packWARN(WARN_UTF8);
 737                 }
 738                 if (flags & UTF8_DISALLOW_FE_FF) {
 739                     goto disallowed;
 740                 }
 741             }
 742 #endif
 743             if (flags & UTF8_DISALLOW_SUPER) {
 744                 goto disallowed;
 745             }
 746         }
 747         else if (UNICODE_IS_NONCHAR(uv)) {
 748             if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR
 749                 && ckWARN_d(WARN_NONCHAR))
 750             {
 751                 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv));
 752                 pack_warn = packWARN(WARN_NONCHAR);
 753             }
 754             if (flags & UTF8_DISALLOW_NONCHAR) {
 755                 goto disallowed;
 756             }
 757         }
 758
 759         if (sv) {
 760             outlier_ret = uv;   /* Note we don't bother to convert to native,
 761                                    as all the outlier code points are the same
 762                                    in both ASCII and EBCDIC */
 763             goto do_warn;
 764         }
 765
 766         /* Here, this is not considered a malformed character, so drop through
 767          * to return it */
 768     }
 769
 770     return UNI_TO_NATIVE(uv);
 771
 772     /* There are three cases which get to beyond this point.  In all 3 cases:
 773      * <sv>         if not null points to a string to print as a warning.
 774      * <curlen>     is what <*retlen> should be set to if UTF8_CHECK_ONLY isn't
 775      *              set.
 776      * <outlier_ret> is what return value to use if UTF8_CHECK_ONLY isn't set.
 777      *              This is done by initializing it to 0, and changing it only
 778      *              for case 1).
 779      * The 3 cases are:
 780      * 1)   The input is valid but problematic, and to be warned about.  The
 781      *      return value is the resultant code point; <*retlen> is set to
 782      *      <curlen>, the number of bytes that comprise the code point.
 783      *      <pack_warn> contains the result of packWARN() for the warning
 784      *      types.  The entry point for this case is the label <do_warn>;
 785      * 2)   The input is a valid code point but disallowed by the parameters to
 786      *      this function.  The return value is 0.  If UTF8_CHECK_ONLY is set,
 787      *      <*relen> is -1; otherwise it is <curlen>, the number of bytes that
 788      *      comprise the code point.  <pack_warn> contains the result of
 789      *      packWARN() for the warning types.  The entry point for this case is
 790      *      the label <disallowed>.
 791      * 3)   The input is malformed.  The return value is 0.  If UTF8_CHECK_ONLY
 792      *      is set, <*relen> is -1; otherwise it is <curlen>, the number of
 793      *      bytes that comprise the malformation.  All such malformations are
 794      *      assumed to be warning type <utf8>.  The entry point for this case
 795      *      is the label <malformed>.
 796      */
 797
 798 malformed:
 799
 800     if (sv && ckWARN_d(WARN_UTF8)) {
 801         pack_warn = packWARN(WARN_UTF8);
 802     }
 803
 804 disallowed:
 805
 806     if (flags & UTF8_CHECK_ONLY) {
 807         if (retlen)
 808             *retlen = ((STRLEN) -1);
 809         return 0;
 810     }
 811
 812 do_warn:
 813
 814     if (pack_warn) {    /* <pack_warn> was initialized to 0, and changed only
 815                            if warnings are to be raised. */
 816         const char * const string = SvPVX_const(sv);
 817
 818         if (PL_op)
 819             Perl_warner(aTHX_ pack_warn, "%s in %s", string,  OP_DESC(PL_op));
 820         else
 821             Perl_warner(aTHX_ pack_warn, "%s", string);
 822     }
 823
 824     if (retlen) {
 825         *retlen = curlen;
 826     }
 827
 828     return outlier_ret;
 829 }
 830
 831 /*
 832 =for apidoc utf8_to_uvchr_buf
 833
 834 Returns the native code point of the first character in the string C<s> which
 835 is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
 836 C<*retlen> will be set to the length, in bytes, of that character.
 837
 838 If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
 839 enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
 840 NULL) to -1.  If those warnings are off, the computed value, if well-defined
 841 (or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
 842 C<*retlen> is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is
 843 the next possible position in C<s> that could begin a non-malformed character.
 844 See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
 845 returned.
 846
 847 =cut
 848 */
 849
 850
 851 UV
 852 Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
 853 {
 854     assert(s < send);
 855
 856     return utf8n_to_uvchr(s, send - s, retlen,
 857                           ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 858 }
 859
 860 /* Like L</utf8_to_uvchr_buf>(), but should only be called when it is known that
 861  * there are no malformations in the input UTF-8 string C<s>.  surrogates,
 862  * non-character code points, and non-Unicode code points are allowed. */
 863
 864 UV
 865 Perl_valid_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
 866 {
 867     UV expectlen = UTF8SKIP(s);
 868     const U8* send = s + expectlen;
 869     UV uv = *s;
 870
 871     PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
 872
 873     if (retlen) {
 874         *retlen = expectlen;
 875     }
 876
 877     /* An invariant is trivially returned */
 878     if (expectlen == 1) {
 879         return uv;
 880     }
 881
 882 #ifdef EBCDIC
 883     uv = NATIVE_UTF8_TO_I8(uv);
 884 #endif
 885
 886     /* Remove the leading bits that indicate the number of bytes, leaving just
 887      * the bits that are part of the value */
 888     uv &= UTF_START_MASK(expectlen);
 889
 890     /* Now, loop through the remaining bytes, accumulating each into the
 891      * working total as we go.  (I khw tried unrolling the loop for up to 4
 892      * bytes, but there was no performance improvement) */
 893     for (++s; s < send; s++) {
 894         uv = UTF8_ACCUMULATE(uv, *s);
 895     }
 896
 897     return UNI_TO_NATIVE(uv);
 898
 899 }
 900
 901 /*
 902 =for apidoc utf8_to_uvuni_buf
 903
 904 Only in very rare circumstances should code need to be dealing in Unicode
 905 (as opposed to native) code points.  In those few cases, use
 906 C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>> instead.
 907
 908 Returns the Unicode (not-native) code point of the first character in the
 909 string C<s> which
 910 is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
 911 C<retlen> will be set to the length, in bytes, of that character.
 912
 913 If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
 914 enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
 915 NULL) to -1.  If those warnings are off, the computed value if well-defined (or
 916 the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
 917 is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
 918 next possible position in C<s> that could begin a non-malformed character.
 919 See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
 920
 921 =cut
 922 */
 923
 924 UV
 925 Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
 926 {
 927     PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
 928
 929     assert(send > s);
 930
 931     /* Call the low level routine asking for checks */
 932     return NATIVE_TO_UNI(Perl_utf8n_to_uvchr(aTHX_ s, send -s, retlen,
 933                                ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY));
 934 }
 935
 936 /*
 937 =for apidoc utf8_length
 938
 939 Return the length of the UTF-8 char encoded string C<s> in characters.
 940 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 941 up past C<e>, croaks.
 942
 943 =cut
 944 */
 945
 946 STRLEN
 947 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
 948 {
 949     dVAR;
 950     STRLEN len = 0;
 951
 952     PERL_ARGS_ASSERT_UTF8_LENGTH;
 953
 954     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 955      * the bitops (especially ~) can create illegal UTF-8.
 956      * In other words: in Perl UTF-8 is not just for Unicode. */
 957
 958     if (e < s)
 959         goto warn_and_return;
 960     while (s < e) {
 961         s += UTF8SKIP(s);
 962         len++;
 963     }
 964
 965     if (e != s) {
 966         len--;
 967         warn_and_return:
 968         if (PL_op)
 969             Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
 970                              "%s in %s", unees, OP_DESC(PL_op));
 971         else
 972             Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
 973     }
 974
 975     return len;
 976 }
 977
 978 /*
 979 =for apidoc utf8_distance
 980
 981 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
 982 and C<b>.
 983
 984 WARNING: use only if you *know* that the pointers point inside the
 985 same UTF-8 buffer.
 986
 987 =cut
 988 */
 989
 990 IV
 991 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
 992 {
 993     PERL_ARGS_ASSERT_UTF8_DISTANCE;
 994
 995     return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
 996 }
 997
 998 /*
 999 =for apidoc utf8_hop
1000
1001 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
1002 forward or backward.
1003
1004 WARNING: do not use the following unless you *know* C<off> is within
1005 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
1006 on the first byte of character or just after the last byte of a character.
1007
1008 =cut
1009 */
1010
1011 U8 *
1012 Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
1013 {
1014     PERL_ARGS_ASSERT_UTF8_HOP;
1015
1016     PERL_UNUSED_CONTEXT;
1017     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1018      * the bitops (especially ~) can create illegal UTF-8.
1019      * In other words: in Perl UTF-8 is not just for Unicode. */
1020
1021     if (off >= 0) {
1022         while (off--)
1023             s += UTF8SKIP(s);
1024     }
1025     else {
1026         while (off++) {
1027             s--;
1028             while (UTF8_IS_CONTINUATION(*s))
1029                 s--;
1030         }
1031     }
1032     return (U8 *)s;
1033 }
1034
1035 /*
1036 =for apidoc bytes_cmp_utf8
1037
1038 Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
1039 sequence of characters (stored as UTF-8)
1040 in C<u>, C<ulen>.  Returns 0 if they are
1041 equal, -1 or -2 if the first string is less than the second string, +1 or +2
1042 if the first string is greater than the second string.
1043
1044 -1 or +1 is returned if the shorter string was identical to the start of the
1045 longer string.  -2 or +2 is returned if
1046 there was a difference between characters
1047 within the strings.
1048
1049 =cut
1050 */
1051
1052 int
1053 Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
1054 {
1055     const U8 *const bend = b + blen;
1056     const U8 *const uend = u + ulen;
1057
1058     PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
1059
1060     PERL_UNUSED_CONTEXT;
1061
1062     while (b < bend && u < uend) {
1063         U8 c = *u++;
1064         if (!UTF8_IS_INVARIANT(c)) {
1065             if (UTF8_IS_DOWNGRADEABLE_START(c)) {
1066                 if (u < uend) {
1067                     U8 c1 = *u++;
1068                     if (UTF8_IS_CONTINUATION(c1)) {
1069                         c = TWO_BYTE_UTF8_TO_NATIVE(c, c1);
1070                     } else {
1071                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1072                                          "Malformed UTF-8 character "
1073                                          "(unexpected non-continuation byte 0x%02x"
1074                                          ", immediately after start byte 0x%02x)"
1075                                          /* Dear diag.t, it's in the pod.  */
1076                                          "%s%s", c1, c,
1077                                          PL_op ? " in " : "",
1078                                          PL_op ? OP_DESC(PL_op) : "");
1079                         return -2;
1080                     }
1081                 } else {
1082                     if (PL_op)
1083                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1084                                          "%s in %s", unees, OP_DESC(PL_op));
1085                     else
1086                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
1087                     return -2; /* Really want to return undef :-)  */
1088                 }
1089             } else {
1090                 return -2;
1091             }
1092         }
1093         if (*b != c) {
1094             return *b < c ? -2 : +2;
1095         }
1096         ++b;
1097     }
1098
1099     if (b == bend && u == uend)
1100         return 0;
1101
1102     return b < bend ? +1 : -1;
1103 }
1104
1105 /*
1106 =for apidoc utf8_to_bytes
1107
1108 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
1109 Unlike L</bytes_to_utf8>, this over-writes the original string, and
1110 updates C<len> to contain the new length.
1111 Returns zero on failure, setting C<len> to -1.
1112
1113 If you need a copy of the string, see L</bytes_from_utf8>.
1114
1115 =cut
1116 */
1117
1118 U8 *
1119 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
1120 {
1121     U8 * const save = s;
1122     U8 * const send = s + *len;
1123     U8 *d;
1124
1125     PERL_ARGS_ASSERT_UTF8_TO_BYTES;
1126
1127     /* ensure valid UTF-8 and chars < 256 before updating string */
1128     while (s < send) {
1129         if (! UTF8_IS_INVARIANT(*s)) {
1130             if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
1131                 *len = ((STRLEN) -1);
1132                 return 0;
1133             }
1134             s++;
1135         }
1136         s++;
1137     }
1138
1139     d = s = save;
1140     while (s < send) {
1141         U8 c = *s++;
1142         if (! UTF8_IS_INVARIANT(c)) {
1143             /* Then it is two-byte encoded */
1144             c = TWO_BYTE_UTF8_TO_NATIVE(c, *s);
1145             s++;
1146         }
1147         *d++ = c;
1148     }
1149     *d = '\0';
1150     *len = d - save;
1151     return save;
1152 }
1153
1154 /*
1155 =for apidoc bytes_from_utf8
1156
1157 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
1158 Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, returns a pointer to
1159 the newly-created string, and updates C<len> to contain the new
1160 length.  Returns the original string if no conversion occurs, C<len>
1161 is unchanged.  Do nothing if C<is_utf8> points to 0.  Sets C<is_utf8> to
1162 0 if C<s> is converted or consisted entirely of characters that are invariant
1163 in utf8 (i.e., US-ASCII on non-EBCDIC machines).
1164
1165 =cut
1166 */
1167
1168 U8 *
1169 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
1170 {
1171     U8 *d;
1172     const U8 *start = s;
1173     const U8 *send;
1174     I32 count = 0;
1175
1176     PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
1177
1178     PERL_UNUSED_CONTEXT;
1179     if (!*is_utf8)
1180         return (U8 *)start;
1181
1182     /* ensure valid UTF-8 and chars < 256 before converting string */
1183     for (send = s + *len; s < send;) {
1184         if (! UTF8_IS_INVARIANT(*s)) {
1185             if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
1186                 return (U8 *)start;
1187             }
1188             count++;
1189             s++;
1190         }
1191         s++;
1192     }
1193
1194     *is_utf8 = FALSE;
1195
1196     Newx(d, (*len) - count + 1, U8);
1197     s = start; start = d;
1198     while (s < send) {
1199         U8 c = *s++;
1200         if (! UTF8_IS_INVARIANT(c)) {
1201             /* Then it is two-byte encoded */
1202             c = TWO_BYTE_UTF8_TO_NATIVE(c, *s);
1203             s++;
1204         }
1205         *d++ = c;
1206     }
1207     *d = '\0';
1208     *len = d - start;
1209     return (U8 *)start;
1210 }
1211
1212 /*
1213 =for apidoc bytes_to_utf8
1214
1215 Converts a string C<s> of length C<len> bytes from the native encoding into
1216 UTF-8.
1217 Returns a pointer to the newly-created string, and sets C<len> to
1218 reflect the new length in bytes.
1219
1220 A C<NUL> character will be written after the end of the string.
1221
1222 If you want to convert to UTF-8 from encodings other than
1223 the native (Latin1 or EBCDIC),
1224 see L</sv_recode_to_utf8>().
1225
1226 =cut
1227 */
1228
1229 /* This logic is duplicated in sv_catpvn_flags, so any bug fixes will
1230    likewise need duplication. */
1231
1232 U8*
1233 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
1234 {
1235     const U8 * const send = s + (*len);
1236     U8 *d;
1237     U8 *dst;
1238
1239     PERL_ARGS_ASSERT_BYTES_TO_UTF8;
1240     PERL_UNUSED_CONTEXT;
1241
1242     Newx(d, (*len) * 2 + 1, U8);
1243     dst = d;
1244
1245     while (s < send) {
1246         append_utf8_from_native_byte(*s, &d);
1247         s++;
1248     }
1249     *d = '\0';
1250     *len = d-dst;
1251     return dst;
1252 }
1253
1254 /*
1255  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
1256  *
1257  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
1258  * We optimize for native, for obvious reasons. */
1259
1260 U8*
1261 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
1262 {
1263     U8* pend;
1264     U8* dstart = d;
1265
1266     PERL_ARGS_ASSERT_UTF16_TO_UTF8;
1267
1268     if (bytelen & 1)
1269         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
1270
1271     pend = p + bytelen;
1272
1273     while (p < pend) {
1274         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
1275         p += 2;
1276         if (UNI_IS_INVARIANT(uv)) {
1277             *d++ = LATIN1_TO_NATIVE((U8) uv);
1278             continue;
1279         }
1280         if (uv <= MAX_UTF8_TWO_BYTE) {
1281             *d++ = UTF8_TWO_BYTE_HI(UNI_TO_NATIVE(uv));
1282             *d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv));
1283             continue;
1284         }
1285 #define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
1286 #define LAST_HIGH_SURROGATE  0xDBFF
1287 #define FIRST_LOW_SURROGATE  0xDC00
1288 #define LAST_LOW_SURROGATE   UNICODE_SURROGATE_LAST
1289         if (uv >= FIRST_HIGH_SURROGATE && uv <= LAST_HIGH_SURROGATE) {
1290             if (p >= pend) {
1291                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
1292             } else {
1293                 UV low = (p[0] << 8) + p[1];
1294                 p += 2;
1295                 if (low < FIRST_LOW_SURROGATE || low > LAST_LOW_SURROGATE)
1296                     Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
1297                 uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
1298                                        + (low - FIRST_LOW_SURROGATE) + 0x10000;
1299             }
1300         } else if (uv >= FIRST_LOW_SURROGATE && uv <= LAST_LOW_SURROGATE) {
1301             Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
1302         }
1303 #ifdef EBCDIC
1304         d = uvoffuni_to_utf8_flags(d, uv, 0);
1305 #else
1306         if (uv < 0x10000) {
1307             *d++ = (U8)(( uv >> 12)         | 0xe0);
1308             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
1309             *d++ = (U8)(( uv        & 0x3f) | 0x80);
1310             continue;
1311         }
1312         else {
1313             *d++ = (U8)(( uv >> 18)         | 0xf0);
1314             *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
1315             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
1316             *d++ = (U8)(( uv        & 0x3f) | 0x80);
1317             continue;
1318         }
1319 #endif
1320     }
1321     *newlen = d - dstart;
1322     return d;
1323 }
1324
1325 /* Note: this one is slightly destructive of the source. */
1326
1327 U8*
1328 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
1329 {
1330     U8* s = (U8*)p;
1331     U8* const send = s + bytelen;
1332
1333     PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
1334
1335     if (bytelen & 1)
1336         Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf,
1337                    (UV)bytelen);
1338
1339     while (s < send) {
1340         const U8 tmp = s[0];
1341         s[0] = s[1];
1342         s[1] = tmp;
1343         s += 2;
1344     }
1345     return utf16_to_utf8(p, d, bytelen, newlen);
1346 }
1347
1348 bool
1349 Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
1350 {
1351     U8 tmpbuf[UTF8_MAXBYTES+1];
1352     uvchr_to_utf8(tmpbuf, c);
1353     return _is_utf8_FOO(classnum, tmpbuf);
1354 }
1355
1356 /* Internal function so we can deprecate the external one, and call
1357    this one from other deprecated functions in this file */
1358
1359 bool
1360 Perl__is_utf8_idstart(pTHX_ const U8 *p)
1361 {
1362     dVAR;
1363     PERL_ARGS_ASSERT__IS_UTF8_IDSTART;
1364
1365     if (*p == '_')
1366         return TRUE;
1367     return is_utf8_common(p, &PL_utf8_idstart, "IdStart", NULL);
1368 }
1369
1370 bool
1371 Perl__is_uni_perl_idcont(pTHX_ UV c)
1372 {
1373     U8 tmpbuf[UTF8_MAXBYTES+1];
1374     uvchr_to_utf8(tmpbuf, c);
1375     return _is_utf8_perl_idcont(tmpbuf);
1376 }
1377
1378 bool
1379 Perl__is_uni_perl_idstart(pTHX_ UV c)
1380 {
1381     U8 tmpbuf[UTF8_MAXBYTES+1];
1382     uvchr_to_utf8(tmpbuf, c);
1383     return _is_utf8_perl_idstart(tmpbuf);
1384 }
1385
1386 UV
1387 Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s)
1388 {
1389     /* We have the latin1-range values compiled into the core, so just use
1390      * those, converting the result to utf8.  The only difference between upper
1391      * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
1392      * either "SS" or "Ss".  Which one to use is passed into the routine in
1393      * 'S_or_s' to avoid a test */
1394
1395     UV converted = toUPPER_LATIN1_MOD(c);
1396
1397     PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
1398
1399     assert(S_or_s == 'S' || S_or_s == 's');
1400
1401     if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
1402                                              characters in this range */
1403         *p = (U8) converted;
1404         *lenp = 1;
1405         return converted;
1406     }
1407
1408     /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
1409      * which it maps to one of them, so as to only have to have one check for
1410      * it in the main case */
1411     if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
1412         switch (c) {
1413             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
1414                 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
1415                 break;
1416             case MICRO_SIGN:
1417                 converted = GREEK_CAPITAL_LETTER_MU;
1418                 break;
1419             case LATIN_SMALL_LETTER_SHARP_S:
1420                 *(p)++ = 'S';
1421                 *p = S_or_s;
1422                 *lenp = 2;
1423                 return 'S';
1424             default:
1425                 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
1426                 assert(0); /* NOTREACHED */
1427         }
1428     }
1429
1430     *(p)++ = UTF8_TWO_BYTE_HI(converted);
1431     *p = UTF8_TWO_BYTE_LO(converted);
1432     *lenp = 2;
1433
1434     return converted;
1435 }
1436
1437 /* Call the function to convert a UTF-8 encoded character to the specified case.
1438  * Note that there may be more than one character in the result.
1439  * INP is a pointer to the first byte of the input character
1440  * OUTP will be set to the first byte of the string of changed characters.  It
1441  *      needs to have space for UTF8_MAXBYTES_CASE+1 bytes
1442  * LENP will be set to the length in bytes of the string of changed characters
1443  *
1444  * The functions return the ordinal of the first character in the string of OUTP */
1445 #define CALL_UPPER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_toupper, "ToUc", "")
1446 #define CALL_TITLE_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_totitle, "ToTc", "")
1447 #define CALL_LOWER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tolower, "ToLc", "")
1448
1449 /* This additionally has the input parameter SPECIALS, which if non-zero will
1450  * cause this to use the SPECIALS hash for folding (meaning get full case
1451  * folding); otherwise, when zero, this implies a simple case fold */
1452 #define CALL_FOLD_CASE(INP, OUTP, LENP, SPECIALS) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tofold, "ToCf", (SPECIALS) ? "" : NULL)
1453
1454 UV
1455 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1456 {
1457     dVAR;
1458
1459     /* Convert the Unicode character whose ordinal is <c> to its uppercase
1460      * version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
1461      * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1462      * the changed version may be longer than the original character.
1463      *
1464      * The ordinal of the first character of the changed version is returned
1465      * (but note, as explained above, that there may be more.) */
1466
1467     PERL_ARGS_ASSERT_TO_UNI_UPPER;
1468
1469     if (c < 256) {
1470         return _to_upper_title_latin1((U8) c, p, lenp, 'S');
1471     }
1472
1473     uvchr_to_utf8(p, c);
1474     return CALL_UPPER_CASE(p, p, lenp);
1475 }
1476
1477 UV
1478 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1479 {
1480     dVAR;
1481
1482     PERL_ARGS_ASSERT_TO_UNI_TITLE;
1483
1484     if (c < 256) {
1485         return _to_upper_title_latin1((U8) c, p, lenp, 's');
1486     }
1487
1488     uvchr_to_utf8(p, c);
1489     return CALL_TITLE_CASE(p, p, lenp);
1490 }
1491
1492 STATIC U8
1493 S_to_lower_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp)
1494 {
1495     /* We have the latin1-range values compiled into the core, so just use
1496      * those, converting the result to utf8.  Since the result is always just
1497      * one character, we allow <p> to be NULL */
1498
1499     U8 converted = toLOWER_LATIN1(c);
1500
1501     if (p != NULL) {
1502         if (NATIVE_BYTE_IS_INVARIANT(converted)) {
1503             *p = converted;
1504             *lenp = 1;
1505         }
1506         else {
1507             *p = UTF8_TWO_BYTE_HI(converted);
1508             *(p+1) = UTF8_TWO_BYTE_LO(converted);
1509             *lenp = 2;
1510         }
1511     }
1512     return converted;
1513 }
1514
1515 UV
1516 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1517 {
1518     dVAR;
1519
1520     PERL_ARGS_ASSERT_TO_UNI_LOWER;
1521
1522     if (c < 256) {
1523         return to_lower_latin1((U8) c, p, lenp);
1524     }
1525
1526     uvchr_to_utf8(p, c);
1527     return CALL_LOWER_CASE(p, p, lenp);
1528 }
1529
1530 UV
1531 Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
1532 {
1533     /* Corresponds to to_lower_latin1(); <flags> bits meanings:
1534      *      FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
1535      *      FOLD_FLAGS_FULL  iff full folding is to be used;
1536      *
1537      *  Not to be used for locale folds
1538      */
1539
1540     UV converted;
1541
1542     PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
1543
1544     assert (! (flags & FOLD_FLAGS_LOCALE));
1545
1546     if (c == MICRO_SIGN) {
1547         converted = GREEK_SMALL_LETTER_MU;
1548     }
1549     else if ((flags & FOLD_FLAGS_FULL) && c == LATIN_SMALL_LETTER_SHARP_S) {
1550
1551         /* If can't cross 127/128 boundary, can't return "ss"; instead return
1552          * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
1553          * under those circumstances. */
1554         if (flags & FOLD_FLAGS_NOMIX_ASCII) {
1555             *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
1556             Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
1557                  p, *lenp, U8);
1558             return LATIN_SMALL_LETTER_LONG_S;
1559         }
1560         else {
1561             *(p)++ = 's';
1562             *p = 's';
1563             *lenp = 2;
1564             return 's';
1565         }
1566     }
1567     else { /* In this range the fold of all other characters is their lower
1568               case */
1569         converted = toLOWER_LATIN1(c);
1570     }
1571
1572     if (UVCHR_IS_INVARIANT(converted)) {
1573         *p = (U8) converted;
1574         *lenp = 1;
1575     }
1576     else {
1577         *(p)++ = UTF8_TWO_BYTE_HI(converted);
1578         *p = UTF8_TWO_BYTE_LO(converted);
1579         *lenp = 2;
1580     }
1581
1582     return converted;
1583 }
1584
1585 UV
1586 Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
1587 {
1588
1589     /* Not currently externally documented, and subject to change
1590      *  <flags> bits meanings:
1591      *      FOLD_FLAGS_FULL  iff full folding is to be used;
1592      *      FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
1593      *                        locale are to be used.
1594      *      FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
1595      */
1596
1597     PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
1598
1599     /* Tread a UTF-8 locale as not being in locale at all */
1600     if (IN_UTF8_CTYPE_LOCALE) {
1601         flags &= ~FOLD_FLAGS_LOCALE;
1602     }
1603
1604     if (c < 256) {
1605         UV result = _to_fold_latin1((U8) c, p, lenp,
1606                             flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
1607         /* It is illegal for the fold to cross the 255/256 boundary under
1608          * locale; in this case return the original */
1609         return (result > 256 && flags & FOLD_FLAGS_LOCALE)
1610                ? c
1611                : result;
1612     }
1613
1614     /* If no special needs, just use the macro */
1615     if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
1616         uvchr_to_utf8(p, c);
1617         return CALL_FOLD_CASE(p, p, lenp, flags & FOLD_FLAGS_FULL);
1618     }
1619     else {  /* Otherwise, _to_utf8_fold_flags has the intelligence to deal with
1620                the special flags. */
1621         U8 utf8_c[UTF8_MAXBYTES + 1];
1622         uvchr_to_utf8(utf8_c, c);
1623         return _to_utf8_fold_flags(utf8_c, p, lenp, flags);
1624     }
1625 }
1626
1627 PERL_STATIC_INLINE bool
1628 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
1629                  const char *const swashname, SV* const invlist)
1630 {
1631     /* returns a boolean giving whether or not the UTF8-encoded character that
1632      * starts at <p> is in the swash indicated by <swashname>.  <swash>
1633      * contains a pointer to where the swash indicated by <swashname>
1634      * is to be stored; which this routine will do, so that future calls will
1635      * look at <*swash> and only generate a swash if it is not null.  <invlist>
1636      * is NULL or an inversion list that defines the swash.  If not null, it
1637      * saves time during initialization of the swash.
1638      *
1639      * Note that it is assumed that the buffer length of <p> is enough to
1640      * contain all the bytes that comprise the character.  Thus, <*p> should
1641      * have been checked before this call for mal-formedness enough to assure
1642      * that. */
1643
1644     dVAR;
1645
1646     PERL_ARGS_ASSERT_IS_UTF8_COMMON;
1647
1648     /* The API should have included a length for the UTF-8 character in <p>,
1649      * but it doesn't.  We therefore assume that p has been validated at least
1650      * as far as there being enough bytes available in it to accommodate the
1651      * character without reading beyond the end, and pass that number on to the
1652      * validating routine */
1653     if (! isUTF8_CHAR(p, p + UTF8SKIP(p))) {
1654         if (ckWARN_d(WARN_UTF8)) {
1655             Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED,WARN_UTF8),
1656                     "Passing malformed UTF-8 to \"%s\" is deprecated", swashname);
1657             if (ckWARN(WARN_UTF8)) {    /* This will output details as to the
1658                                            what the malformation is */
1659                 utf8_to_uvchr_buf(p, p + UTF8SKIP(p), NULL);
1660             }
1661         }
1662         return FALSE;
1663     }
1664     if (!*swash) {
1665         U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
1666         *swash = _core_swash_init("utf8",
1667
1668                                   /* Only use the name if there is no inversion
1669                                    * list; otherwise will go out to disk */
1670                                   (invlist) ? "" : swashname,
1671
1672                                   &PL_sv_undef, 1, 0, invlist, &flags);
1673     }
1674
1675     return swash_fetch(*swash, p, TRUE) != 0;
1676 }
1677
1678 bool
1679 Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p)
1680 {
1681     dVAR;
1682
1683     PERL_ARGS_ASSERT__IS_UTF8_FOO;
1684
1685     assert(classnum < _FIRST_NON_SWASH_CC);
1686
1687     return is_utf8_common(p,
1688                           &PL_utf8_swash_ptrs[classnum],
1689                           swash_property_names[classnum],
1690                           PL_XPosix_ptrs[classnum]);
1691 }
1692
1693 bool
1694 Perl__is_utf8_perl_idstart(pTHX_ const U8 *p)
1695 {
1696     dVAR;
1697     SV* invlist = NULL;
1698
1699     PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
1700
1701     if (! PL_utf8_perl_idstart) {
1702         invlist = _new_invlist_C_array(_Perl_IDStart_invlist);
1703     }
1704     return is_utf8_common(p, &PL_utf8_perl_idstart, "", invlist);
1705 }
1706
1707 bool
1708 Perl__is_utf8_xidstart(pTHX_ const U8 *p)
1709 {
1710     dVAR;
1711
1712     PERL_ARGS_ASSERT__IS_UTF8_XIDSTART;
1713
1714     if (*p == '_')
1715         return TRUE;
1716     return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart", NULL);
1717 }
1718
1719 bool
1720 Perl__is_utf8_perl_idcont(pTHX_ const U8 *p)
1721 {
1722     dVAR;
1723     SV* invlist = NULL;
1724
1725     PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT;
1726
1727     if (! PL_utf8_perl_idcont) {
1728         invlist = _new_invlist_C_array(_Perl_IDCont_invlist);
1729     }
1730     return is_utf8_common(p, &PL_utf8_perl_idcont, "", invlist);
1731 }
1732
1733 bool
1734 Perl__is_utf8_idcont(pTHX_ const U8 *p)
1735 {
1736     dVAR;
1737
1738     PERL_ARGS_ASSERT__IS_UTF8_IDCONT;
1739
1740     return is_utf8_common(p, &PL_utf8_idcont, "IdContinue", NULL);
1741 }
1742
1743 bool
1744 Perl__is_utf8_xidcont(pTHX_ const U8 *p)
1745 {
1746     dVAR;
1747
1748     PERL_ARGS_ASSERT__IS_UTF8_XIDCONT;
1749
1750     return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue", NULL);
1751 }
1752
1753 bool
1754 Perl__is_utf8_mark(pTHX_ const U8 *p)
1755 {
1756     dVAR;
1757
1758     PERL_ARGS_ASSERT__IS_UTF8_MARK;
1759
1760     return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL);
1761 }
1762
1763 /*
1764 =for apidoc to_utf8_case
1765
1766 C<p> contains the pointer to the UTF-8 string encoding
1767 the character that is being converted.  This routine assumes that the character
1768 at C<p> is well-formed.
1769
1770 C<ustrp> is a pointer to the character buffer to put the
1771 conversion result to.  C<lenp> is a pointer to the length
1772 of the result.
1773
1774 C<swashp> is a pointer to the swash to use.
1775
1776 Both the special and normal mappings are stored in F<lib/unicore/To/Foo.pl>,
1777 and loaded by SWASHNEW, using F<lib/utf8_heavy.pl>.  C<special> (usually,
1778 but not always, a multicharacter mapping), is tried first.
1779
1780 C<special> is a string, normally C<NULL> or C<"">.  C<NULL> means to not use
1781 any special mappings; C<""> means to use the special mappings.  Values other
1782 than these two are treated as the name of the hash containing the special
1783 mappings, like C<"utf8::ToSpecLower">.
1784
1785 C<normal> is a string like "ToLower" which means the swash
1786 %utf8::ToLower.
1787
1788 =cut */
1789
1790 UV
1791 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1792                         SV **swashp, const char *normal, const char *special)
1793 {
1794     dVAR;
1795     STRLEN len = 0;
1796     const UV uv1 = valid_utf8_to_uvchr(p, NULL);
1797
1798     PERL_ARGS_ASSERT_TO_UTF8_CASE;
1799
1800     /* Note that swash_fetch() doesn't output warnings for these because it
1801      * assumes we will */
1802     if (uv1 >= UNICODE_SURROGATE_FIRST) {
1803         if (uv1 <= UNICODE_SURROGATE_LAST) {
1804             if (ckWARN_d(WARN_SURROGATE)) {
1805                 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
1806                 Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
1807                     "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1);
1808             }
1809         }
1810         else if (UNICODE_IS_SUPER(uv1)) {
1811             if (ckWARN_d(WARN_NON_UNICODE)) {
1812                 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
1813                 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
1814                     "Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1);
1815             }
1816         }
1817
1818         /* Note that non-characters are perfectly legal, so no warning should
1819          * be given */
1820     }
1821
1822     if (!*swashp) /* load on-demand */
1823          *swashp = _core_swash_init("utf8", normal, &PL_sv_undef, 4, 0, NULL, NULL);
1824
1825     if (special) {
1826          /* It might be "special" (sometimes, but not always,
1827           * a multicharacter mapping) */
1828          HV *hv = NULL;
1829          SV **svp;
1830
1831          /* If passed in the specials name, use that; otherwise use any
1832           * given in the swash */
1833          if (*special != '\0') {
1834             hv = get_hv(special, 0);
1835         }
1836         else {
1837             svp = hv_fetchs(MUTABLE_HV(SvRV(*swashp)), "SPECIALS", 0);
1838             if (svp) {
1839                 hv = MUTABLE_HV(SvRV(*svp));
1840             }
1841         }
1842
1843          if (hv
1844              && (svp = hv_fetch(hv, (const char*)p, UNISKIP(uv1), FALSE))
1845              && (*svp))
1846          {
1847              const char *s;
1848
1849               s = SvPV_const(*svp, len);
1850               if (len == 1)
1851                   /* EIGHTBIT */
1852                    len = uvchr_to_utf8(ustrp, *(U8*)s) - ustrp;
1853               else {
1854                    Copy(s, ustrp, len, U8);
1855               }
1856          }
1857     }
1858
1859     if (!len && *swashp) {
1860         const UV uv2 = swash_fetch(*swashp, p, TRUE /* => is utf8 */);
1861
1862          if (uv2) {
1863               /* It was "normal" (a single character mapping). */
1864               len = uvchr_to_utf8(ustrp, uv2) - ustrp;
1865          }
1866     }
1867
1868     if (len) {
1869         if (lenp) {
1870             *lenp = len;
1871         }
1872         return valid_utf8_to_uvchr(ustrp, 0);
1873     }
1874
1875     /* Here, there was no mapping defined, which means that the code point maps
1876      * to itself.  Return the inputs */
1877     len = UTF8SKIP(p);
1878     if (p != ustrp) {   /* Don't copy onto itself */
1879         Copy(p, ustrp, len, U8);
1880     }
1881
1882     if (lenp)
1883          *lenp = len;
1884
1885     return uv1;
1886
1887 }
1888
1889 STATIC UV
1890 S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
1891 {
1892     /* This is called when changing the case of a utf8-encoded character above
1893      * the Latin1 range, and the operation is in a non-UTF-8 locale.  If the
1894      * result contains a character that crosses the 255/256 boundary, disallow
1895      * the change, and return the original code point.  See L<perlfunc/lc> for
1896      * why;
1897      *
1898      * p        points to the original string whose case was changed; assumed
1899      *          by this routine to be well-formed
1900      * result   the code point of the first character in the changed-case string
1901      * ustrp    points to the changed-case string (<result> represents its first char)
1902      * lenp     points to the length of <ustrp> */
1903
1904     UV original;    /* To store the first code point of <p> */
1905
1906     PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
1907
1908     assert(UTF8_IS_ABOVE_LATIN1(*p));
1909
1910     /* We know immediately if the first character in the string crosses the
1911      * boundary, so can skip */
1912     if (result > 255) {
1913
1914         /* Look at every character in the result; if any cross the
1915         * boundary, the whole thing is disallowed */
1916         U8* s = ustrp + UTF8SKIP(ustrp);
1917         U8* e = ustrp + *lenp;
1918         while (s < e) {
1919             if (! UTF8_IS_ABOVE_LATIN1(*s)) {
1920                 goto bad_crossing;
1921             }
1922             s += UTF8SKIP(s);
1923         }
1924
1925         /* Here, no characters crossed, result is ok as-is */
1926         return result;
1927     }
1928
1929 bad_crossing:
1930
1931     /* Failed, have to return the original */
1932     original = valid_utf8_to_uvchr(p, lenp);
1933     Copy(p, ustrp, *lenp, char);
1934     return original;
1935 }
1936
1937 /*
1938 =for apidoc to_utf8_upper
1939
1940 Instead use L</toUPPER_utf8>.
1941
1942 =cut */
1943
1944 /* Not currently externally documented, and subject to change:
1945  * <flags> is set iff iff the rules from the current underlying locale are to
1946  *         be used. */
1947
1948 UV
1949 Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
1950 {
1951     dVAR;
1952
1953     UV result;
1954
1955     PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
1956
1957     if (flags && IN_UTF8_CTYPE_LOCALE) {
1958         flags = FALSE;
1959     }
1960
1961     if (UTF8_IS_INVARIANT(*p)) {
1962         if (flags) {
1963             result = toUPPER_LC(*p);
1964         }
1965         else {
1966             return _to_upper_title_latin1(*p, ustrp, lenp, 'S');
1967         }
1968     }
1969     else if UTF8_IS_DOWNGRADEABLE_START(*p) {
1970         if (flags) {
1971             U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
1972             result = toUPPER_LC(c);
1973         }
1974         else {
1975             return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
1976                                           ustrp, lenp, 'S');
1977         }
1978     }
1979     else {  /* utf8, ord above 255 */
1980         result = CALL_UPPER_CASE(p, ustrp, lenp);
1981
1982         if (flags) {
1983             result = check_locale_boundary_crossing(p, result, ustrp, lenp);
1984         }
1985         return result;
1986     }
1987
1988     /* Here, used locale rules.  Convert back to utf8 */
1989     if (UTF8_IS_INVARIANT(result)) {
1990         *ustrp = (U8) result;
1991         *lenp = 1;
1992     }
1993     else {
1994         *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
1995         *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
1996         *lenp = 2;
1997     }
1998
1999     return result;
2000 }
2001
2002 /*
2003 =for apidoc to_utf8_title
2004
2005 Instead use L</toTITLE_utf8>.
2006
2007 =cut */
2008
2009 /* Not currently externally documented, and subject to change:
2010  * <flags> is set iff the rules from the current underlying locale are to be
2011  *         used.  Since titlecase is not defined in POSIX, for other than a
2012  *         UTF-8 locale, uppercase is used instead for code points < 256.
2013  */
2014
2015 UV
2016 Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
2017 {
2018     dVAR;
2019
2020     UV result;
2021
2022     PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
2023
2024     if (flags && IN_UTF8_CTYPE_LOCALE) {
2025         flags = FALSE;
2026     }
2027
2028     if (UTF8_IS_INVARIANT(*p)) {
2029         if (flags) {
2030             result = toUPPER_LC(*p);
2031         }
2032         else {
2033             return _to_upper_title_latin1(*p, ustrp, lenp, 's');
2034         }
2035     }
2036     else if UTF8_IS_DOWNGRADEABLE_START(*p) {
2037         if (flags) {
2038             U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
2039             result = toUPPER_LC(c);
2040         }
2041         else {
2042             return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
2043                                           ustrp, lenp, 's');
2044         }
2045     }
2046     else {  /* utf8, ord above 255 */
2047         result = CALL_TITLE_CASE(p, ustrp, lenp);
2048
2049         if (flags) {
2050             result = check_locale_boundary_crossing(p, result, ustrp, lenp);
2051         }
2052         return result;
2053     }
2054
2055     /* Here, used locale rules.  Convert back to utf8 */
2056     if (UTF8_IS_INVARIANT(result)) {
2057         *ustrp = (U8) result;
2058         *lenp = 1;
2059     }
2060     else {
2061         *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2062         *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
2063         *lenp = 2;
2064     }
2065
2066     return result;
2067 }
2068
2069 /*
2070 =for apidoc to_utf8_lower
2071
2072 Instead use L</toLOWER_utf8>.
2073
2074 =cut */
2075
2076 /* Not currently externally documented, and subject to change:
2077  * <flags> is set iff iff the rules from the current underlying locale are to
2078  *         be used.
2079  */
2080
2081 UV
2082 Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
2083 {
2084     UV result;
2085
2086     dVAR;
2087
2088     PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
2089
2090     if (flags && IN_UTF8_CTYPE_LOCALE) {
2091         flags = FALSE;
2092     }
2093
2094     if (UTF8_IS_INVARIANT(*p)) {
2095         if (flags) {
2096             result = toLOWER_LC(*p);
2097         }
2098         else {
2099             return to_lower_latin1(*p, ustrp, lenp);
2100         }
2101     }
2102     else if UTF8_IS_DOWNGRADEABLE_START(*p) {
2103         if (flags) {
2104             U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
2105             result = toLOWER_LC(c);
2106         }
2107         else {
2108             return to_lower_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
2109                                    ustrp, lenp);
2110         }
2111     }
2112     else {  /* utf8, ord above 255 */
2113         result = CALL_LOWER_CASE(p, ustrp, lenp);
2114
2115         if (flags) {
2116             result = check_locale_boundary_crossing(p, result, ustrp, lenp);
2117         }
2118
2119         return result;
2120     }
2121
2122     /* Here, used locale rules.  Convert back to utf8 */
2123     if (UTF8_IS_INVARIANT(result)) {
2124         *ustrp = (U8) result;
2125         *lenp = 1;
2126     }
2127     else {
2128         *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2129         *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
2130         *lenp = 2;
2131     }
2132
2133     return result;
2134 }
2135
2136 /*
2137 =for apidoc to_utf8_fold
2138
2139 Instead use L</toFOLD_utf8>.
2140
2141 =cut */
2142
2143 /* Not currently externally documented, and subject to change,
2144  * in <flags>
2145  *      bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
2146  *                            locale are to be used.
2147  *      bit FOLD_FLAGS_FULL   is set iff full case folds are to be used;
2148  *                            otherwise simple folds
2149  *      bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
2150  *                            prohibited
2151  */
2152
2153 UV
2154 Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
2155 {
2156     dVAR;
2157
2158     UV result;
2159
2160     PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
2161
2162     /* These are mutually exclusive */
2163     assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
2164
2165     assert(p != ustrp); /* Otherwise overwrites */
2166
2167     if (flags & FOLD_FLAGS_LOCALE && IN_UTF8_CTYPE_LOCALE) {
2168         flags &= ~FOLD_FLAGS_LOCALE;
2169     }
2170
2171     if (UTF8_IS_INVARIANT(*p)) {
2172         if (flags & FOLD_FLAGS_LOCALE) {
2173             result = toFOLD_LC(*p);
2174         }
2175         else {
2176             return _to_fold_latin1(*p, ustrp, lenp,
2177                             flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
2178         }
2179     }
2180     else if UTF8_IS_DOWNGRADEABLE_START(*p) {
2181         if (flags & FOLD_FLAGS_LOCALE) {
2182             U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
2183             result = toFOLD_LC(c);
2184         }
2185         else {
2186             return _to_fold_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
2187                             ustrp, lenp,
2188                             flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
2189         }
2190     }
2191     else {  /* utf8, ord above 255 */
2192         result = CALL_FOLD_CASE(p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
2193
2194         if (flags & FOLD_FLAGS_LOCALE) {
2195
2196             /* Special case these two characters, as what normally gets
2197              * returned under locale doesn't work */
2198             if (UTF8SKIP(p) == sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1
2199                 && memEQ((char *) p, LATIN_CAPITAL_LETTER_SHARP_S_UTF8,
2200                           sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1))
2201             {
2202                 goto return_long_s;
2203             }
2204             else if (UTF8SKIP(p) == sizeof(LATIN_SMALL_LIGATURE_LONG_S_T) - 1
2205                 && memEQ((char *) p, LATIN_SMALL_LIGATURE_LONG_S_T_UTF8,
2206                           sizeof(LATIN_SMALL_LIGATURE_LONG_S_T_UTF8) - 1))
2207             {
2208                 goto return_ligature_st;
2209             }
2210             return check_locale_boundary_crossing(p, result, ustrp, lenp);
2211         }
2212         else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
2213             return result;
2214         }
2215         else {
2216             /* This is called when changing the case of a utf8-encoded
2217              * character above the ASCII range, and the result should not
2218              * contain an ASCII character. */
2219
2220             UV original;    /* To store the first code point of <p> */
2221
2222             /* Look at every character in the result; if any cross the
2223             * boundary, the whole thing is disallowed */
2224             U8* s = ustrp;
2225             U8* e = ustrp + *lenp;
2226             while (s < e) {
2227                 if (isASCII(*s)) {
2228                     /* Crossed, have to return the original */
2229                     original = valid_utf8_to_uvchr(p, lenp);
2230
2231                     /* But in these instances, there is an alternative we can
2232                      * return that is valid */
2233                     if (original == LATIN_CAPITAL_LETTER_SHARP_S
2234                         || original == LATIN_SMALL_LETTER_SHARP_S)
2235                     {
2236                         goto return_long_s;
2237                     }
2238                     else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
2239                         goto return_ligature_st;
2240                     }
2241                     Copy(p, ustrp, *lenp, char);
2242                     return original;
2243                 }
2244                 s += UTF8SKIP(s);
2245             }
2246
2247             /* Here, no characters crossed, result is ok as-is */
2248             return result;
2249         }
2250     }
2251
2252     /* Here, used locale rules.  Convert back to utf8 */
2253     if (UTF8_IS_INVARIANT(result)) {
2254         *ustrp = (U8) result;
2255         *lenp = 1;
2256     }
2257     else {
2258         *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2259         *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
2260         *lenp = 2;
2261     }
2262
2263     return result;
2264
2265   return_long_s:
2266     /* Certain folds to 'ss' are prohibited by the options, but they do allow
2267      * folds to a string of two of these characters.  By returning this
2268      * instead, then, e.g.,
2269      *      fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
2270      * works. */
2271
2272     *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
2273     Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
2274         ustrp, *lenp, U8);
2275     return LATIN_SMALL_LETTER_LONG_S;
2276
2277   return_ligature_st:
2278     /* Two folds to 'st' are prohibited by the options; instead we pick one and
2279      * have the other one fold to it */
2280
2281     *lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1;
2282     Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
2283     return LATIN_SMALL_LIGATURE_ST;
2284 }
2285
2286 /* Note:
2287  * Returns a "swash" which is a hash described in utf8.c:Perl_swash_fetch().
2288  * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
2289  * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
2290  */
2291
2292 SV*
2293 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
2294 {
2295     PERL_ARGS_ASSERT_SWASH_INIT;
2296
2297     /* Returns a copy of a swash initiated by the called function.  This is the
2298      * public interface, and returning a copy prevents others from doing
2299      * mischief on the original */
2300
2301     return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none, NULL, NULL));
2302 }
2303
2304 SV*
2305 Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none, SV* invlist, U8* const flags_p)
2306 {
2307     /* Initialize and return a swash, creating it if necessary.  It does this
2308      * by calling utf8_heavy.pl in the general case.  The returned value may be
2309      * the swash's inversion list instead if the input parameters allow it.
2310      * Which is returned should be immaterial to callers, as the only
2311      * operations permitted on a swash, swash_fetch(), _get_swash_invlist(),
2312      * and swash_to_invlist() handle both these transparently.
2313      *
2314      * This interface should only be used by functions that won't destroy or
2315      * adversely change the swash, as doing so affects all other uses of the
2316      * swash in the program; the general public should use 'Perl_swash_init'
2317      * instead.
2318      *
2319      * pkg  is the name of the package that <name> should be in.
2320      * name is the name of the swash to find.  Typically it is a Unicode
2321      *      property name, including user-defined ones
2322      * listsv is a string to initialize the swash with.  It must be of the form
2323      *      documented as the subroutine return value in
2324      *      L<perlunicode/User-Defined Character Properties>
2325      * minbits is the number of bits required to represent each data element.
2326      *      It is '1' for binary properties.
2327      * none I (khw) do not understand this one, but it is used only in tr///.
2328      * invlist is an inversion list to initialize the swash with (or NULL)
2329      * flags_p if non-NULL is the address of various input and output flag bits
2330      *      to the routine, as follows:  ('I' means is input to the routine;
2331      *      'O' means output from the routine.  Only flags marked O are
2332      *      meaningful on return.)
2333      *  _CORE_SWASH_INIT_USER_DEFINED_PROPERTY indicates if the swash
2334      *      came from a user-defined property.  (I O)
2335      *  _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking
2336      *      when the swash cannot be located, to simply return NULL. (I)
2337      *  _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a
2338      *      return of an inversion list instead of a swash hash if this routine
2339      *      thinks that would result in faster execution of swash_fetch() later
2340      *      on. (I)
2341      *
2342      * Thus there are three possible inputs to find the swash: <name>,
2343      * <listsv>, and <invlist>.  At least one must be specified.  The result
2344      * will be the union of the specified ones, although <listsv>'s various
2345      * actions can intersect, etc. what <name> gives.  To avoid going out to
2346      * disk at all, <invlist> should specify completely what the swash should
2347      * have, and <listsv> should be &PL_sv_undef and <name> should be "".
2348      *
2349      * <invlist> is only valid for binary properties */
2350
2351     dVAR;
2352     SV* retval = &PL_sv_undef;
2353     HV* swash_hv = NULL;
2354     const int invlist_swash_boundary =
2355         (flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST)
2356         ? 512    /* Based on some benchmarking, but not extensive, see commit
2357                     message */
2358         : -1;   /* Never return just an inversion list */
2359
2360     assert(listsv != &PL_sv_undef || strNE(name, "") || invlist);
2361     assert(! invlist || minbits == 1);
2362
2363     /* If data was passed in to go out to utf8_heavy to find the swash of, do
2364      * so */
2365     if (listsv != &PL_sv_undef || strNE(name, "")) {
2366         dSP;
2367         const size_t pkg_len = strlen(pkg);
2368         const size_t name_len = strlen(name);
2369         HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
2370         SV* errsv_save;
2371         GV *method;
2372
2373         PERL_ARGS_ASSERT__CORE_SWASH_INIT;
2374
2375         PUSHSTACKi(PERLSI_MAGIC);
2376         ENTER;
2377         SAVEHINTS();
2378         save_re_context();
2379         /* We might get here via a subroutine signature which uses a utf8
2380          * parameter name, at which point PL_subname will have been set
2381          * but not yet used. */
2382         save_item(PL_subname);
2383         if (PL_parser && PL_parser->error_count)
2384             SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
2385         method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
2386         if (!method) {  /* demand load utf8 */
2387             ENTER;
2388             if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
2389             GvSV(PL_errgv) = NULL;
2390             /* It is assumed that callers of this routine are not passing in
2391              * any user derived data.  */
2392             /* Need to do this after save_re_context() as it will set
2393              * PL_tainted to 1 while saving $1 etc (see the code after getrx:
2394              * in Perl_magic_get).  Even line to create errsv_save can turn on
2395              * PL_tainted.  */
2396 #ifndef NO_TAINT_SUPPORT
2397             SAVEBOOL(TAINT_get);
2398             TAINT_NOT;
2399 #endif
2400             Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
2401                              NULL);
2402             {
2403                 /* Not ERRSV, as there is no need to vivify a scalar we are
2404                    about to discard. */
2405                 SV * const errsv = GvSV(PL_errgv);
2406                 if (!SvTRUE(errsv)) {
2407                     GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
2408                     SvREFCNT_dec(errsv);
2409                 }
2410             }
2411             LEAVE;
2412         }
2413         SPAGAIN;
2414         PUSHMARK(SP);
2415         EXTEND(SP,5);
2416         mPUSHp(pkg, pkg_len);
2417         mPUSHp(name, name_len);
2418         PUSHs(listsv);
2419         mPUSHi(minbits);
2420         mPUSHi(none);
2421         PUTBACK;
2422         if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
2423         GvSV(PL_errgv) = NULL;
2424         /* If we already have a pointer to the method, no need to use
2425          * call_method() to repeat the lookup.  */
2426         if (method
2427             ? call_sv(MUTABLE_SV(method), G_SCALAR)
2428             : call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR | G_METHOD))
2429         {
2430             retval = *PL_stack_sp--;
2431             SvREFCNT_inc(retval);
2432         }
2433         {
2434             /* Not ERRSV.  See above. */
2435             SV * const errsv = GvSV(PL_errgv);
2436             if (!SvTRUE(errsv)) {
2437                 GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
2438                 SvREFCNT_dec(errsv);
2439             }
2440         }
2441         LEAVE;
2442         POPSTACK;
2443         if (IN_PERL_COMPILETIME) {
2444             CopHINTS_set(PL_curcop, PL_hints);
2445         }
2446         if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
2447             if (SvPOK(retval))
2448
2449                 /* If caller wants to handle missing properties, let them */
2450                 if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) {
2451                     return NULL;
2452                 }
2453                 Perl_croak(aTHX_
2454                            "Can't find Unicode property definition \"%"SVf"\"",
2455                            SVfARG(retval));
2456                 NOT_REACHED; /* NOTREACHED */
2457         }
2458     } /* End of calling the module to find the swash */
2459
2460     /* If this operation fetched a swash, and we will need it later, get it */
2461     if (retval != &PL_sv_undef
2462         && (minbits == 1 || (flags_p
2463                             && ! (*flags_p
2464                                   & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY))))
2465     {
2466         swash_hv = MUTABLE_HV(SvRV(retval));
2467
2468         /* If we don't already know that there is a user-defined component to
2469          * this swash, and the user has indicated they wish to know if there is
2470          * one (by passing <flags_p>), find out */
2471         if (flags_p && ! (*flags_p & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)) {
2472             SV** user_defined = hv_fetchs(swash_hv, "USER_DEFINED", FALSE);
2473             if (user_defined && SvUV(*user_defined)) {
2474                 *flags_p |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
2475             }
2476         }
2477     }
2478
2479     /* Make sure there is an inversion list for binary properties */
2480     if (minbits == 1) {
2481         SV** swash_invlistsvp = NULL;
2482         SV* swash_invlist = NULL;
2483         bool invlist_in_swash_is_valid = FALSE;
2484         bool swash_invlist_unclaimed = FALSE; /* whether swash_invlist has
2485                                             an unclaimed reference count */
2486
2487         /* If this operation fetched a swash, get its already existing
2488          * inversion list, or create one for it */
2489
2490         if (swash_hv) {
2491             swash_invlistsvp = hv_fetchs(swash_hv, "V", FALSE);
2492             if (swash_invlistsvp) {
2493                 swash_invlist = *swash_invlistsvp;
2494                 invlist_in_swash_is_valid = TRUE;
2495             }
2496             else {
2497                 swash_invlist = _swash_to_invlist(retval);
2498                 swash_invlist_unclaimed = TRUE;
2499             }
2500         }
2501
2502         /* If an inversion list was passed in, have to include it */
2503         if (invlist) {
2504
2505             /* Any fetched swash will by now have an inversion list in it;
2506              * otherwise <swash_invlist>  will be NULL, indicating that we
2507              * didn't fetch a swash */
2508             if (swash_invlist) {
2509
2510                 /* Add the passed-in inversion list, which invalidates the one
2511                  * already stored in the swash */
2512                 invlist_in_swash_is_valid = FALSE;
2513                 _invlist_union(invlist, swash_invlist, &swash_invlist);
2514             }
2515             else {
2516
2517                 /* Here, there is no swash already.  Set up a minimal one, if
2518                  * we are going to return a swash */
2519                 if ((int) _invlist_len(invlist) > invlist_swash_boundary) {
2520                     swash_hv = newHV();
2521                     retval = newRV_noinc(MUTABLE_SV(swash_hv));
2522                 }
2523                 swash_invlist = invlist;
2524             }
2525         }
2526
2527         /* Here, we have computed the union of all the passed-in data.  It may
2528          * be that there was an inversion list in the swash which didn't get
2529          * touched; otherwise save the computed one */
2530         if (! invlist_in_swash_is_valid
2531             && (int) _invlist_len(swash_invlist) > invlist_swash_boundary)
2532         {
2533             if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist))
2534             {
2535                 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
2536             }
2537             /* We just stole a reference count. */
2538             if (swash_invlist_unclaimed) swash_invlist_unclaimed = FALSE;
2539             else SvREFCNT_inc_simple_void_NN(swash_invlist);
2540         }
2541
2542         SvREADONLY_on(swash_invlist);
2543
2544         /* Use the inversion list stand-alone if small enough */
2545         if ((int) _invlist_len(swash_invlist) <= invlist_swash_boundary) {
2546             SvREFCNT_dec(retval);
2547             if (!swash_invlist_unclaimed)
2548                 SvREFCNT_inc_simple_void_NN(swash_invlist);
2549             retval = newRV_noinc(swash_invlist);
2550         }
2551     }
2552
2553     return retval;
2554 }
2555
2556
2557 /* This API is wrong for special case conversions since we may need to
2558  * return several Unicode characters for a single Unicode character
2559  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
2560  * the lower-level routine, and it is similarly broken for returning
2561  * multiple values.  --jhi
2562  * For those, you should use to_utf8_case() instead */
2563 /* Now SWASHGET is recasted into S_swatch_get in this file. */
2564
2565 /* Note:
2566  * Returns the value of property/mapping C<swash> for the first character
2567  * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
2568  * assumed to be in well-formed utf8. If C<do_utf8> is false, the string C<ptr>
2569  * is assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
2570  *
2571  * A "swash" is a hash which contains initially the keys/values set up by
2572  * SWASHNEW.  The purpose is to be able to completely represent a Unicode
2573  * property for all possible code points.  Things are stored in a compact form
2574  * (see utf8_heavy.pl) so that calculation is required to find the actual
2575  * property value for a given code point.  As code points are looked up, new
2576  * key/value pairs are added to the hash, so that the calculation doesn't have
2577  * to ever be re-done.  Further, each calculation is done, not just for the
2578  * desired one, but for a whole block of code points adjacent to that one.
2579  * For binary properties on ASCII machines, the block is usually for 64 code
2580  * points, starting with a code point evenly divisible by 64.  Thus if the
2581  * property value for code point 257 is requested, the code goes out and
2582  * calculates the property values for all 64 code points between 256 and 319,
2583  * and stores these as a single 64-bit long bit vector, called a "swatch",
2584  * under the key for code point 256.  The key is the UTF-8 encoding for code
2585  * point 256, minus the final byte.  Thus, if the length of the UTF-8 encoding
2586  * for a code point is 13 bytes, the key will be 12 bytes long.  If the value
2587  * for code point 258 is then requested, this code realizes that it would be
2588  * stored under the key for 256, and would find that value and extract the
2589  * relevant bit, offset from 256.
2590  *
2591  * Non-binary properties are stored in as many bits as necessary to represent
2592  * their values (32 currently, though the code is more general than that), not
2593  * as single bits, but the principal is the same: the value for each key is a
2594  * vector that encompasses the property values for all code points whose UTF-8
2595  * representations are represented by the key.  That is, for all code points
2596  * whose UTF-8 representations are length N bytes, and the key is the first N-1
2597  * bytes of that.
2598  */
2599 UV
2600 Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
2601 {
2602     dVAR;
2603     HV *const hv = MUTABLE_HV(SvRV(swash));
2604     U32 klen;
2605     U32 off;
2606     STRLEN slen = 0;
2607     STRLEN needents;
2608     const U8 *tmps = NULL;
2609     SV *swatch;
2610     const U8 c = *ptr;
2611
2612     PERL_ARGS_ASSERT_SWASH_FETCH;
2613
2614     /* If it really isn't a hash, it isn't really swash; must be an inversion
2615      * list */
2616     if (SvTYPE(hv) != SVt_PVHV) {
2617         return _invlist_contains_cp((SV*)hv,
2618                                     (do_utf8)
2619                                      ? valid_utf8_to_uvchr(ptr, NULL)
2620                                      : c);
2621     }
2622
2623     /* We store the values in a "swatch" which is a vec() value in a swash
2624      * hash.  Code points 0-255 are a single vec() stored with key length
2625      * (klen) 0.  All other code points have a UTF-8 representation
2626      * 0xAA..0xYY,0xZZ.  A vec() is constructed containing all of them which
2627      * share 0xAA..0xYY, which is the key in the hash to that vec.  So the key
2628      * length for them is the length of the encoded char - 1.  ptr[klen] is the
2629      * final byte in the sequence representing the character */
2630     if (!do_utf8 || UTF8_IS_INVARIANT(c)) {
2631         klen = 0;
2632         needents = 256;
2633         off = c;
2634     }
2635     else if (UTF8_IS_DOWNGRADEABLE_START(c)) {
2636         klen = 0;
2637         needents = 256;
2638         off = TWO_BYTE_UTF8_TO_NATIVE(c, *(ptr + 1));
2639     }
2640     else {
2641         klen = UTF8SKIP(ptr) - 1;
2642
2643         /* Each vec() stores 2**UTF_ACCUMULATION_SHIFT values.  The offset into
2644          * the vec is the final byte in the sequence.  (In EBCDIC this is
2645          * converted to I8 to get consecutive values.)  To help you visualize
2646          * all this:
2647          *                       Straight 1047   After final byte
2648          *             UTF-8      UTF-EBCDIC     I8 transform
2649          *  U+0400:  \xD0\x80    \xB8\x41\x41    \xB8\x41\xA0
2650          *  U+0401:  \xD0\x81    \xB8\x41\x42    \xB8\x41\xA1
2651          *    ...
2652          *  U+0409:  \xD0\x89    \xB8\x41\x4A    \xB8\x41\xA9
2653          *  U+040A:  \xD0\x8A    \xB8\x41\x51    \xB8\x41\xAA
2654          *    ...
2655          *  U+0412:  \xD0\x92    \xB8\x41\x59    \xB8\x41\xB2
2656          *  U+0413:  \xD0\x93    \xB8\x41\x62    \xB8\x41\xB3
2657          *    ...
2658          *  U+041B:  \xD0\x9B    \xB8\x41\x6A    \xB8\x41\xBB
2659          *  U+041C:  \xD0\x9C    \xB8\x41\x70    \xB8\x41\xBC
2660          *    ...
2661          *  U+041F:  \xD0\x9F    \xB8\x41\x73    \xB8\x41\xBF
2662          *  U+0420:  \xD0\xA0    \xB8\x42\x41    \xB8\x42\x41
2663          *
2664          * (There are no discontinuities in the elided (...) entries.)
2665          * The UTF-8 key for these 33 code points is '\xD0' (which also is the
2666          * key for the next 31, up through U+043F, whose UTF-8 final byte is
2667          * \xBF).  Thus in UTF-8, each key is for a vec() for 64 code points.
2668          * The final UTF-8 byte, which ranges between \x80 and \xBF, is an
2669          * index into the vec() swatch (after subtracting 0x80, which we
2670          * actually do with an '&').
2671          * In UTF-EBCDIC, each key is for a 32 code point vec().  The first 32
2672          * code points above have key '\xB8\x41'. The final UTF-EBCDIC byte has
2673          * dicontinuities which go away by transforming it into I8, and we
2674          * effectively subtract 0xA0 to get the index. */
2675         needents = (1 << UTF_ACCUMULATION_SHIFT);
2676         off      = NATIVE_UTF8_TO_I8(ptr[klen]) & UTF_CONTINUATION_MASK;
2677     }
2678
2679     /*
2680      * This single-entry cache saves about 1/3 of the utf8 overhead in test
2681      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
2682      * it's nothing to sniff at.)  Pity we usually come through at least
2683      * two function calls to get here...
2684      *
2685      * NB: this code assumes that swatches are never modified, once generated!
2686      */
2687
2688     if (hv   == PL_last_swash_hv &&
2689         klen == PL_last_swash_klen &&
2690         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
2691     {
2692         tmps = PL_last_swash_tmps;
2693         slen = PL_last_swash_slen;
2694     }
2695     else {
2696         /* Try our second-level swatch cache, kept in a hash. */
2697         SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
2698
2699         /* If not cached, generate it via swatch_get */
2700         if (!svp || !SvPOK(*svp)
2701                  || !(tmps = (const U8*)SvPV_const(*svp, slen)))
2702         {
2703             if (klen) {
2704                 const UV code_point = valid_utf8_to_uvchr(ptr, NULL);
2705                 swatch = swatch_get(swash,
2706                                     code_point & ~((UV)needents - 1),
2707                                     needents);
2708             }
2709             else {  /* For the first 256 code points, the swatch has a key of
2710                        length 0 */
2711                 swatch = swatch_get(swash, 0, needents);
2712             }
2713
2714             if (IN_PERL_COMPILETIME)
2715                 CopHINTS_set(PL_curcop, PL_hints);
2716
2717             svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
2718
2719             if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
2720                      || (slen << 3) < needents)
2721                 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch, "
2722                            "svp=%p, tmps=%p, slen=%"UVuf", needents=%"UVuf,
2723                            svp, tmps, (UV)slen, (UV)needents);
2724         }
2725
2726         PL_last_swash_hv = hv;
2727         assert(klen <= sizeof(PL_last_swash_key));
2728         PL_last_swash_klen = (U8)klen;
2729         /* FIXME change interpvar.h?  */
2730         PL_last_swash_tmps = (U8 *) tmps;
2731         PL_last_swash_slen = slen;
2732         if (klen)
2733             Copy(ptr, PL_last_swash_key, klen, U8);
2734     }
2735
2736     switch ((int)((slen << 3) / needents)) {
2737     case 1:
2738         return ((UV) tmps[off >> 3] & (1 << (off & 7))) != 0;
2739     case 8:
2740         return ((UV) tmps[off]);
2741     case 16:
2742         off <<= 1;
2743         return
2744             ((UV) tmps[off    ] << 8) +
2745             ((UV) tmps[off + 1]);
2746     case 32:
2747         off <<= 2;
2748         return
2749             ((UV) tmps[off    ] << 24) +
2750             ((UV) tmps[off + 1] << 16) +
2751             ((UV) tmps[off + 2] <<  8) +
2752             ((UV) tmps[off + 3]);
2753     }
2754     Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, "
2755                "slen=%"UVuf", needents=%"UVuf, (UV)slen, (UV)needents);
2756     NORETURN_FUNCTION_END;
2757 }
2758
2759 /* Read a single line of the main body of the swash input text.  These are of
2760  * the form:
2761  * 0053 0056    0073
2762  * where each number is hex.  The first two numbers form the minimum and
2763  * maximum of a range, and the third is the value associated with the range.
2764  * Not all swashes should have a third number
2765  *
2766  * On input: l    points to the beginning of the line to be examined; it points
2767  *                to somewhere in the string of the whole input text, and is
2768  *                terminated by a \n or the null string terminator.
2769  *           lend   points to the null terminator of that string
2770  *           wants_value    is non-zero if the swash expects a third number
2771  *           typestr is the name of the swash's mapping, like 'ToLower'
2772  * On output: *min, *max, and *val are set to the values read from the line.
2773  *            returns a pointer just beyond the line examined.  If there was no
2774  *            valid min number on the line, returns lend+1
2775  */
2776
2777 STATIC U8*
2778 S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
2779                              const bool wants_value, const U8* const typestr)
2780 {
2781     const int  typeto  = typestr[0] == 'T' && typestr[1] == 'o';
2782     STRLEN numlen;          /* Length of the number */
2783     I32 flags = PERL_SCAN_SILENT_ILLDIGIT
2784                 | PERL_SCAN_DISALLOW_PREFIX
2785                 | PERL_SCAN_SILENT_NON_PORTABLE;
2786
2787     /* nl points to the next \n in the scan */
2788     U8* const nl = (U8*)memchr(l, '\n', lend - l);
2789
2790     PERL_ARGS_ASSERT_SWASH_SCAN_LIST_LINE;
2791
2792     /* Get the first number on the line: the range minimum */
2793     numlen = lend - l;
2794     *min = grok_hex((char *)l, &numlen, &flags, NULL);
2795     *max = *min;    /* So can never return without setting max */
2796     if (numlen)     /* If found a hex number, position past it */
2797         l += numlen;
2798     else if (nl) {          /* Else, go handle next line, if any */
2799         return nl + 1;  /* 1 is length of "\n" */
2800     }
2801     else {              /* Else, no next line */
2802         return lend + 1;        /* to LIST's end at which \n is not found */
2803     }
2804
2805     /* The max range value follows, separated by a BLANK */
2806     if (isBLANK(*l)) {
2807         ++l;
2808         flags = PERL_SCAN_SILENT_ILLDIGIT
2809                 | PERL_SCAN_DISALLOW_PREFIX
2810                 | PERL_SCAN_SILENT_NON_PORTABLE;
2811         numlen = lend - l;
2812         *max = grok_hex((char *)l, &numlen, &flags, NULL);
2813         if (numlen)
2814             l += numlen;
2815         else    /* If no value here, it is a single element range */
2816             *max = *min;
2817
2818         /* Non-binary tables have a third entry: what the first element of the
2819          * range maps to.  The map for those currently read here is in hex */
2820         if (wants_value) {
2821             if (isBLANK(*l)) {
2822                 ++l;
2823                 flags = PERL_SCAN_SILENT_ILLDIGIT
2824                     | PERL_SCAN_DISALLOW_PREFIX
2825                     | PERL_SCAN_SILENT_NON_PORTABLE;
2826                 numlen = lend - l;
2827                 *val = grok_hex((char *)l, &numlen, &flags, NULL);
2828                 if (numlen)
2829                     l += numlen;
2830                 else
2831                     *val = 0;
2832             }
2833             else {
2834                 *val = 0;
2835                 if (typeto) {
2836                     /* diag_listed_as: To%s: illegal mapping '%s' */
2837                     Perl_croak(aTHX_ "%s: illegal mapping '%s'",
2838                                      typestr, l);
2839                 }
2840             }
2841         }
2842         else
2843             *val = 0; /* bits == 1, then any val should be ignored */
2844     }
2845     else { /* Nothing following range min, should be single element with no
2846               mapping expected */
2847         if (wants_value) {
2848             *val = 0;
2849             if (typeto) {
2850                 /* diag_listed_as: To%s: illegal mapping '%s' */
2851                 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
2852             }
2853         }
2854         else
2855             *val = 0; /* bits == 1, then val should be ignored */
2856     }
2857
2858     /* Position to next line if any, or EOF */
2859     if (nl)
2860         l = nl + 1;
2861     else
2862         l = lend;
2863
2864     return l;
2865 }
2866
2867 /* Note:
2868  * Returns a swatch (a bit vector string) for a code point sequence
2869  * that starts from the value C<start> and comprises the number C<span>.
2870  * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
2871  * Should be used via swash_fetch, which will cache the swatch in C<swash>.
2872  */
2873 STATIC SV*
2874 S_swatch_get(pTHX_ SV* swash, UV start, UV span)
2875 {
2876     SV *swatch;
2877     U8 *l, *lend, *x, *xend, *s, *send;
2878     STRLEN lcur, xcur, scur;
2879     HV *const hv = MUTABLE_HV(SvRV(swash));
2880     SV** const invlistsvp = hv_fetchs(hv, "V", FALSE);
2881
2882     SV** listsvp = NULL; /* The string containing the main body of the table */
2883     SV** extssvp = NULL;
2884     SV** invert_it_svp = NULL;
2885     U8* typestr = NULL;
2886     STRLEN bits;
2887     STRLEN octets; /* if bits == 1, then octets == 0 */
2888     UV  none;
2889     UV  end = start + span;
2890
2891     if (invlistsvp == NULL) {
2892         SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
2893         SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
2894         SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
2895         extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
2896         listsvp = hv_fetchs(hv, "LIST", FALSE);
2897         invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
2898
2899         bits  = SvUV(*bitssvp);
2900         none  = SvUV(*nonesvp);
2901         typestr = (U8*)SvPV_nolen(*typesvp);
2902     }
2903     else {
2904         bits = 1;
2905         none = 0;
2906     }
2907     octets = bits >> 3; /* if bits == 1, then octets == 0 */
2908
2909     PERL_ARGS_ASSERT_SWATCH_GET;
2910
2911     if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
2912         Perl_croak(aTHX_ "panic: swatch_get doesn't expect bits %"UVuf,
2913                                                  (UV)bits);
2914     }
2915
2916     /* If overflowed, use the max possible */
2917     if (end < start) {
2918         end = UV_MAX;
2919         span = end - start;
2920     }
2921
2922     /* create and initialize $swatch */
2923     scur   = octets ? (span * octets) : (span + 7) / 8;
2924     swatch = newSV(scur);
2925     SvPOK_on(swatch);
2926     s = (U8*)SvPVX(swatch);
2927     if (octets && none) {
2928         const U8* const e = s + scur;
2929         while (s < e) {
2930             if (bits == 8)
2931                 *s++ = (U8)(none & 0xff);
2932             else if (bits == 16) {
2933                 *s++ = (U8)((none >>  8) & 0xff);
2934                 *s++ = (U8)( none        & 0xff);
2935             }
2936             else if (bits == 32) {
2937                 *s++ = (U8)((none >> 24) & 0xff);
2938                 *s++ = (U8)((none >> 16) & 0xff);
2939                 *s++ = (U8)((none >>  8) & 0xff);
2940                 *s++ = (U8)( none        & 0xff);
2941             }
2942         }
2943         *s = '\0';
2944     }
2945     else {
2946         (void)memzero((U8*)s, scur + 1);
2947     }
2948     SvCUR_set(swatch, scur);
2949     s = (U8*)SvPVX(swatch);
2950
2951     if (invlistsvp) {   /* If has an inversion list set up use that */
2952         _invlist_populate_swatch(*invlistsvp, start, end, s);
2953         return swatch;
2954     }
2955
2956     /* read $swash->{LIST} */
2957     l = (U8*)SvPV(*listsvp, lcur);
2958     lend = l + lcur;
2959     while (l < lend) {
2960         UV min, max, val, upper;
2961         l = swash_scan_list_line(l, lend, &min, &max, &val,
2962                                                         cBOOL(octets), typestr);
2963         if (l > lend) {
2964             break;
2965         }
2966
2967         /* If looking for something beyond this range, go try the next one */
2968         if (max < start)
2969             continue;
2970
2971         /* <end> is generally 1 beyond where we want to set things, but at the
2972          * platform's infinity, where we can't go any higher, we want to
2973          * include the code point at <end> */
2974         upper = (max < end)
2975                 ? max
2976                 : (max != UV_MAX || end != UV_MAX)
2977                   ? end - 1
2978                   : end;
2979
2980         if (octets) {
2981             UV key;
2982             if (min < start) {
2983                 if (!none || val < none) {
2984                     val += start - min;
2985                 }
2986                 min = start;
2987             }
2988             for (key = min; key <= upper; key++) {
2989                 STRLEN offset;
2990                 /* offset must be non-negative (start <= min <= key < end) */
2991                 offset = octets * (key - start);
2992                 if (bits == 8)
2993                     s[offset] = (U8)(val & 0xff);
2994                 else if (bits == 16) {
2995                     s[offset    ] = (U8)((val >>  8) & 0xff);
2996                     s[offset + 1] = (U8)( val        & 0xff);
2997                 }
2998                 else if (bits == 32) {
2999                     s[offset    ] = (U8)((val >> 24) & 0xff);
3000                     s[offset + 1] = (U8)((val >> 16) & 0xff);
3001                     s[offset + 2] = (U8)((val >>  8) & 0xff);
3002                     s[offset + 3] = (U8)( val        & 0xff);
3003                 }
3004
3005                 if (!none || val < none)
3006                     ++val;
3007             }
3008         }
3009         else { /* bits == 1, then val should be ignored */
3010             UV key;
3011             if (min < start)
3012                 min = start;
3013
3014             for (key = min; key <= upper; key++) {
3015                 const STRLEN offset = (STRLEN)(key - start);
3016                 s[offset >> 3] |= 1 << (offset & 7);
3017             }
3018         }
3019     } /* while */
3020
3021     /* Invert if the data says it should be.  Assumes that bits == 1 */
3022     if (invert_it_svp && SvUV(*invert_it_svp)) {
3023
3024         /* Unicode properties should come with all bits above PERL_UNICODE_MAX
3025          * be 0, and their inversion should also be 0, as we don't succeed any
3026          * Unicode property matches for non-Unicode code points */
3027         if (start <= PERL_UNICODE_MAX) {
3028
3029             /* The code below assumes that we never cross the
3030              * Unicode/above-Unicode boundary in a range, as otherwise we would
3031              * have to figure out where to stop flipping the bits.  Since this
3032              * boundary is divisible by a large power of 2, and swatches comes
3033              * in small powers of 2, this should be a valid assumption */
3034             assert(start + span - 1 <= PERL_UNICODE_MAX);
3035
3036             send = s + scur;
3037             while (s < send) {
3038                 *s = ~(*s);
3039                 s++;
3040             }
3041         }
3042     }
3043
3044     /* read $swash->{EXTRAS}
3045      * This code also copied to swash_to_invlist() below */
3046     x = (U8*)SvPV(*extssvp, xcur);
3047     xend = x + xcur;
3048     while (x < xend) {
3049         STRLEN namelen;
3050         U8 *namestr;
3051         SV** othersvp;
3052         HV* otherhv;
3053         STRLEN otherbits;
3054         SV **otherbitssvp, *other;
3055         U8 *s, *o, *nl;
3056         STRLEN slen, olen;
3057
3058         const U8 opc = *x++;
3059         if (opc == '\n')
3060             continue;
3061
3062         nl = (U8*)memchr(x, '\n', xend - x);
3063
3064         if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
3065             if (nl) {
3066                 x = nl + 1; /* 1 is length of "\n" */
3067                 continue;
3068             }
3069             else {
3070                 x = xend; /* to EXTRAS' end at which \n is not found */
3071                 break;
3072             }
3073         }
3074
3075         namestr = x;
3076         if (nl) {
3077             namelen = nl - namestr;
3078             x = nl + 1;
3079         }
3080         else {
3081             namelen = xend - namestr;
3082             x = xend;
3083         }
3084
3085         othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
3086         otherhv = MUTABLE_HV(SvRV(*othersvp));
3087         otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
3088         otherbits = (STRLEN)SvUV(*otherbitssvp);
3089         if (bits < otherbits)
3090             Perl_croak(aTHX_ "panic: swatch_get found swatch size mismatch, "
3091                        "bits=%"UVuf", otherbits=%"UVuf, (UV)bits, (UV)otherbits);
3092
3093         /* The "other" swatch must be destroyed after. */
3094         other = swatch_get(*othersvp, start, span);
3095         o = (U8*)SvPV(other, olen);
3096
3097         if (!olen)
3098             Perl_croak(aTHX_ "panic: swatch_get got improper swatch");
3099
3100         s = (U8*)SvPV(swatch, slen);
3101         if (bits == 1 && otherbits == 1) {
3102             if (slen != olen)
3103                 Perl_croak(aTHX_ "panic: swatch_get found swatch length "
3104                            "mismatch, slen=%"UVuf", olen=%"UVuf,
3105                            (UV)slen, (UV)olen);
3106
3107             switch (opc) {
3108             case '+':
3109                 while (slen--)
3110                     *s++ |= *o++;
3111                 break;
3112             case '!':
3113                 while (slen--)
3114                     *s++ |= ~*o++;
3115                 break;
3116             case '-':
3117                 while (slen--)
3118                     *s++ &= ~*o++;
3119                 break;
3120             case '&':
3121                 while (slen--)
3122                     *s++ &= *o++;
3123                 break;
3124             default:
3125                 break;
3126             }
3127         }
3128         else {
3129             STRLEN otheroctets = otherbits >> 3;
3130             STRLEN offset = 0;
3131             U8* const send = s + slen;
3132
3133             while (s < send) {
3134                 UV otherval = 0;
3135
3136                 if (otherbits == 1) {
3137                     otherval = (o[offset >> 3] >> (offset & 7)) & 1;
3138                     ++offset;
3139                 }
3140                 else {
3141                     STRLEN vlen = otheroctets;
3142                     otherval = *o++;
3143                     while (--vlen) {
3144                         otherval <<= 8;
3145                         otherval |= *o++;
3146                     }
3147                 }
3148
3149                 if (opc == '+' && otherval)
3150                     NOOP;   /* replace with otherval */
3151                 else if (opc == '!' && !otherval)
3152                     otherval = 1;
3153                 else if (opc == '-' && otherval)
3154                     otherval = 0;
3155                 else if (opc == '&' && !otherval)
3156                     otherval = 0;
3157                 else {
3158                     s += octets; /* no replacement */
3159                     continue;
3160                 }
3161
3162                 if (bits == 8)
3163                     *s++ = (U8)( otherval & 0xff);
3164                 else if (bits == 16) {
3165                     *s++ = (U8)((otherval >>  8) & 0xff);
3166                     *s++ = (U8)( otherval        & 0xff);
3167                 }
3168                 else if (bits == 32) {
3169                     *s++ = (U8)((otherval >> 24) & 0xff);
3170                     *s++ = (U8)((otherval >> 16) & 0xff);
3171                     *s++ = (U8)((otherval >>  8) & 0xff);
3172                     *s++ = (U8)( otherval        & 0xff);
3173                 }
3174             }
3175         }
3176         sv_free(other); /* through with it! */
3177     } /* while */
3178     return swatch;
3179 }
3180
3181 HV*
3182 Perl__swash_inversion_hash(pTHX_ SV* const swash)
3183 {
3184
3185    /* Subject to change or removal.  For use only in regcomp.c and regexec.c
3186     * Can't be used on a property that is subject to user override, as it
3187     * relies on the value of SPECIALS in the swash which would be set by
3188     * utf8_heavy.pl to the hash in the non-overriden file, and hence is not set
3189     * for overridden properties
3190     *
3191     * Returns a hash which is the inversion and closure of a swash mapping.
3192     * For example, consider the input lines:
3193     * 004B              006B
3194     * 004C              006C
3195     * 212A              006B
3196     *
3197     * The returned hash would have two keys, the utf8 for 006B and the utf8 for
3198     * 006C.  The value for each key is an array.  For 006C, the array would
3199     * have two elements, the utf8 for itself, and for 004C.  For 006B, there
3200     * would be three elements in its array, the utf8 for 006B, 004B and 212A.
3201     *
3202     * Note that there are no elements in the hash for 004B, 004C, 212A.  The
3203     * keys are only code points that are folded-to, so it isn't a full closure.
3204     *
3205     * Essentially, for any code point, it gives all the code points that map to
3206     * it, or the list of 'froms' for that point.
3207     *
3208     * Currently it ignores any additions or deletions from other swashes,
3209     * looking at just the main body of the swash, and if there are SPECIALS
3210     * in the swash, at that hash
3211     *
3212     * The specials hash can be extra code points, and most likely consists of
3213     * maps from single code points to multiple ones (each expressed as a string
3214     * of utf8 characters).   This function currently returns only 1-1 mappings.
3215     * However consider this possible input in the specials hash:
3216     * "\xEF\xAC\x85" => "\x{0073}\x{0074}",         # U+FB05 => 0073 0074
3217     * "\xEF\xAC\x86" => "\x{0073}\x{0074}",         # U+FB06 => 0073 0074
3218     *
3219     * Both FB05 and FB06 map to the same multi-char sequence, which we don't
3220     * currently handle.  But it also means that FB05 and FB06 are equivalent in
3221     * a 1-1 mapping which we should handle, and this relationship may not be in
3222     * the main table.  Therefore this function examines all the multi-char
3223     * sequences and adds the 1-1 mappings that come out of that.  */
3224
3225     U8 *l, *lend;
3226     STRLEN lcur;
3227     HV *const hv = MUTABLE_HV(SvRV(swash));
3228
3229     /* The string containing the main body of the table.  This will have its
3230      * assertion fail if the swash has been converted to its inversion list */
3231     SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
3232
3233     SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
3234     SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
3235     SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
3236     /*SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);*/
3237     const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
3238     const STRLEN bits  = SvUV(*bitssvp);
3239     const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
3240     const UV     none  = SvUV(*nonesvp);
3241     SV **specials_p = hv_fetchs(hv, "SPECIALS", 0);
3242
3243     HV* ret = newHV();
3244
3245     PERL_ARGS_ASSERT__SWASH_INVERSION_HASH;
3246
3247     /* Must have at least 8 bits to get the mappings */
3248     if (bits != 8 && bits != 16 && bits != 32) {
3249         Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %"UVuf,
3250                                                  (UV)bits);
3251     }
3252
3253     if (specials_p) { /* It might be "special" (sometimes, but not always, a
3254                         mapping to more than one character */
3255
3256         /* Construct an inverse mapping hash for the specials */
3257         HV * const specials_hv = MUTABLE_HV(SvRV(*specials_p));
3258         HV * specials_inverse = newHV();
3259         char *char_from; /* the lhs of the map */
3260         I32 from_len;   /* its byte length */
3261         char *char_to;  /* the rhs of the map */
3262         I32 to_len;     /* its byte length */
3263         SV *sv_to;      /* and in a sv */
3264         AV* from_list;  /* list of things that map to each 'to' */
3265
3266         hv_iterinit(specials_hv);
3267
3268         /* The keys are the characters (in utf8) that map to the corresponding
3269          * utf8 string value.  Iterate through the list creating the inverse
3270          * list. */
3271         while ((sv_to = hv_iternextsv(specials_hv, &char_from, &from_len))) {
3272             SV** listp;
3273             if (! SvPOK(sv_to)) {
3274                 Perl_croak(aTHX_ "panic: value returned from hv_iternextsv() "
3275                            "unexpectedly is not a string, flags=%lu",
3276                            (unsigned long)SvFLAGS(sv_to));
3277             }
3278             /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %"UVXf", First char of to is %"UVXf"\n", valid_utf8_to_uvchr((U8*) char_from, 0), valid_utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/
3279
3280             /* Each key in the inverse list is a mapped-to value, and the key's
3281              * hash value is a list of the strings (each in utf8) that map to
3282              * it.  Those strings are all one character long */
3283             if ((listp = hv_fetch(specials_inverse,
3284                                     SvPVX(sv_to),
3285                                     SvCUR(sv_to), 0)))
3286             {
3287                 from_list = (AV*) *listp;
3288             }
3289             else { /* No entry yet for it: create one */
3290                 from_list = newAV();
3291                 if (! hv_store(specials_inverse,
3292                                 SvPVX(sv_to),
3293                                 SvCUR(sv_to),
3294                                 (SV*) from_list, 0))
3295                 {
3296                     Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3297                 }
3298             }
3299
3300             /* Here have the list associated with this 'to' (perhaps newly
3301              * created and empty).  Just add to it.  Note that we ASSUME that
3302              * the input is guaranteed to not have duplications, so we don't
3303              * check for that.  Duplications just slow down execution time. */
3304             av_push(from_list, newSVpvn_utf8(char_from, from_len, TRUE));
3305         }
3306
3307         /* Here, 'specials_inverse' contains the inverse mapping.  Go through
3308          * it looking for cases like the FB05/FB06 examples above.  There would
3309          * be an entry in the hash like
3310         *       'st' => [ FB05, FB06 ]
3311         * In this example we will create two lists that get stored in the
3312         * returned hash, 'ret':
3313         *       FB05 => [ FB05, FB06 ]
3314         *       FB06 => [ FB05, FB06 ]
3315         *
3316         * Note that there is nothing to do if the array only has one element.
3317         * (In the normal 1-1 case handled below, we don't have to worry about
3318         * two lists, as everything gets tied to the single list that is
3319         * generated for the single character 'to'.  But here, we are omitting
3320         * that list, ('st' in the example), so must have multiple lists.) */
3321         while ((from_list = (AV *) hv_iternextsv(specials_inverse,
3322                                                  &char_to, &to_len)))
3323         {
3324             if (av_tindex(from_list) > 0) {
3325                 SSize_t i;
3326
3327                 /* We iterate over all combinations of i,j to place each code
3328                  * point on each list */
3329                 for (i = 0; i <= av_tindex(from_list); i++) {
3330                     SSize_t j;
3331                     AV* i_list = newAV();
3332                     SV** entryp = av_fetch(from_list, i, FALSE);
3333                     if (entryp == NULL) {
3334                         Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
3335                     }
3336                     if (hv_fetch(ret, SvPVX(*entryp), SvCUR(*entryp), FALSE)) {
3337                         Perl_croak(aTHX_ "panic: unexpected entry for %s", SvPVX(*entryp));
3338                     }
3339                     if (! hv_store(ret, SvPVX(*entryp), SvCUR(*entryp),
3340                                    (SV*) i_list, FALSE))
3341                     {
3342                         Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3343                     }
3344
3345                     /* For DEBUG_U: UV u = valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/
3346                     for (j = 0; j <= av_tindex(from_list); j++) {
3347                         entryp = av_fetch(from_list, j, FALSE);
3348                         if (entryp == NULL) {
3349                             Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
3350                         }
3351
3352                         /* When i==j this adds itself to the list */
3353                         av_push(i_list, newSVuv(utf8_to_uvchr_buf(
3354                                         (U8*) SvPVX(*entryp),
3355                                         (U8*) SvPVX(*entryp) + SvCUR(*entryp),
3356                                         0)));
3357                         /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0), u));*/
3358                     }
3359                 }
3360             }
3361         }
3362         SvREFCNT_dec(specials_inverse); /* done with it */
3363     } /* End of specials */
3364
3365     /* read $swash->{LIST} */
3366     l = (U8*)SvPV(*listsvp, lcur);
3367     lend = l + lcur;
3368
3369     /* Go through each input line */
3370     while (l < lend) {
3371         UV min, max, val;
3372         UV inverse;
3373         l = swash_scan_list_line(l, lend, &min, &max, &val,
3374                                                      cBOOL(octets), typestr);
3375         if (l > lend) {
3376             break;
3377         }
3378
3379         /* Each element in the range is to be inverted */
3380         for (inverse = min; inverse <= max; inverse++) {
3381             AV* list;
3382             SV** listp;
3383             IV i;
3384             bool found_key = FALSE;
3385             bool found_inverse = FALSE;
3386
3387             /* The key is the inverse mapping */
3388             char key[UTF8_MAXBYTES+1];
3389             char* key_end = (char *) uvchr_to_utf8((U8*) key, val);
3390             STRLEN key_len = key_end - key;
3391
3392             /* Get the list for the map */
3393             if ((listp = hv_fetch(ret, key, key_len, FALSE))) {
3394                 list = (AV*) *listp;
3395             }
3396             else { /* No entry yet for it: create one */
3397                 list = newAV();
3398                 if (! hv_store(ret, key, key_len, (SV*) list, FALSE)) {
3399                     Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3400                 }
3401             }
3402
3403             /* Look through list to see if this inverse mapping already is
3404              * listed, or if there is a mapping to itself already */
3405             for (i = 0; i <= av_tindex(list); i++) {
3406                 SV** entryp = av_fetch(list, i, FALSE);
3407                 SV* entry;
3408                 UV uv;
3409                 if (entryp == NULL) {
3410                     Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
3411                 }
3412                 entry = *entryp;
3413                 uv = SvUV(entry);
3414                 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "list for %"UVXf" contains %"UVXf"\n", val, uv));*/
3415                 if (uv == val) {
3416                     found_key = TRUE;
3417                 }
3418                 if (uv == inverse) {
3419                     found_inverse = TRUE;
3420                 }
3421
3422                 /* No need to continue searching if found everything we are
3423                  * looking for */
3424                 if (found_key && found_inverse) {
3425                     break;
3426                 }
3427             }
3428
3429             /* Make sure there is a mapping to itself on the list */
3430             if (! found_key) {
3431                 av_push(list, newSVuv(val));
3432                 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, val, val));*/
3433             }
3434
3435
3436             /* Simply add the value to the list */
3437             if (! found_inverse) {
3438                 av_push(list, newSVuv(inverse));
3439                 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %"UVXf" to list for %"UVXf"\n", __FILE__, __LINE__, inverse, val));*/
3440             }
3441
3442             /* swatch_get() increments the value of val for each element in the
3443              * range.  That makes more compact tables possible.  You can
3444              * express the capitalization, for example, of all consecutive
3445              * letters with a single line: 0061\t007A\t0041 This maps 0061 to
3446              * 0041, 0062 to 0042, etc.  I (khw) have never understood 'none',
3447              * and it's not documented; it appears to be used only in
3448              * implementing tr//; I copied the semantics from swatch_get(), just
3449              * in case */
3450             if (!none || val < none) {
3451                 ++val;
3452             }
3453         }
3454     }
3455
3456     return ret;
3457 }
3458
3459 SV*
3460 Perl__swash_to_invlist(pTHX_ SV* const swash)
3461 {
3462
3463    /* Subject to change or removal.  For use only in one place in regcomp.c.
3464     * Ownership is given to one reference count in the returned SV* */
3465
3466     U8 *l, *lend;
3467     char *loc;
3468     STRLEN lcur;
3469     HV *const hv = MUTABLE_HV(SvRV(swash));
3470     UV elements = 0;    /* Number of elements in the inversion list */
3471     U8 empty[] = "";
3472     SV** listsvp;
3473     SV** typesvp;
3474     SV** bitssvp;
3475     SV** extssvp;
3476     SV** invert_it_svp;
3477
3478     U8* typestr;
3479     STRLEN bits;
3480     STRLEN octets; /* if bits == 1, then octets == 0 */
3481     U8 *x, *xend;
3482     STRLEN xcur;
3483
3484     SV* invlist;
3485
3486     PERL_ARGS_ASSERT__SWASH_TO_INVLIST;
3487
3488     /* If not a hash, it must be the swash's inversion list instead */
3489     if (SvTYPE(hv) != SVt_PVHV) {
3490         return SvREFCNT_inc_simple_NN((SV*) hv);
3491     }
3492
3493     /* The string containing the main body of the table */
3494     listsvp = hv_fetchs(hv, "LIST", FALSE);
3495     typesvp = hv_fetchs(hv, "TYPE", FALSE);
3496     bitssvp = hv_fetchs(hv, "BITS", FALSE);
3497     extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
3498     invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
3499
3500     typestr = (U8*)SvPV_nolen(*typesvp);
3501     bits  = SvUV(*bitssvp);
3502     octets = bits >> 3; /* if bits == 1, then octets == 0 */
3503
3504     /* read $swash->{LIST} */
3505     if (SvPOK(*listsvp)) {
3506         l = (U8*)SvPV(*listsvp, lcur);
3507     }
3508     else {
3509         /* LIST legitimately doesn't contain a string during compilation phases
3510          * of Perl itself, before the Unicode tables are generated.  In this
3511          * case, just fake things up by creating an empty list */
3512         l = empty;
3513         lcur = 0;
3514     }
3515     loc = (char *) l;
3516     lend = l + lcur;
3517
3518     if (*l == 'V') {    /*  Inversion list format */
3519         char *after_strtol = (char *) lend;
3520         UV element0;
3521         UV* other_elements_ptr;
3522
3523         /* The first number is a count of the rest */
3524         l++;
3525         elements = Strtoul((char *)l, &after_strtol, 10);
3526         if (elements == 0) {
3527             invlist = _new_invlist(0);
3528         }
3529         else {
3530             l = (U8 *) after_strtol;
3531
3532             /* Get the 0th element, which is needed to setup the inversion list */
3533             element0 = (UV) Strtoul((char *)l, &after_strtol, 10);
3534             l = (U8 *) after_strtol;
3535             invlist = _setup_canned_invlist(elements, element0, &other_elements_ptr);
3536             elements--;
3537
3538             /* Then just populate the rest of the input */
3539             while (elements-- > 0) {
3540                 if (l > lend) {
3541                     Perl_croak(aTHX_ "panic: Expecting %"UVuf" more elements than available", elements);
3542                 }
3543                 *other_elements_ptr++ = (UV) Strtoul((char *)l, &after_strtol, 10);
3544                 l = (U8 *) after_strtol;
3545             }
3546         }
3547     }
3548     else {
3549
3550         /* Scan the input to count the number of lines to preallocate array
3551          * size based on worst possible case, which is each line in the input
3552          * creates 2 elements in the inversion list: 1) the beginning of a
3553          * range in the list; 2) the beginning of a range not in the list.  */
3554         while ((loc = (strchr(loc, '\n'))) != NULL) {
3555             elements += 2;
3556             loc++;
3557         }
3558
3559         /* If the ending is somehow corrupt and isn't a new line, add another
3560          * element for the final range that isn't in the inversion list */
3561         if (! (*lend == '\n'
3562             || (*lend == '\0' && (lcur == 0 || *(lend - 1) == '\n'))))
3563         {
3564             elements++;
3565         }
3566
3567         invlist = _new_invlist(elements);
3568
3569         /* Now go through the input again, adding each range to the list */
3570         while (l < lend) {
3571             UV start, end;
3572             UV val;             /* Not used by this function */
3573
3574             l = swash_scan_list_line(l, lend, &start, &end, &val,
3575                                                         cBOOL(octets), typestr);
3576
3577             if (l > lend) {
3578                 break;
3579             }
3580
3581             invlist = _add_range_to_invlist(invlist, start, end);
3582         }
3583     }
3584
3585     /* Invert if the data says it should be */
3586     if (invert_it_svp && SvUV(*invert_it_svp)) {
3587         _invlist_invert(invlist);
3588     }
3589
3590     /* This code is copied from swatch_get()
3591      * read $swash->{EXTRAS} */
3592     x = (U8*)SvPV(*extssvp, xcur);
3593     xend = x + xcur;
3594     while (x < xend) {
3595         STRLEN namelen;
3596         U8 *namestr;
3597         SV** othersvp;
3598         HV* otherhv;
3599         STRLEN otherbits;
3600         SV **otherbitssvp, *other;
3601         U8 *nl;
3602
3603         const U8 opc = *x++;
3604         if (opc == '\n')
3605             continue;
3606
3607         nl = (U8*)memchr(x, '\n', xend - x);
3608
3609         if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
3610             if (nl) {
3611                 x = nl + 1; /* 1 is length of "\n" */
3612                 continue;
3613             }
3614             else {
3615                 x = xend; /* to EXTRAS' end at which \n is not found */
3616                 break;
3617             }
3618         }
3619
3620         namestr = x;
3621         if (nl) {
3622             namelen = nl - namestr;
3623             x = nl + 1;
3624         }
3625         else {
3626             namelen = xend - namestr;
3627             x = xend;
3628         }
3629
3630         othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
3631         otherhv = MUTABLE_HV(SvRV(*othersvp));
3632         otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
3633         otherbits = (STRLEN)SvUV(*otherbitssvp);
3634
3635         if (bits != otherbits || bits != 1) {
3636             Perl_croak(aTHX_ "panic: _swash_to_invlist only operates on boolean "
3637                        "properties, bits=%"UVuf", otherbits=%"UVuf,
3638                        (UV)bits, (UV)otherbits);
3639         }
3640
3641         /* The "other" swatch must be destroyed after. */
3642         other = _swash_to_invlist((SV *)*othersvp);
3643
3644         /* End of code copied from swatch_get() */
3645         switch (opc) {
3646         case '+':
3647             _invlist_union(invlist, other, &invlist);
3648             break;
3649         case '!':
3650             _invlist_union_maybe_complement_2nd(invlist, other, TRUE, &invlist);
3651             break;
3652         case '-':
3653             _invlist_subtract(invlist, other, &invlist);
3654             break;
3655         case '&':
3656             _invlist_intersection(invlist, other, &invlist);
3657             break;
3658         default:
3659             break;
3660         }
3661         sv_free(other); /* through with it! */
3662     }
3663
3664     SvREADONLY_on(invlist);
3665     return invlist;
3666 }
3667
3668 SV*
3669 Perl__get_swash_invlist(pTHX_ SV* const swash)
3670 {
3671     SV** ptr;
3672
3673     PERL_ARGS_ASSERT__GET_SWASH_INVLIST;
3674
3675     if (! SvROK(swash)) {
3676         return NULL;
3677     }
3678
3679     /* If it really isn't a hash, it isn't really swash; must be an inversion
3680      * list */
3681     if (SvTYPE(SvRV(swash)) != SVt_PVHV) {
3682         return SvRV(swash);
3683     }
3684
3685     ptr = hv_fetchs(MUTABLE_HV(SvRV(swash)), "V", FALSE);
3686     if (! ptr) {
3687         return NULL;
3688     }
3689
3690     return *ptr;
3691 }
3692
3693 bool
3694 Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
3695 {
3696     /* May change: warns if surrogates, non-character code points, or
3697      * non-Unicode code points are in s which has length len bytes.  Returns
3698      * TRUE if none found; FALSE otherwise.  The only other validity check is
3699      * to make sure that this won't exceed the string's length */
3700
3701     const U8* const e = s + len;
3702     bool ok = TRUE;
3703
3704     PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
3705
3706     while (s < e) {
3707         if (UTF8SKIP(s) > len) {
3708             Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
3709                            "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
3710             return FALSE;
3711         }
3712         if (UNLIKELY(*s >= UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE)) {
3713             STRLEN char_len;
3714             if (UTF8_IS_SUPER(s)) {
3715                 if (ckWARN_d(WARN_NON_UNICODE)) {
3716                     UV uv = utf8_to_uvchr_buf(s, e, &char_len);
3717                     Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
3718                         "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
3719                     ok = FALSE;
3720                 }
3721             }
3722             else if (UTF8_IS_SURROGATE(s)) {
3723                 if (ckWARN_d(WARN_SURROGATE)) {
3724                     UV uv = utf8_to_uvchr_buf(s, e, &char_len);
3725                     Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
3726                         "Unicode surrogate U+%04"UVXf" is illegal in UTF-8", uv);
3727                     ok = FALSE;
3728                 }
3729             }
3730             else if
3731                 ((UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s))
3732                  && (ckWARN_d(WARN_NONCHAR)))
3733             {
3734                 UV uv = utf8_to_uvchr_buf(s, e, &char_len);
3735                 Perl_warner(aTHX_ packWARN(WARN_NONCHAR),
3736                     "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv);
3737                 ok = FALSE;
3738             }
3739         }
3740         s += UTF8SKIP(s);
3741     }
3742
3743     return ok;
3744 }
3745
3746 /*
3747 =for apidoc pv_uni_display
3748
3749 Build to the scalar C<dsv> a displayable version of the string C<spv>,
3750 length C<len>, the displayable version being at most C<pvlim> bytes long
3751 (if longer, the rest is truncated and "..." will be appended).
3752
3753 The C<flags> argument can have UNI_DISPLAY_ISPRINT set to display
3754 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
3755 to display the \\[nrfta\\] as the backslashed versions (like '\n')
3756 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
3757 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
3758 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
3759
3760 The pointer to the PV of the C<dsv> is returned.
3761
3762 =cut */
3763 char *
3764 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
3765 {
3766     int truncated = 0;
3767     const char *s, *e;
3768
3769     PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
3770
3771     sv_setpvs(dsv, "");
3772     SvUTF8_off(dsv);
3773     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
3774          UV u;
3775           /* This serves double duty as a flag and a character to print after
3776              a \ when flags & UNI_DISPLAY_BACKSLASH is true.
3777           */
3778          char ok = 0;
3779
3780          if (pvlim && SvCUR(dsv) >= pvlim) {
3781               truncated++;
3782               break;
3783          }
3784          u = utf8_to_uvchr_buf((U8*)s, (U8*)e, 0);
3785          if (u < 256) {
3786              const unsigned char c = (unsigned char)u & 0xFF;
3787              if (flags & UNI_DISPLAY_BACKSLASH) {
3788                  switch (c) {
3789                  case '\n':
3790                      ok = 'n'; break;
3791                  case '\r':
3792                      ok = 'r'; break;
3793                  case '\t':
3794                      ok = 't'; break;
3795                  case '\f':
3796                      ok = 'f'; break;
3797                  case '\a':
3798                      ok = 'a'; break;
3799                  case '\\':
3800                      ok = '\\'; break;
3801                  default: break;
3802                  }
3803                  if (ok) {
3804                      const char string = ok;
3805                      sv_catpvs(dsv, "\\");
3806                      sv_catpvn(dsv, &string, 1);
3807                  }
3808              }
3809              /* isPRINT() is the locale-blind version. */
3810              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
3811                  const char string = c;
3812                  sv_catpvn(dsv, &string, 1);
3813                  ok = 1;
3814              }
3815          }
3816          if (!ok)
3817              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
3818     }
3819     if (truncated)
3820          sv_catpvs(dsv, "...");
3821
3822     return SvPVX(dsv);
3823 }
3824
3825 /*
3826 =for apidoc sv_uni_display
3827
3828 Build to the scalar C<dsv> a displayable version of the scalar C<sv>,
3829 the displayable version being at most C<pvlim> bytes long
3830 (if longer, the rest is truncated and "..." will be appended).
3831
3832 The C<flags> argument is as in L</pv_uni_display>().
3833
3834 The pointer to the PV of the C<dsv> is returned.
3835
3836 =cut
3837 */
3838 char *
3839 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
3840 {
3841     const char * const ptr =
3842         isREGEXP(ssv) ? RX_WRAPPED((REGEXP*)ssv) : SvPVX_const(ssv);
3843
3844     PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
3845
3846     return Perl_pv_uni_display(aTHX_ dsv, (const U8*)ptr,
3847                                 SvCUR(ssv), pvlim, flags);
3848 }
3849
3850 /*
3851 =for apidoc foldEQ_utf8
3852
3853 Returns true if the leading portions of the strings C<s1> and C<s2> (either or both
3854 of which may be in UTF-8) are the same case-insensitively; false otherwise.
3855 How far into the strings to compare is determined by other input parameters.
3856
3857 If C<u1> is true, the string C<s1> is assumed to be in UTF-8-encoded Unicode;
3858 otherwise it is assumed to be in native 8-bit encoding.  Correspondingly for C<u2>
3859 with respect to C<s2>.
3860
3861 If the byte length C<l1> is non-zero, it says how far into C<s1> to check for fold
3862 equality.  In other words, C<s1>+C<l1> will be used as a goal to reach.  The
3863 scan will not be considered to be a match unless the goal is reached, and
3864 scanning won't continue past that goal.  Correspondingly for C<l2> with respect to
3865 C<s2>.
3866
3867 If C<pe1> is non-NULL and the pointer it points to is not NULL, that pointer is
3868 considered an end pointer to the position 1 byte past the maximum point
3869 in C<s1> beyond which scanning will not continue under any circumstances.
3870 (This routine assumes that UTF-8 encoded input strings are not malformed;
3871 malformed input can cause it to read past C<pe1>).
3872 This means that if both C<l1> and C<pe1> are specified, and C<pe1>
3873 is less than C<s1>+C<l1>, the match will never be successful because it can
3874 never
3875 get as far as its goal (and in fact is asserted against).  Correspondingly for
3876 C<pe2> with respect to C<s2>.
3877
3878 At least one of C<s1> and C<s2> must have a goal (at least one of C<l1> and
3879 C<l2> must be non-zero), and if both do, both have to be
3880 reached for a successful match.   Also, if the fold of a character is multiple
3881 characters, all of them must be matched (see tr21 reference below for
3882 'folding').
3883
3884 Upon a successful match, if C<pe1> is non-NULL,
3885 it will be set to point to the beginning of the I<next> character of C<s1>
3886 beyond what was matched.  Correspondingly for C<pe2> and C<s2>.
3887
3888 For case-insensitiveness, the "casefolding" of Unicode is used
3889 instead of upper/lowercasing both the characters, see
3890 L<http://www.unicode.org/unicode/reports/tr21/> (Case Mappings).
3891
3892 =cut */
3893
3894 /* A flags parameter has been added which may change, and hence isn't
3895  * externally documented.  Currently it is:
3896  *  0 for as-documented above
3897  *  FOLDEQ_UTF8_NOMIX_ASCII meaning that if a non-ASCII character folds to an
3898                             ASCII one, to not match
3899  *  FOLDEQ_LOCALE           is set iff the rules from the current underlying
3900  *                          locale are to be used.
3901  *  FOLDEQ_S1_ALREADY_FOLDED  s1 has already been folded before calling this
3902  *                            routine.  This allows that step to be skipped.
3903  *  FOLDEQ_S2_ALREADY_FOLDED  Similarly.
3904  */
3905 I32
3906 Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const char *s2, char **pe2, UV l2, bool u2, U32 flags)
3907 {
3908     dVAR;
3909     const U8 *p1  = (const U8*)s1; /* Point to current char */
3910     const U8 *p2  = (const U8*)s2;
3911     const U8 *g1 = NULL;       /* goal for s1 */
3912     const U8 *g2 = NULL;
3913     const U8 *e1 = NULL;       /* Don't scan s1 past this */
3914     U8 *f1 = NULL;             /* Point to current folded */
3915     const U8 *e2 = NULL;
3916     U8 *f2 = NULL;
3917     STRLEN n1 = 0, n2 = 0;              /* Number of bytes in current char */
3918     U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
3919     U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
3920
3921     PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
3922
3923     assert( ! ((flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE))
3924            && (flags & (FOLDEQ_S1_ALREADY_FOLDED | FOLDEQ_S2_ALREADY_FOLDED))));
3925     /* The algorithm is to trial the folds without regard to the flags on
3926      * the first line of the above assert(), and then see if the result
3927      * violates them.  This means that the inputs can't be pre-folded to a
3928      * violating result, hence the assert.  This could be changed, with the
3929      * addition of extra tests here for the already-folded case, which would
3930      * slow it down.  That cost is more than any possible gain for when these
3931      * flags are specified, as the flags indicate /il or /iaa matching which
3932      * is less common than /iu, and I (khw) also believe that real-world /il
3933      * and /iaa matches are most likely to involve code points 0-255, and this
3934      * function only under rare conditions gets called for 0-255. */
3935
3936     if (IN_UTF8_CTYPE_LOCALE) {
3937         flags &= ~FOLDEQ_LOCALE;
3938     }
3939
3940     if (pe1) {
3941         e1 = *(U8**)pe1;
3942     }
3943
3944     if (l1) {
3945         g1 = (const U8*)s1 + l1;
3946     }
3947
3948     if (pe2) {
3949         e2 = *(U8**)pe2;
3950     }
3951
3952     if (l2) {
3953         g2 = (const U8*)s2 + l2;
3954     }
3955
3956     /* Must have at least one goal */
3957     assert(g1 || g2);
3958
3959     if (g1) {
3960
3961         /* Will never match if goal is out-of-bounds */
3962         assert(! e1  || e1 >= g1);
3963
3964         /* Here, there isn't an end pointer, or it is beyond the goal.  We
3965         * only go as far as the goal */
3966         e1 = g1;
3967     }
3968     else {
3969         assert(e1);    /* Must have an end for looking at s1 */
3970     }
3971
3972     /* Same for goal for s2 */
3973     if (g2) {
3974         assert(! e2  || e2 >= g2);
3975         e2 = g2;
3976     }
3977     else {
3978         assert(e2);
3979     }
3980
3981     /* If both operands are already folded, we could just do a memEQ on the
3982      * whole strings at once, but it would be better if the caller realized
3983      * this and didn't even call us */
3984
3985     /* Look through both strings, a character at a time */
3986     while (p1 < e1 && p2 < e2) {
3987
3988         /* If at the beginning of a new character in s1, get its fold to use
3989          * and the length of the fold.  (exception: locale rules just get the
3990          * character to a single byte) */
3991         if (n1 == 0) {
3992             if (flags & FOLDEQ_S1_ALREADY_FOLDED) {
3993                 f1 = (U8 *) p1;
3994                 n1 = UTF8SKIP(f1);
3995             }
3996             else {
3997                 /* If in locale matching, we use two sets of rules, depending
3998                  * on if the code point is above or below 255.  Here, we test
3999                  * for and handle locale rules */
4000                 if ((flags & FOLDEQ_LOCALE)
4001                     && (! u1 || ! UTF8_IS_ABOVE_LATIN1(*p1)))
4002                 {
4003                     /* There is no mixing of code points above and below 255. */
4004                     if (u2 && UTF8_IS_ABOVE_LATIN1(*p2)) {
4005                         return 0;
4006                     }
4007
4008                     /* We handle locale rules by converting, if necessary, the
4009                      * code point to a single byte. */
4010                     if (! u1 || UTF8_IS_INVARIANT(*p1)) {
4011                         *foldbuf1 = *p1;
4012                     }
4013                     else {
4014                         *foldbuf1 = TWO_BYTE_UTF8_TO_NATIVE(*p1, *(p1 + 1));
4015                     }
4016                     n1 = 1;
4017                 }
4018                 else if (isASCII(*p1)) {    /* Note, that here won't be both
4019                                                ASCII and using locale rules */
4020
4021                     /* If trying to mix non- with ASCII, and not supposed to,
4022                      * fail */
4023                     if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) {
4024                         return 0;
4025                     }
4026                     n1 = 1;
4027                     *foldbuf1 = toFOLD(*p1);
4028                 }
4029                 else if (u1) {
4030                     to_utf8_fold(p1, foldbuf1, &n1);
4031                 }
4032                 else {  /* Not utf8, get utf8 fold */
4033                     to_uni_fold(*p1, foldbuf1, &n1);
4034                 }
4035                 f1 = foldbuf1;
4036             }
4037         }
4038
4039         if (n2 == 0) {    /* Same for s2 */
4040             if (flags & FOLDEQ_S2_ALREADY_FOLDED) {
4041                 f2 = (U8 *) p2;
4042                 n2 = UTF8SKIP(f2);
4043             }
4044             else {
4045                 if ((flags & FOLDEQ_LOCALE)
4046                     && (! u2 || ! UTF8_IS_ABOVE_LATIN1(*p2)))
4047                 {
4048                     /* Here, the next char in s2 is < 256.  We've already
4049                      * worked on s1, and if it isn't also < 256, can't match */
4050                     if (u1 && UTF8_IS_ABOVE_LATIN1(*p1)) {
4051                         return 0;
4052                     }
4053                     if (! u2 || UTF8_IS_INVARIANT(*p2)) {
4054                         *foldbuf2 = *p2;
4055                     }
4056                     else {
4057                         *foldbuf2 = TWO_BYTE_UTF8_TO_NATIVE(*p2, *(p2 + 1));
4058                     }
4059
4060                     /* Use another function to handle locale rules.  We've made
4061                      * sure that both characters to compare are single bytes */
4062                     if (! foldEQ_locale((char *) f1, (char *) foldbuf2, 1)) {
4063                         return 0;
4064                     }
4065                     n1 = n2 = 0;
4066                 }
4067                 else if (isASCII(*p2)) {
4068                     if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p1)) {
4069                         return 0;
4070                     }
4071                     n2 = 1;
4072                     *foldbuf2 = toFOLD(*p2);
4073                 }
4074                 else if (u2) {
4075                     to_utf8_fold(p2, foldbuf2, &n2);
4076                 }
4077                 else {
4078                     to_uni_fold(*p2, foldbuf2, &n2);
4079                 }
4080                 f2 = foldbuf2;
4081             }
4082         }
4083
4084         /* Here f1 and f2 point to the beginning of the strings to compare.
4085          * These strings are the folds of the next character from each input
4086          * string, stored in utf8. */
4087
4088         /* While there is more to look for in both folds, see if they
4089         * continue to match */
4090         while (n1 && n2) {
4091             U8 fold_length = UTF8SKIP(f1);
4092             if (fold_length != UTF8SKIP(f2)
4093                 || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE
4094                                                        function call for single
4095                                                        byte */
4096                 || memNE((char*)f1, (char*)f2, fold_length))
4097             {
4098                 return 0; /* mismatch */
4099             }
4100
4101             /* Here, they matched, advance past them */
4102             n1 -= fold_length;
4103             f1 += fold_length;
4104             n2 -= fold_length;
4105             f2 += fold_length;
4106         }
4107
4108         /* When reach the end of any fold, advance the input past it */
4109         if (n1 == 0) {
4110             p1 += u1 ? UTF8SKIP(p1) : 1;
4111         }
4112         if (n2 == 0) {
4113             p2 += u2 ? UTF8SKIP(p2) : 1;
4114         }
4115     } /* End of loop through both strings */
4116
4117     /* A match is defined by each scan that specified an explicit length
4118     * reaching its final goal, and the other not having matched a partial
4119     * character (which can happen when the fold of a character is more than one
4120     * character). */
4121     if (! ((g1 == 0 || p1 == g1) && (g2 == 0 || p2 == g2)) || n1 || n2) {
4122         return 0;
4123     }
4124
4125     /* Successful match.  Set output pointers */
4126     if (pe1) {
4127         *pe1 = (char*)p1;
4128     }
4129     if (pe2) {
4130         *pe2 = (char*)p2;
4131     }
4132     return 1;
4133 }
4134
4135 /* XXX The next two functions should likely be moved to mathoms.c once all
4136  * occurrences of them are removed from the core; some cpan-upstream modules
4137  * still use them */
4138
4139 U8 *
4140 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
4141 {
4142     PERL_ARGS_ASSERT_UVUNI_TO_UTF8;
4143
4144     return Perl_uvoffuni_to_utf8_flags(aTHX_ d, uv, 0);
4145 }
4146
4147 /*
4148 =for apidoc utf8n_to_uvuni
4149
4150 Instead use L</utf8_to_uvchr_buf>, or rarely, L</utf8n_to_uvchr>.
4151
4152 This function was useful for code that wanted to handle both EBCDIC and
4153 ASCII platforms with Unicode properties, but starting in Perl v5.20, the
4154 distinctions between the platforms have mostly been made invisible to most
4155 code, so this function is quite unlikely to be what you want.  If you do need
4156 this precise functionality, use instead
4157 C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>>
4158 or C<L<NATIVE_TO_UNI(utf8n_to_uvchr(...))|/utf8n_to_uvchr>>.
4159
4160 =cut
4161 */
4162
4163 UV
4164 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
4165 {
4166     PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
4167
4168     return NATIVE_TO_UNI(utf8n_to_uvchr(s, curlen, retlen, flags));
4169 }
4170
4171 /*
4172 =for apidoc uvuni_to_utf8_flags
4173
4174 Instead you almost certainly want to use L</uvchr_to_utf8> or
4175 L</uvchr_to_utf8_flags>>.
4176
4177 This function is a deprecated synonym for L</uvoffuni_to_utf8_flags>,
4178 which itself, while not deprecated, should be used only in isolated
4179 circumstances.  These functions were useful for code that wanted to handle
4180 both EBCDIC and ASCII platforms with Unicode properties, but starting in Perl
4181 v5.20, the distinctions between the platforms have mostly been made invisible
4182 to most code, so this function is quite unlikely to be what you want.
4183
4184 =cut
4185 */
4186
4187 U8 *
4188 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
4189 {
4190     PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
4191
4192     return uvoffuni_to_utf8_flags(d, uv, flags);
4193 }
4194
4195 /*
4196  * Local variables:
4197  * c-indentation-style: bsd
4198  * c-basic-offset: 4
4199  * indent-tabs-mode: nil
4200  * End:
4201  *
4202  * ex: set ts=8 sts=4 sw=4 et:
4203  */