utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
   4  *    by Larry Wall and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  * 'What a fix!' said Sam.  'That's the one place in all the lands we've ever
  13  *  heard of that we don't want to see any closer; and that's the one place
  14  *  we're trying to get to!  And that's just where we can't get, nohow.'
  15  *
  16  *     [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
  17  *
  18  * 'Well do I understand your speech,' he answered in the same language;
  19  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  20  *  as is the custom in the West, if you wish to be answered?'
  21  *                           --Gandalf, addressing Théoden's door wardens
  22  *
  23  *     [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
  24  *
  25  * ...the travellers perceived that the floor was paved with stones of many
  26  * hues; branching runes and strange devices intertwined beneath their feet.
  27  *
  28  *     [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
  29  */
  30
  31 #include "EXTERN.h"
  32 #define PERL_IN_UTF8_C
  33 #include "perl.h"
  34
  35 #ifndef EBCDIC
  36 /* Separate prototypes needed because in ASCII systems these are
  37  * usually macros but they still are compiled as code, too. */
  38 PERL_CALLCONV UV        Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
  39 PERL_CALLCONV U8*       Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
  40 #endif
  41
  42 static const char unees[] =
  43     "Malformed UTF-8 character (unexpected end of string)";
  44
  45 /*
  46 =head1 Unicode Support
  47
  48 This file contains various utility functions for manipulating UTF8-encoded
  49 strings. For the uninitiated, this is a method of representing arbitrary
  50 Unicode characters as a variable number of bytes, in such a way that
  51 characters in the ASCII range are unmodified, and a zero byte never appears
  52 within non-zero characters.
  53
  54 =cut
  55 */
  56
  57 /*
  58 =for apidoc is_ascii_string
  59
  60 Returns true if the first C<len> bytes of the string C<s> are the same whether
  61 or not the string is encoded in UTF-8 (or UTF-EBCDIC on EBCDIC machines).  That
  62 is, if they are invariant.  On ASCII-ish machines, only ASCII characters
  63 fit this definition, hence the function's name.
  64
  65 If C<len> is 0, it will be calculated using C<strlen(s)>.
  66
  67 See also L</is_utf8_string>(), L</is_utf8_string_loclen>(), and L</is_utf8_string_loc>().
  68
  69 =cut
  70 */
  71
  72 bool
  73 Perl_is_ascii_string(const U8 *s, STRLEN len)
  74 {
  75     const U8* const send = s + (len ? len : strlen((const char *)s));
  76     const U8* x = s;
  77
  78     PERL_ARGS_ASSERT_IS_ASCII_STRING;
  79
  80     for (; x < send; ++x) {
  81         if (!UTF8_IS_INVARIANT(*x))
  82             break;
  83     }
  84
  85     return x == send;
  86 }
  87
  88 /*
  89 =for apidoc uvuni_to_utf8_flags
  90
  91 Adds the UTF-8 representation of the code point C<uv> to the end
  92 of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
  93 bytes available. The return value is the pointer to the byte after the
  94 end of the new character. In other words,
  95
  96     d = uvuni_to_utf8_flags(d, uv, flags);
  97
  98 or, in most cases,
  99
 100     d = uvuni_to_utf8(d, uv);
 101
 102 (which is equivalent to)
 103
 104     d = uvuni_to_utf8_flags(d, uv, 0);
 105
 106 This is the recommended Unicode-aware way of saying
 107
 108     *(d++) = uv;
 109
 110 This function will convert to UTF-8 (and not warn) even code points that aren't
 111 legal Unicode or are problematic, unless C<flags> contains one or more of the
 112 following flags:
 113
 114 If C<uv> is a Unicode surrogate code point and UNICODE_WARN_SURROGATE is set,
 115 the function will raise a warning, provided UTF8 warnings are enabled.  If instead
 116 UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL.
 117 If both flags are set, the function will both warn and return NULL.
 118
 119 The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags correspondingly
 120 affect how the function handles a Unicode non-character.  And, likewise for the
 121 UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags, and code points that are
 122 above the Unicode maximum of 0x10FFFF.  Code points above 0x7FFF_FFFF (which are
 123 even less portable) can be warned and/or disallowed even if other above-Unicode
 124 code points are accepted by the UNICODE_WARN_FE_FF and UNICODE_DISALLOW_FE_FF
 125 flags.
 126
 127 And finally, the flag UNICODE_WARN_ILLEGAL_INTERCHANGE selects all four of the
 128 above WARN flags; and UNICODE_DISALLOW_ILLEGAL_INTERCHANGE selects all four
 129 DISALLOW flags.
 130
 131
 132 =cut
 133 */
 134
 135 U8 *
 136 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
 137 {
 138     PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
 139
 140     if (ckWARN4_d(WARN_UTF8, WARN_SURROGATE, WARN_NON_UNICODE, WARN_NONCHAR)) {
 141         if (UNICODE_IS_SURROGATE(uv)) {
 142             if (flags & UNICODE_WARN_SURROGATE) {
 143                 Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE),
 144                                             "UTF-16 surrogate U+%04"UVXf, uv);
 145             }
 146             if (flags & UNICODE_DISALLOW_SURROGATE) {
 147                 return NULL;
 148             }
 149         }
 150         else if (UNICODE_IS_SUPER(uv)) {
 151             if (flags & UNICODE_WARN_SUPER
 152                 || (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_WARN_FE_FF)))
 153             {
 154                 Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
 155                           "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
 156             }
 157             if (flags & UNICODE_DISALLOW_SUPER
 158                 || (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_DISALLOW_FE_FF)))
 159             {
 160                 return NULL;
 161             }
 162         }
 163         else if (UNICODE_IS_NONCHAR(uv)) {
 164             if (flags & UNICODE_WARN_NONCHAR) {
 165                 Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR),
 166                  "Unicode non-character U+%04"UVXf" is illegal for open interchange",
 167                  uv);
 168             }
 169             if (flags & UNICODE_DISALLOW_NONCHAR) {
 170                 return NULL;
 171             }
 172         }
 173     }
 174     if (UNI_IS_INVARIANT(uv)) {
 175         *d++ = (U8)UTF_TO_NATIVE(uv);
 176         return d;
 177     }
 178 #if defined(EBCDIC)
 179     else {
 180         STRLEN len  = UNISKIP(uv);
 181         U8 *p = d+len-1;
 182         while (p > d) {
 183             *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
 184             uv >>= UTF_ACCUMULATION_SHIFT;
 185         }
 186         *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
 187         return d+len;
 188     }
 189 #else /* Non loop style */
 190     if (uv < 0x800) {
 191         *d++ = (U8)(( uv >>  6)         | 0xc0);
 192         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 193         return d;
 194     }
 195     if (uv < 0x10000) {
 196         *d++ = (U8)(( uv >> 12)         | 0xe0);
 197         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 198         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 199         return d;
 200     }
 201     if (uv < 0x200000) {
 202         *d++ = (U8)(( uv >> 18)         | 0xf0);
 203         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 204         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 205         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 206         return d;
 207     }
 208     if (uv < 0x4000000) {
 209         *d++ = (U8)(( uv >> 24)         | 0xf8);
 210         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 211         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 212         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 213         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 214         return d;
 215     }
 216     if (uv < 0x80000000) {
 217         *d++ = (U8)(( uv >> 30)         | 0xfc);
 218         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 219         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 220         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 221         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 222         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 223         return d;
 224     }
 225 #ifdef HAS_QUAD
 226     if (uv < UTF8_QUAD_MAX)
 227 #endif
 228     {
 229         *d++ =                            0xfe; /* Can't match U+FEFF! */
 230         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 231         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 232         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 233         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 234         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 235         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 236         return d;
 237     }
 238 #ifdef HAS_QUAD
 239     {
 240         *d++ =                            0xff;         /* Can't match U+FFFE! */
 241         *d++ =                            0x80;         /* 6 Reserved bits */
 242         *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);        /* 2 Reserved bits */
 243         *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
 244         *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
 245         *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
 246         *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
 247         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 248         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 249         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 250         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 251         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 252         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 253         return d;
 254     }
 255 #endif
 256 #endif /* Loop style */
 257 }
 258
 259 /*
 260
 261 Tests if the first C<len> bytes of string C<s> form a valid UTF-8
 262 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 263 UTF-8 character.  The number of bytes in the UTF-8 character
 264 will be returned if it is valid, otherwise 0.
 265
 266 This is the "slow" version as opposed to the "fast" version which is
 267 the "unrolled" IS_UTF8_CHAR().  E.g. for t/uni/class.t the speed
 268 difference is a factor of 2 to 3.  For lengths (UTF8SKIP(s)) of four
 269 or less you should use the IS_UTF8_CHAR(), for lengths of five or more
 270 you should use the _slow().  In practice this means that the _slow()
 271 will be used very rarely, since the maximum Unicode code point (as of
 272 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes.  Only
 273 the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
 274 five bytes or more.
 275
 276 =cut */
 277 STATIC STRLEN
 278 S_is_utf8_char_slow(const U8 *s, const STRLEN len)
 279 {
 280     U8 u = *s;
 281     STRLEN slen;
 282     UV uv, ouv;
 283
 284     PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
 285
 286     if (UTF8_IS_INVARIANT(u))
 287         return len == 1;
 288
 289     if (!UTF8_IS_START(u))
 290         return 0;
 291
 292     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 293         return 0;
 294
 295     slen = len - 1;
 296     s++;
 297 #ifdef EBCDIC
 298     u = NATIVE_TO_UTF(u);
 299 #endif
 300     u &= UTF_START_MASK(len);
 301     uv  = u;
 302     ouv = uv;
 303     while (slen--) {
 304         if (!UTF8_IS_CONTINUATION(*s))
 305             return 0;
 306         uv = UTF8_ACCUMULATE(uv, *s);
 307         if (uv < ouv)
 308             return 0;
 309         ouv = uv;
 310         s++;
 311     }
 312
 313     if ((STRLEN)UNISKIP(uv) < len)
 314         return 0;
 315
 316     return len;
 317 }
 318
 319 /*
 320 =for apidoc is_utf8_char_buf
 321
 322 Returns the number of bytes that comprise the first UTF-8 encoded character in
 323 buffer C<buf>.  C<buf_end> should point to one position beyond the end of the
 324 buffer.  0 is returned if C<buf> does not point to a complete, valid UTF-8
 325 encoded character.
 326
 327 Note that an INVARIANT character (i.e. ASCII on non-EBCDIC
 328 machines) is a valid UTF-8 character.
 329
 330 =cut */
 331
 332 STRLEN
 333 Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end)
 334 {
 335
 336     STRLEN len;
 337
 338     PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF;
 339
 340     if (buf_end <= buf) {
 341         return 0;
 342     }
 343
 344     len = buf_end - buf;
 345     if (len > UTF8SKIP(buf)) {
 346         len = UTF8SKIP(buf);
 347     }
 348
 349 #ifdef IS_UTF8_CHAR
 350     if (IS_UTF8_CHAR_FAST(len))
 351         return IS_UTF8_CHAR(buf, len) ? len : 0;
 352 #endif /* #ifdef IS_UTF8_CHAR */
 353     return is_utf8_char_slow(buf, len);
 354 }
 355
 356 /*
 357 =for apidoc is_utf8_char
 358
 359 DEPRECATED!
 360
 361 Tests if some arbitrary number of bytes begins in a valid UTF-8
 362 character.  Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
 363 character is a valid UTF-8 character.  The actual number of bytes in the UTF-8
 364 character will be returned if it is valid, otherwise 0.
 365
 366 This function is deprecated due to the possibility that malformed input could
 367 cause reading beyond the end of the input buffer.  Use L</is_utf8_char_buf>
 368 instead.
 369
 370 =cut */
 371
 372 STRLEN
 373 Perl_is_utf8_char(const U8 *s)
 374 {
 375     PERL_ARGS_ASSERT_IS_UTF8_CHAR;
 376
 377     /* Assumes we have enough space, which is why this is deprecated */
 378     return is_utf8_char_buf(s, s + UTF8SKIP(s));
 379 }
 380
 381
 382 /*
 383 =for apidoc is_utf8_string
 384
 385 Returns true if the first C<len> bytes of string C<s> form a valid
 386 UTF-8 string, false otherwise.  If C<len> is 0, it will be calculated
 387 using C<strlen(s)> (which means if you use this option, that C<s> has to have a
 388 terminating NUL byte).  Note that all characters being ASCII constitute 'a
 389 valid UTF-8 string'.
 390
 391 See also L</is_ascii_string>(), L</is_utf8_string_loclen>(), and L</is_utf8_string_loc>().
 392
 393 =cut
 394 */
 395
 396 bool
 397 Perl_is_utf8_string(const U8 *s, STRLEN len)
 398 {
 399     const U8* const send = s + (len ? len : strlen((const char *)s));
 400     const U8* x = s;
 401
 402     PERL_ARGS_ASSERT_IS_UTF8_STRING;
 403
 404     while (x < send) {
 405          /* Inline the easy bits of is_utf8_char() here for speed... */
 406          if (UTF8_IS_INVARIANT(*x)) {
 407             x++;
 408          }
 409          else if (!UTF8_IS_START(*x))
 410              return FALSE;
 411          else {
 412               /* ... and call is_utf8_char() only if really needed. */
 413              const STRLEN c = UTF8SKIP(x);
 414              const U8* const next_char_ptr = x + c;
 415
 416              if (next_char_ptr > send) {
 417                  return FALSE;
 418              }
 419
 420              if (IS_UTF8_CHAR_FAST(c)) {
 421                  if (!IS_UTF8_CHAR(x, c))
 422                      return FALSE;
 423              }
 424              else if (! is_utf8_char_slow(x, c)) {
 425                  return FALSE;
 426              }
 427              x = next_char_ptr;
 428          }
 429     }
 430
 431     return TRUE;
 432 }
 433
 434 /*
 435 Implemented as a macro in utf8.h
 436
 437 =for apidoc is_utf8_string_loc
 438
 439 Like L</is_utf8_string> but stores the location of the failure (in the
 440 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
 441 "utf8ness success") in the C<ep>.
 442
 443 See also L</is_utf8_string_loclen>() and L</is_utf8_string>().
 444
 445 =for apidoc is_utf8_string_loclen
 446
 447 Like L</is_utf8_string>() but stores the location of the failure (in the
 448 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
 449 "utf8ness success") in the C<ep>, and the number of UTF-8
 450 encoded characters in the C<el>.
 451
 452 See also L</is_utf8_string_loc>() and L</is_utf8_string>().
 453
 454 =cut
 455 */
 456
 457 bool
 458 Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
 459 {
 460     const U8* const send = s + (len ? len : strlen((const char *)s));
 461     const U8* x = s;
 462     STRLEN c;
 463     STRLEN outlen = 0;
 464
 465     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
 466
 467     while (x < send) {
 468          const U8* next_char_ptr;
 469
 470          /* Inline the easy bits of is_utf8_char() here for speed... */
 471          if (UTF8_IS_INVARIANT(*x))
 472              next_char_ptr = x + 1;
 473          else if (!UTF8_IS_START(*x))
 474              goto out;
 475          else {
 476              /* ... and call is_utf8_char() only if really needed. */
 477              c = UTF8SKIP(x);
 478              next_char_ptr = c + x;
 479              if (next_char_ptr > send) {
 480                  goto out;
 481              }
 482              if (IS_UTF8_CHAR_FAST(c)) {
 483                  if (!IS_UTF8_CHAR(x, c))
 484                      c = 0;
 485              } else
 486                  c = is_utf8_char_slow(x, c);
 487              if (!c)
 488                  goto out;
 489          }
 490          x = next_char_ptr;
 491          outlen++;
 492     }
 493
 494  out:
 495     if (el)
 496         *el = outlen;
 497
 498     if (ep)
 499         *ep = x;
 500     return (x == send);
 501 }
 502
 503 /*
 504
 505 =for apidoc utf8n_to_uvuni
 506
 507 Bottom level UTF-8 decode routine.
 508 Returns the code point value of the first character in the string C<s>
 509 which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding and no longer than
 510 C<curlen> bytes; C<retlen> will be set to the length, in bytes, of that
 511 character.
 512
 513 The value of C<flags> determines the behavior when C<s> does not point to a
 514 well-formed UTF-8 character.  If C<flags> is 0, when a malformation is found,
 515 C<retlen> is set to the expected length of the UTF-8 character in bytes, zero
 516 is returned, and if UTF-8 warnings haven't been lexically disabled, a warning
 517 is raised.
 518
 519 Various ALLOW flags can be set in C<flags> to allow (and not warn on)
 520 individual types of malformations, such as the sequence being overlong (that
 521 is, when there is a shorter sequence that can express the same code point;
 522 overlong sequences are expressly forbidden in the UTF-8 standard due to
 523 potential security issues).  Another malformation example is the first byte of
 524 a character not being a legal first byte.  See F<utf8.h> for the list of such
 525 flags.  Of course, the value returned by this function under such conditions is
 526 not reliable.
 527
 528 The UTF8_CHECK_ONLY flag overrides the behavior when a non-allowed (by other
 529 flags) malformation is found.  If this flag is set, the routine assumes that
 530 the caller will raise a warning, and this function will silently just set
 531 C<retlen> to C<-1> and return zero.
 532
 533 Certain code points are considered problematic.  These are Unicode surrogates,
 534 Unicode non-characters, and code points above the Unicode maximum of 0x10FFF.
 535 By default these are considered regular code points, but certain situations
 536 warrant special handling for them.  If C<flags> contains
 537 UTF8_DISALLOW_ILLEGAL_INTERCHANGE, all three classes are treated as
 538 malformations and handled as such.  The flags UTF8_DISALLOW_SURROGATE,
 539 UTF8_DISALLOW_NONCHAR, and UTF8_DISALLOW_SUPER (meaning above the legal Unicode
 540 maximum) can be set to disallow these categories individually.
 541
 542 The flags UTF8_WARN_ILLEGAL_INTERCHANGE, UTF8_WARN_SURROGATE,
 543 UTF8_WARN_NONCHAR, and UTF8_WARN_SUPER will cause warning messages to be raised
 544 for their respective categories, but otherwise the code points are considered
 545 valid (not malformations).  To get a category to both be treated as a
 546 malformation and raise a warning, specify both the WARN and DISALLOW flags.
 547 (But note that warnings are not raised if lexically disabled nor if
 548 UTF8_CHECK_ONLY is also specified.)
 549
 550 Very large code points (above 0x7FFF_FFFF) are considered more problematic than
 551 the others that are above the Unicode legal maximum.  There are several
 552 reasons: they do not fit into a 32-bit word, are not representable on EBCDIC
 553 platforms, and the original UTF-8 specification never went above
 554 this number (the current 0x10FFF limit was imposed later).  The UTF-8 encoding
 555 on ASCII platforms for these large code points begins with a byte containing
 556 0xFE or 0xFF.  The UTF8_DISALLOW_FE_FF flag will cause them to be treated as
 557 malformations, while allowing smaller above-Unicode code points.  (Of course
 558 UTF8_DISALLOW_SUPER will treat all above-Unicode code points, including these,
 559 as malformations.) Similarly, UTF8_WARN_FE_FF acts just like the other WARN
 560 flags, but applies just to these code points.
 561
 562 All other code points corresponding to Unicode characters, including private
 563 use and those yet to be assigned, are never considered malformed and never
 564 warn.
 565
 566 Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
 567
 568 =cut
 569 */
 570
 571 UV
 572 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 573 {
 574     dVAR;
 575     const U8 * const s0 = s;
 576     UV uv = *s, ouv = 0;
 577     STRLEN len = 1;
 578     bool dowarn = ckWARN_d(WARN_UTF8);
 579     const UV startbyte = *s;
 580     STRLEN expectlen = 0;
 581     U32 warning = 0;
 582     SV* sv = NULL;
 583
 584     PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
 585
 586 /* This list is a superset of the UTF8_ALLOW_XXX. */
 587
 588 #define UTF8_WARN_EMPTY                          1
 589 #define UTF8_WARN_CONTINUATION                   2
 590 #define UTF8_WARN_NON_CONTINUATION               3
 591 #define UTF8_WARN_SHORT                          4
 592 #define UTF8_WARN_OVERFLOW                       5
 593 #define UTF8_WARN_LONG                           6
 594
 595     if (curlen == 0 &&
 596         !(flags & UTF8_ALLOW_EMPTY)) {
 597         warning = UTF8_WARN_EMPTY;
 598         goto malformed;
 599     }
 600
 601     if (UTF8_IS_INVARIANT(uv)) {
 602         if (retlen)
 603             *retlen = 1;
 604         return (UV) (NATIVE_TO_UTF(*s));
 605     }
 606
 607     if (UTF8_IS_CONTINUATION(uv) &&
 608         !(flags & UTF8_ALLOW_CONTINUATION)) {
 609         warning = UTF8_WARN_CONTINUATION;
 610         goto malformed;
 611     }
 612
 613     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 614         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 615         warning = UTF8_WARN_NON_CONTINUATION;
 616         goto malformed;
 617     }
 618
 619 #ifdef EBCDIC
 620     uv = NATIVE_TO_UTF(uv);
 621 #else
 622     if (uv == 0xfe || uv == 0xff) {
 623         if (flags & (UTF8_WARN_SUPER|UTF8_WARN_FE_FF)) {
 624             sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point beginning with byte 0x%02"UVXf" is not Unicode, and not portable", uv));
 625             flags &= ~UTF8_WARN_SUPER;  /* Only warn once on this problem */
 626         }
 627         if (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_FE_FF)) {
 628             goto malformed;
 629         }
 630     }
 631 #endif
 632
 633     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 634     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 635     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 636     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 637 #ifdef EBCDIC
 638     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 639     else                        { len =  7; uv &= 0x01; }
 640 #else
 641     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 642     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 643     else                        { len = 13; uv = 0; } /* whoa! */
 644 #endif
 645
 646     if (retlen)
 647         *retlen = len;
 648
 649     expectlen = len;
 650
 651     if ((curlen < expectlen) &&
 652         !(flags & UTF8_ALLOW_SHORT)) {
 653         warning = UTF8_WARN_SHORT;
 654         goto malformed;
 655     }
 656
 657     len--;
 658     s++;
 659     ouv = uv;   /* ouv is the value from the previous iteration */
 660
 661     while (len--) {
 662         if (!UTF8_IS_CONTINUATION(*s) &&
 663             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 664             s--;
 665             warning = UTF8_WARN_NON_CONTINUATION;
 666             goto malformed;
 667         }
 668         else
 669             uv = UTF8_ACCUMULATE(uv, *s);
 670         if (!(uv > ouv)) {  /* If the value didn't grow from the previous
 671                                iteration, something is horribly wrong */
 672             /* These cannot be allowed. */
 673             if (uv == ouv) {
 674                 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
 675                     warning = UTF8_WARN_LONG;
 676                     goto malformed;
 677                 }
 678             }
 679             else { /* uv < ouv */
 680                 /* This cannot be allowed. */
 681                 warning = UTF8_WARN_OVERFLOW;
 682                 goto malformed;
 683             }
 684         }
 685         s++;
 686         ouv = uv;
 687     }
 688
 689     if ((expectlen > (STRLEN)UNISKIP(uv)) && !(flags & UTF8_ALLOW_LONG)) {
 690         warning = UTF8_WARN_LONG;
 691         goto malformed;
 692     } else if (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE|UTF8_WARN_ILLEGAL_INTERCHANGE)) {
 693         if (UNICODE_IS_SURROGATE(uv)) {
 694             if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE) {
 695                 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv));
 696             }
 697             if (flags & UTF8_DISALLOW_SURROGATE) {
 698                 goto disallowed;
 699             }
 700         }
 701         else if (UNICODE_IS_NONCHAR(uv)) {
 702             if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR ) {
 703                 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv));
 704             }
 705             if (flags & UTF8_DISALLOW_NONCHAR) {
 706                 goto disallowed;
 707             }
 708         }
 709         else if ((uv > PERL_UNICODE_MAX)) {
 710             if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER) {
 711                 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv));
 712             }
 713             if (flags & UTF8_DISALLOW_SUPER) {
 714                 goto disallowed;
 715             }
 716         }
 717
 718         /* Here, this is not considered a malformed character, so drop through
 719          * to return it */
 720     }
 721
 722     return uv;
 723
 724 disallowed: /* Is disallowed, but otherwise not malformed.  'sv' will have been
 725                set if there is to be a warning. */
 726     if (!sv) {
 727         dowarn = 0;
 728     }
 729
 730 malformed:
 731
 732     if (flags & UTF8_CHECK_ONLY) {
 733         if (retlen)
 734             *retlen = ((STRLEN) -1);
 735         return 0;
 736     }
 737
 738     if (dowarn) {
 739         if (! sv) {
 740             sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);
 741         }
 742
 743         switch (warning) {
 744             case 0: /* Intentionally empty. */ break;
 745             case UTF8_WARN_EMPTY:
 746                 sv_catpvs(sv, "(empty string)");
 747                 break;
 748             case UTF8_WARN_CONTINUATION:
 749                 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 750                 break;
 751             case UTF8_WARN_NON_CONTINUATION:
 752                 if (s == s0)
 753                     Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 754                                 (UV)s[1], startbyte);
 755                 else {
 756                     const int len = (int)(s-s0);
 757                     Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 758                                 (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
 759                 }
 760
 761                 break;
 762             case UTF8_WARN_SHORT:
 763                 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 764                                 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
 765                 expectlen = curlen;             /* distance for caller to skip */
 766                 break;
 767             case UTF8_WARN_OVERFLOW:
 768                 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 769                                 ouv, *s, startbyte);
 770                 break;
 771             case UTF8_WARN_LONG:
 772                 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 773                                 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 774                 break;
 775             default:
 776                 sv_catpvs(sv, "(unknown reason)");
 777                 break;
 778         }
 779
 780         if (sv) {
 781             const char * const s = SvPVX_const(sv);
 782
 783             if (PL_op)
 784                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 785                             "%s in %s", s,  OP_DESC(PL_op));
 786             else
 787                 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
 788         }
 789     }
 790
 791     if (retlen)
 792         *retlen = expectlen ? expectlen : len;
 793
 794     return 0;
 795 }
 796
 797 /*
 798 =for apidoc utf8_to_uvchr_buf
 799
 800 Returns the native code point of the first character in the string C<s> which
 801 is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
 802 C<retlen> will be set to the length, in bytes, of that character.
 803
 804 If C<s> does not point to a well-formed UTF-8 character, zero is
 805 returned and C<retlen> is set, if possible, to -1.
 806
 807 =cut
 808 */
 809
 810
 811 UV
 812 Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
 813 {
 814     PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
 815
 816     assert(s < send);
 817
 818     return utf8n_to_uvchr(s, send - s, retlen,
 819                           ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 820 }
 821
 822 /* Like L</utf8_to_uvchr_buf>(), but should only be called when it is known that
 823  * there are no malformations in the input UTF-8 string C<s>.  Currently, some
 824  * malformations are checked for, but this checking likely will be removed in
 825  * the future */
 826
 827 UV
 828 Perl_valid_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
 829 {
 830     PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
 831
 832     return utf8_to_uvchr_buf(s, s + UTF8_MAXBYTES, retlen);
 833 }
 834
 835 /*
 836 =for apidoc utf8_to_uvchr
 837
 838 DEPRECATED!
 839
 840 Returns the native code point of the first character in the string C<s>
 841 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 842 length, in bytes, of that character.
 843
 844 Some, but not all, UTF-8 malformations are detected, and in fact, some
 845 malformed input could cause reading beyond the end of the input buffer, which
 846 is why this function is deprecated.  Use L</utf8_to_uvchr_buf> instead.
 847
 848 If C<s> points to one of the detected malformations, zero is
 849 returned and C<retlen> is set, if possible, to -1.
 850
 851 =cut
 852 */
 853
 854 UV
 855 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
 856 {
 857     PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
 858
 859     return valid_utf8_to_uvchr(s, retlen);
 860 }
 861
 862 /*
 863 =for apidoc utf8_to_uvuni_buf
 864
 865 Returns the Unicode code point of the first character in the string C<s> which
 866 is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
 867 C<retlen> will be set to the length, in bytes, of that character.
 868
 869 This function should only be used when the returned UV is considered
 870 an index into the Unicode semantic tables (e.g. swashes).
 871
 872 If C<s> does not point to a well-formed UTF-8 character, zero is
 873 returned and C<retlen> is set, if possible, to -1.
 874
 875 =cut
 876 */
 877
 878 UV
 879 Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
 880 {
 881     PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
 882
 883     assert(send > s);
 884
 885     /* Call the low level routine asking for checks */
 886     return Perl_utf8n_to_uvuni(aTHX_ s, send -s, retlen,
 887                                ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 888 }
 889
 890 /* Like L</utf8_to_uvuni_buf>(), but should only be called when it is known that
 891  * there are no malformations in the input UTF-8 string C<s>.  Currently, some
 892  * malformations are checked for, but this checking likely will be removed in
 893  * the future */
 894
 895 UV
 896 Perl_valid_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
 897 {
 898     PERL_ARGS_ASSERT_VALID_UTF8_TO_UVUNI;
 899
 900     return utf8_to_uvuni_buf(s, s + UTF8_MAXBYTES, retlen);
 901 }
 902
 903 /*
 904 =for apidoc utf8_to_uvuni
 905
 906 DEPRECATED!
 907
 908 Returns the Unicode code point of the first character in the string C<s>
 909 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 910 length, in bytes, of that character.
 911
 912 This function should only be used when the returned UV is considered
 913 an index into the Unicode semantic tables (e.g. swashes).
 914
 915 Some, but not all, UTF-8 malformations are detected, and in fact, some
 916 malformed input could cause reading beyond the end of the input buffer, which
 917 is why this function is deprecated.  Use L</utf8_to_uvuni_buf> instead.
 918
 919 If C<s> points to one of the detected malformations, zero is
 920 returned and C<retlen> is set, if possible, to -1.
 921
 922 =cut
 923 */
 924
 925 UV
 926 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
 927 {
 928     PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
 929
 930     return valid_utf8_to_uvuni(s, retlen);
 931 }
 932
 933 /*
 934 =for apidoc utf8_length
 935
 936 Return the length of the UTF-8 char encoded string C<s> in characters.
 937 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 938 up past C<e>, croaks.
 939
 940 =cut
 941 */
 942
 943 STRLEN
 944 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
 945 {
 946     dVAR;
 947     STRLEN len = 0;
 948
 949     PERL_ARGS_ASSERT_UTF8_LENGTH;
 950
 951     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 952      * the bitops (especially ~) can create illegal UTF-8.
 953      * In other words: in Perl UTF-8 is not just for Unicode. */
 954
 955     if (e < s)
 956         goto warn_and_return;
 957     while (s < e) {
 958         if (!UTF8_IS_INVARIANT(*s))
 959             s += UTF8SKIP(s);
 960         else
 961             s++;
 962         len++;
 963     }
 964
 965     if (e != s) {
 966         len--;
 967         warn_and_return:
 968         if (PL_op)
 969             Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
 970                              "%s in %s", unees, OP_DESC(PL_op));
 971         else
 972             Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
 973     }
 974
 975     return len;
 976 }
 977
 978 /*
 979 =for apidoc utf8_distance
 980
 981 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
 982 and C<b>.
 983
 984 WARNING: use only if you *know* that the pointers point inside the
 985 same UTF-8 buffer.
 986
 987 =cut
 988 */
 989
 990 IV
 991 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
 992 {
 993     PERL_ARGS_ASSERT_UTF8_DISTANCE;
 994
 995     return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
 996 }
 997
 998 /*
 999 =for apidoc utf8_hop
1000
1001 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
1002 forward or backward.
1003
1004 WARNING: do not use the following unless you *know* C<off> is within
1005 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
1006 on the first byte of character or just after the last byte of a character.
1007
1008 =cut
1009 */
1010
1011 U8 *
1012 Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
1013 {
1014     PERL_ARGS_ASSERT_UTF8_HOP;
1015
1016     PERL_UNUSED_CONTEXT;
1017     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1018      * the bitops (especially ~) can create illegal UTF-8.
1019      * In other words: in Perl UTF-8 is not just for Unicode. */
1020
1021     if (off >= 0) {
1022         while (off--)
1023             s += UTF8SKIP(s);
1024     }
1025     else {
1026         while (off++) {
1027             s--;
1028             while (UTF8_IS_CONTINUATION(*s))
1029                 s--;
1030         }
1031     }
1032     return (U8 *)s;
1033 }
1034
1035 /*
1036 =for apidoc bytes_cmp_utf8
1037
1038 Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
1039 sequence of characters (stored as UTF-8) in C<u>, C<ulen>. Returns 0 if they are
1040 equal, -1 or -2 if the first string is less than the second string, +1 or +2
1041 if the first string is greater than the second string.
1042
1043 -1 or +1 is returned if the shorter string was identical to the start of the
1044 longer string. -2 or +2 is returned if the was a difference between characters
1045 within the strings.
1046
1047 =cut
1048 */
1049
1050 int
1051 Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
1052 {
1053     const U8 *const bend = b + blen;
1054     const U8 *const uend = u + ulen;
1055
1056     PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
1057
1058     PERL_UNUSED_CONTEXT;
1059
1060     while (b < bend && u < uend) {
1061         U8 c = *u++;
1062         if (!UTF8_IS_INVARIANT(c)) {
1063             if (UTF8_IS_DOWNGRADEABLE_START(c)) {
1064                 if (u < uend) {
1065                     U8 c1 = *u++;
1066                     if (UTF8_IS_CONTINUATION(c1)) {
1067                         c = UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(c, c1));
1068                     } else {
1069                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1070                                          "Malformed UTF-8 character "
1071                                          "(unexpected non-continuation byte 0x%02x"
1072                                          ", immediately after start byte 0x%02x)"
1073                                          /* Dear diag.t, it's in the pod.  */
1074                                          "%s%s", c1, c,
1075                                          PL_op ? " in " : "",
1076                                          PL_op ? OP_DESC(PL_op) : "");
1077                         return -2;
1078                     }
1079                 } else {
1080                     if (PL_op)
1081                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1082                                          "%s in %s", unees, OP_DESC(PL_op));
1083                     else
1084                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
1085                     return -2; /* Really want to return undef :-)  */
1086                 }
1087             } else {
1088                 return -2;
1089             }
1090         }
1091         if (*b != c) {
1092             return *b < c ? -2 : +2;
1093         }
1094         ++b;
1095     }
1096
1097     if (b == bend && u == uend)
1098         return 0;
1099
1100     return b < bend ? +1 : -1;
1101 }
1102
1103 /*
1104 =for apidoc utf8_to_bytes
1105
1106 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
1107 Unlike L</bytes_to_utf8>, this over-writes the original string, and
1108 updates C<len> to contain the new length.
1109 Returns zero on failure, setting C<len> to -1.
1110
1111 If you need a copy of the string, see L</bytes_from_utf8>.
1112
1113 =cut
1114 */
1115
1116 U8 *
1117 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
1118 {
1119     U8 * const save = s;
1120     U8 * const send = s + *len;
1121     U8 *d;
1122
1123     PERL_ARGS_ASSERT_UTF8_TO_BYTES;
1124
1125     /* ensure valid UTF-8 and chars < 256 before updating string */
1126     while (s < send) {
1127         U8 c = *s++;
1128
1129         if (!UTF8_IS_INVARIANT(c) &&
1130             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
1131              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
1132             *len = ((STRLEN) -1);
1133             return 0;
1134         }
1135     }
1136
1137     d = s = save;
1138     while (s < send) {
1139         STRLEN ulen;
1140         *d++ = (U8)utf8_to_uvchr_buf(s, send, &ulen);
1141         s += ulen;
1142     }
1143     *d = '\0';
1144     *len = d - save;
1145     return save;
1146 }
1147
1148 /*
1149 =for apidoc bytes_from_utf8
1150
1151 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
1152 Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, returns a pointer to
1153 the newly-created string, and updates C<len> to contain the new
1154 length.  Returns the original string if no conversion occurs, C<len>
1155 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
1156 0 if C<s> is converted or consisted entirely of characters that are invariant
1157 in utf8 (i.e., US-ASCII on non-EBCDIC machines).
1158
1159 =cut
1160 */
1161
1162 U8 *
1163 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
1164 {
1165     U8 *d;
1166     const U8 *start = s;
1167     const U8 *send;
1168     I32 count = 0;
1169
1170     PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
1171
1172     PERL_UNUSED_CONTEXT;
1173     if (!*is_utf8)
1174         return (U8 *)start;
1175
1176     /* ensure valid UTF-8 and chars < 256 before converting string */
1177     for (send = s + *len; s < send;) {
1178         U8 c = *s++;
1179         if (!UTF8_IS_INVARIANT(c)) {
1180             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
1181                 (c = *s++) && UTF8_IS_CONTINUATION(c))
1182                 count++;
1183             else
1184                 return (U8 *)start;
1185         }
1186     }
1187
1188     *is_utf8 = FALSE;
1189
1190     Newx(d, (*len) - count + 1, U8);
1191     s = start; start = d;
1192     while (s < send) {
1193         U8 c = *s++;
1194         if (!UTF8_IS_INVARIANT(c)) {
1195             /* Then it is two-byte encoded */
1196             c = UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(c, *s++));
1197         }
1198         *d++ = c;
1199     }
1200     *d = '\0';
1201     *len = d - start;
1202     return (U8 *)start;
1203 }
1204
1205 /*
1206 =for apidoc bytes_to_utf8
1207
1208 Converts a string C<s> of length C<len> bytes from the native encoding into
1209 UTF-8.
1210 Returns a pointer to the newly-created string, and sets C<len> to
1211 reflect the new length in bytes.
1212
1213 A NUL character will be written after the end of the string.
1214
1215 If you want to convert to UTF-8 from encodings other than
1216 the native (Latin1 or EBCDIC),
1217 see L</sv_recode_to_utf8>().
1218
1219 =cut
1220 */
1221
1222 /* This logic is duplicated in sv_catpvn_flags, so any bug fixes will
1223    likewise need duplication. */
1224
1225 U8*
1226 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
1227 {
1228     const U8 * const send = s + (*len);
1229     U8 *d;
1230     U8 *dst;
1231
1232     PERL_ARGS_ASSERT_BYTES_TO_UTF8;
1233     PERL_UNUSED_CONTEXT;
1234
1235     Newx(d, (*len) * 2 + 1, U8);
1236     dst = d;
1237
1238     while (s < send) {
1239         const UV uv = NATIVE_TO_ASCII(*s++);
1240         if (UNI_IS_INVARIANT(uv))
1241             *d++ = (U8)UTF_TO_NATIVE(uv);
1242         else {
1243             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
1244             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
1245         }
1246     }
1247     *d = '\0';
1248     *len = d-dst;
1249     return dst;
1250 }
1251
1252 /*
1253  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
1254  *
1255  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
1256  * We optimize for native, for obvious reasons. */
1257
1258 U8*
1259 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
1260 {
1261     U8* pend;
1262     U8* dstart = d;
1263
1264     PERL_ARGS_ASSERT_UTF16_TO_UTF8;
1265
1266     if (bytelen & 1)
1267         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
1268
1269     pend = p + bytelen;
1270
1271     while (p < pend) {
1272         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
1273         p += 2;
1274         if (uv < 0x80) {
1275 #ifdef EBCDIC
1276             *d++ = UNI_TO_NATIVE(uv);
1277 #else
1278             *d++ = (U8)uv;
1279 #endif
1280             continue;
1281         }
1282         if (uv < 0x800) {
1283             *d++ = (U8)(( uv >>  6)         | 0xc0);
1284             *d++ = (U8)(( uv        & 0x3f) | 0x80);
1285             continue;
1286         }
1287         if (uv >= 0xd800 && uv <= 0xdbff) {     /* surrogates */
1288             if (p >= pend) {
1289                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
1290             } else {
1291                 UV low = (p[0] << 8) + p[1];
1292                 p += 2;
1293                 if (low < 0xdc00 || low > 0xdfff)
1294                     Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
1295                 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
1296             }
1297         } else if (uv >= 0xdc00 && uv <= 0xdfff) {
1298             Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
1299         }
1300         if (uv < 0x10000) {
1301             *d++ = (U8)(( uv >> 12)         | 0xe0);
1302             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
1303             *d++ = (U8)(( uv        & 0x3f) | 0x80);
1304             continue;
1305         }
1306         else {
1307             *d++ = (U8)(( uv >> 18)         | 0xf0);
1308             *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
1309             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
1310             *d++ = (U8)(( uv        & 0x3f) | 0x80);
1311             continue;
1312         }
1313     }
1314     *newlen = d - dstart;
1315     return d;
1316 }
1317
1318 /* Note: this one is slightly destructive of the source. */
1319
1320 U8*
1321 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
1322 {
1323     U8* s = (U8*)p;
1324     U8* const send = s + bytelen;
1325
1326     PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
1327
1328     if (bytelen & 1)
1329         Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf,
1330                    (UV)bytelen);
1331
1332     while (s < send) {
1333         const U8 tmp = s[0];
1334         s[0] = s[1];
1335         s[1] = tmp;
1336         s += 2;
1337     }
1338     return utf16_to_utf8(p, d, bytelen, newlen);
1339 }
1340
1341 /* for now these are all defined (inefficiently) in terms of the utf8 versions.
1342  * Note that the macros in handy.h that call these short-circuit calling them
1343  * for Latin-1 range inputs */
1344
1345 bool
1346 Perl_is_uni_alnum(pTHX_ UV c)
1347 {
1348     U8 tmpbuf[UTF8_MAXBYTES+1];
1349     uvchr_to_utf8(tmpbuf, c);
1350     return is_utf8_alnum(tmpbuf);
1351 }
1352
1353 bool
1354 Perl_is_uni_idfirst(pTHX_ UV c)
1355 {
1356     U8 tmpbuf[UTF8_MAXBYTES+1];
1357     uvchr_to_utf8(tmpbuf, c);
1358     return is_utf8_idfirst(tmpbuf);
1359 }
1360
1361 bool
1362 Perl_is_uni_alpha(pTHX_ UV c)
1363 {
1364     U8 tmpbuf[UTF8_MAXBYTES+1];
1365     uvchr_to_utf8(tmpbuf, c);
1366     return is_utf8_alpha(tmpbuf);
1367 }
1368
1369 bool
1370 Perl_is_uni_ascii(pTHX_ UV c)
1371 {
1372     return isASCII(c);
1373 }
1374
1375 bool
1376 Perl_is_uni_space(pTHX_ UV c)
1377 {
1378     U8 tmpbuf[UTF8_MAXBYTES+1];
1379     uvchr_to_utf8(tmpbuf, c);
1380     return is_utf8_space(tmpbuf);
1381 }
1382
1383 bool
1384 Perl_is_uni_digit(pTHX_ UV c)
1385 {
1386     U8 tmpbuf[UTF8_MAXBYTES+1];
1387     uvchr_to_utf8(tmpbuf, c);
1388     return is_utf8_digit(tmpbuf);
1389 }
1390
1391 bool
1392 Perl_is_uni_upper(pTHX_ UV c)
1393 {
1394     U8 tmpbuf[UTF8_MAXBYTES+1];
1395     uvchr_to_utf8(tmpbuf, c);
1396     return is_utf8_upper(tmpbuf);
1397 }
1398
1399 bool
1400 Perl_is_uni_lower(pTHX_ UV c)
1401 {
1402     U8 tmpbuf[UTF8_MAXBYTES+1];
1403     uvchr_to_utf8(tmpbuf, c);
1404     return is_utf8_lower(tmpbuf);
1405 }
1406
1407 bool
1408 Perl_is_uni_cntrl(pTHX_ UV c)
1409 {
1410     return isCNTRL_L1(c);
1411 }
1412
1413 bool
1414 Perl_is_uni_graph(pTHX_ UV c)
1415 {
1416     U8 tmpbuf[UTF8_MAXBYTES+1];
1417     uvchr_to_utf8(tmpbuf, c);
1418     return is_utf8_graph(tmpbuf);
1419 }
1420
1421 bool
1422 Perl_is_uni_print(pTHX_ UV c)
1423 {
1424     U8 tmpbuf[UTF8_MAXBYTES+1];
1425     uvchr_to_utf8(tmpbuf, c);
1426     return is_utf8_print(tmpbuf);
1427 }
1428
1429 bool
1430 Perl_is_uni_punct(pTHX_ UV c)
1431 {
1432     U8 tmpbuf[UTF8_MAXBYTES+1];
1433     uvchr_to_utf8(tmpbuf, c);
1434     return is_utf8_punct(tmpbuf);
1435 }
1436
1437 bool
1438 Perl_is_uni_xdigit(pTHX_ UV c)
1439 {
1440     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1441     uvchr_to_utf8(tmpbuf, c);
1442     return is_utf8_xdigit(tmpbuf);
1443 }
1444
1445 UV
1446 Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s)
1447 {
1448     /* We have the latin1-range values compiled into the core, so just use
1449      * those, converting the result to utf8.  The only difference between upper
1450      * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
1451      * either "SS" or "Ss".  Which one to use is passed into the routine in
1452      * 'S_or_s' to avoid a test */
1453
1454     UV converted = toUPPER_LATIN1_MOD(c);
1455
1456     PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
1457
1458     assert(S_or_s == 'S' || S_or_s == 's');
1459
1460     if (UNI_IS_INVARIANT(converted)) { /* No difference between the two for
1461                                           characters in this range */
1462         *p = (U8) converted;
1463         *lenp = 1;
1464         return converted;
1465     }
1466
1467     /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
1468      * which it maps to one of them, so as to only have to have one check for
1469      * it in the main case */
1470     if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
1471         switch (c) {
1472             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
1473                 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
1474                 break;
1475             case MICRO_SIGN:
1476                 converted = GREEK_CAPITAL_LETTER_MU;
1477                 break;
1478             case LATIN_SMALL_LETTER_SHARP_S:
1479                 *(p)++ = 'S';
1480                 *p = S_or_s;
1481                 *lenp = 2;
1482                 return 'S';
1483             default:
1484                 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
1485                 /* NOTREACHED */
1486         }
1487     }
1488
1489     *(p)++ = UTF8_TWO_BYTE_HI(converted);
1490     *p = UTF8_TWO_BYTE_LO(converted);
1491     *lenp = 2;
1492
1493     return converted;
1494 }
1495
1496 /* Call the function to convert a UTF-8 encoded character to the specified case.
1497  * Note that there may be more than one character in the result.
1498  * INP is a pointer to the first byte of the input character
1499  * OUTP will be set to the first byte of the string of changed characters.  It
1500  *      needs to have space for UTF8_MAXBYTES_CASE+1 bytes
1501  * LENP will be set to the length in bytes of the string of changed characters
1502  *
1503  * The functions return the ordinal of the first character in the string of OUTP */
1504 #define CALL_UPPER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_toupper, "ToUc", "utf8::ToSpecUc")
1505 #define CALL_TITLE_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_totitle, "ToTc", "utf8::ToSpecTc")
1506 #define CALL_LOWER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tolower, "ToLc", "utf8::ToSpecLc")
1507
1508 /* This additionally has the input parameter SPECIALS, which if non-zero will
1509  * cause this to use the SPECIALS hash for folding (meaning get full case
1510  * folding); otherwise, when zero, this implies a simple case fold */
1511 #define CALL_FOLD_CASE(INP, OUTP, LENP, SPECIALS) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tofold, "ToCf", (SPECIALS) ? "utf8::ToSpecCf" : NULL)
1512
1513 UV
1514 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1515 {
1516     dVAR;
1517
1518     /* Convert the Unicode character whose ordinal is <c> to its uppercase
1519      * version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
1520      * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1521      * the changed version may be longer than the original character.
1522      *
1523      * The ordinal of the first character of the changed version is returned
1524      * (but note, as explained above, that there may be more.) */
1525
1526     PERL_ARGS_ASSERT_TO_UNI_UPPER;
1527
1528     if (c < 256) {
1529         return _to_upper_title_latin1((U8) c, p, lenp, 'S');
1530     }
1531
1532     uvchr_to_utf8(p, c);
1533     return CALL_UPPER_CASE(p, p, lenp);
1534 }
1535
1536 UV
1537 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1538 {
1539     dVAR;
1540
1541     PERL_ARGS_ASSERT_TO_UNI_TITLE;
1542
1543     if (c < 256) {
1544         return _to_upper_title_latin1((U8) c, p, lenp, 's');
1545     }
1546
1547     uvchr_to_utf8(p, c);
1548     return CALL_TITLE_CASE(p, p, lenp);
1549 }
1550
1551 STATIC U8
1552 S_to_lower_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp)
1553 {
1554     /* We have the latin1-range values compiled into the core, so just use
1555      * those, converting the result to utf8.  Since the result is always just
1556      * one character, we allow <p> to be NULL */
1557
1558     U8 converted = toLOWER_LATIN1(c);
1559
1560     if (p != NULL) {
1561         if (UNI_IS_INVARIANT(converted)) {
1562             *p = converted;
1563             *lenp = 1;
1564         }
1565         else {
1566             *p = UTF8_TWO_BYTE_HI(converted);
1567             *(p+1) = UTF8_TWO_BYTE_LO(converted);
1568             *lenp = 2;
1569         }
1570     }
1571     return converted;
1572 }
1573
1574 UV
1575 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1576 {
1577     dVAR;
1578
1579     PERL_ARGS_ASSERT_TO_UNI_LOWER;
1580
1581     if (c < 256) {
1582         return to_lower_latin1((U8) c, p, lenp);
1583     }
1584
1585     uvchr_to_utf8(p, c);
1586     return CALL_LOWER_CASE(p, p, lenp);
1587 }
1588
1589 UV
1590 Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const bool flags)
1591 {
1592     /* Corresponds to to_lower_latin1(), <flags> is TRUE if to use full case
1593      * folding */
1594
1595     UV converted;
1596
1597     PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
1598
1599     if (c == MICRO_SIGN) {
1600         converted = GREEK_SMALL_LETTER_MU;
1601     }
1602     else if (flags && c == LATIN_SMALL_LETTER_SHARP_S) {
1603         *(p)++ = 's';
1604         *p = 's';
1605         *lenp = 2;
1606         return 's';
1607     }
1608     else { /* In this range the fold of all other characters is their lower
1609               case */
1610         converted = toLOWER_LATIN1(c);
1611     }
1612
1613     if (UNI_IS_INVARIANT(converted)) {
1614         *p = (U8) converted;
1615         *lenp = 1;
1616     }
1617     else {
1618         *(p)++ = UTF8_TWO_BYTE_HI(converted);
1619         *p = UTF8_TWO_BYTE_LO(converted);
1620         *lenp = 2;
1621     }
1622
1623     return converted;
1624 }
1625
1626 UV
1627 Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, const bool flags)
1628 {
1629
1630     /* Not currently externally documented, and subject to change, <flags> is
1631      * TRUE iff full folding is to be used */
1632
1633     PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
1634
1635     if (c < 256) {
1636         return _to_fold_latin1((U8) c, p, lenp, flags);
1637     }
1638
1639     uvchr_to_utf8(p, c);
1640     return CALL_FOLD_CASE(p, p, lenp, flags);
1641 }
1642
1643 /* for now these all assume no locale info available for Unicode > 255; and
1644  * the corresponding macros in handy.h (like isALNUM_LC_uvchr) should have been
1645  * called instead, so that these don't get called for < 255 */
1646
1647 bool
1648 Perl_is_uni_alnum_lc(pTHX_ UV c)
1649 {
1650     return is_uni_alnum(c);     /* XXX no locale support yet */
1651 }
1652
1653 bool
1654 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1655 {
1656     return is_uni_idfirst(c);   /* XXX no locale support yet */
1657 }
1658
1659 bool
1660 Perl_is_uni_alpha_lc(pTHX_ UV c)
1661 {
1662     return is_uni_alpha(c);     /* XXX no locale support yet */
1663 }
1664
1665 bool
1666 Perl_is_uni_ascii_lc(pTHX_ UV c)
1667 {
1668     return is_uni_ascii(c);     /* XXX no locale support yet */
1669 }
1670
1671 bool
1672 Perl_is_uni_space_lc(pTHX_ UV c)
1673 {
1674     return is_uni_space(c);     /* XXX no locale support yet */
1675 }
1676
1677 bool
1678 Perl_is_uni_digit_lc(pTHX_ UV c)
1679 {
1680     return is_uni_digit(c);     /* XXX no locale support yet */
1681 }
1682
1683 bool
1684 Perl_is_uni_upper_lc(pTHX_ UV c)
1685 {
1686     return is_uni_upper(c);     /* XXX no locale support yet */
1687 }
1688
1689 bool
1690 Perl_is_uni_lower_lc(pTHX_ UV c)
1691 {
1692     return is_uni_lower(c);     /* XXX no locale support yet */
1693 }
1694
1695 bool
1696 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1697 {
1698     return is_uni_cntrl(c);     /* XXX no locale support yet */
1699 }
1700
1701 bool
1702 Perl_is_uni_graph_lc(pTHX_ UV c)
1703 {
1704     return is_uni_graph(c);     /* XXX no locale support yet */
1705 }
1706
1707 bool
1708 Perl_is_uni_print_lc(pTHX_ UV c)
1709 {
1710     return is_uni_print(c);     /* XXX no locale support yet */
1711 }
1712
1713 bool
1714 Perl_is_uni_punct_lc(pTHX_ UV c)
1715 {
1716     return is_uni_punct(c);     /* XXX no locale support yet */
1717 }
1718
1719 bool
1720 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1721 {
1722     return is_uni_xdigit(c);    /* XXX no locale support yet */
1723 }
1724
1725 U32
1726 Perl_to_uni_upper_lc(pTHX_ U32 c)
1727 {
1728     /* XXX returns only the first character -- do not use XXX */
1729     /* XXX no locale support yet */
1730     STRLEN len;
1731     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1732     return (U32)to_uni_upper(c, tmpbuf, &len);
1733 }
1734
1735 U32
1736 Perl_to_uni_title_lc(pTHX_ U32 c)
1737 {
1738     /* XXX returns only the first character XXX -- do not use XXX */
1739     /* XXX no locale support yet */
1740     STRLEN len;
1741     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1742     return (U32)to_uni_title(c, tmpbuf, &len);
1743 }
1744
1745 U32
1746 Perl_to_uni_lower_lc(pTHX_ U32 c)
1747 {
1748     /* XXX returns only the first character -- do not use XXX */
1749     /* XXX no locale support yet */
1750     STRLEN len;
1751     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1752     return (U32)to_uni_lower(c, tmpbuf, &len);
1753 }
1754
1755 static bool
1756 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
1757                  const char *const swashname)
1758 {
1759     /* returns a boolean giving whether or not the UTF8-encoded character that
1760      * starts at <p> is in the swash indicated by <swashname>.  <swash>
1761      * contains a pointer to where the swash indicated by <swashname>
1762      * is to be stored; which this routine will do, so that future calls will
1763      * look at <*swash> and only generate a swash if it is not null
1764      *
1765      * Note that it is assumed that the buffer length of <p> is enough to
1766      * contain all the bytes that comprise the character.  Thus, <*p> should
1767      * have been checked before this call for mal-formedness enough to assure
1768      * that. */
1769
1770     dVAR;
1771
1772     PERL_ARGS_ASSERT_IS_UTF8_COMMON;
1773
1774     /* The API should have included a length for the UTF-8 character in <p>,
1775      * but it doesn't.  We therefor assume that p has been validated at least
1776      * as far as there being enough bytes available in it to accommodate the
1777      * character without reading beyond the end, and pass that number on to the
1778      * validating routine */
1779     if (!is_utf8_char_buf(p, p + UTF8SKIP(p)))
1780         return FALSE;
1781     if (!*swash)
1782         *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
1783     return swash_fetch(*swash, p, TRUE) != 0;
1784 }
1785
1786 bool
1787 Perl_is_utf8_alnum(pTHX_ const U8 *p)
1788 {
1789     dVAR;
1790
1791     PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
1792
1793     /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1794      * descendant of isalnum(3), in other words, it doesn't
1795      * contain the '_'. --jhi */
1796     return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
1797 }
1798
1799 bool
1800 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1801 {
1802     dVAR;
1803
1804     PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
1805
1806     if (*p == '_')
1807         return TRUE;
1808     /* is_utf8_idstart would be more logical. */
1809     return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
1810 }
1811
1812 bool
1813 Perl_is_utf8_xidfirst(pTHX_ const U8 *p) /* The naming is historical. */
1814 {
1815     dVAR;
1816
1817     PERL_ARGS_ASSERT_IS_UTF8_XIDFIRST;
1818
1819     if (*p == '_')
1820         return TRUE;
1821     /* is_utf8_idstart would be more logical. */
1822     return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart");
1823 }
1824
1825 bool
1826 Perl__is_utf8__perl_idstart(pTHX_ const U8 *p)
1827 {
1828     dVAR;
1829
1830     PERL_ARGS_ASSERT__IS_UTF8__PERL_IDSTART;
1831
1832     return is_utf8_common(p, &PL_utf8_perl_idstart, "_Perl_IDStart");
1833 }
1834
1835 bool
1836 Perl_is_utf8_idcont(pTHX_ const U8 *p)
1837 {
1838     dVAR;
1839
1840     PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
1841
1842     return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
1843 }
1844
1845 bool
1846 Perl_is_utf8_xidcont(pTHX_ const U8 *p)
1847 {
1848     dVAR;
1849
1850     PERL_ARGS_ASSERT_IS_UTF8_XIDCONT;
1851
1852     return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue");
1853 }
1854
1855 bool
1856 Perl_is_utf8_alpha(pTHX_ const U8 *p)
1857 {
1858     dVAR;
1859
1860     PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
1861
1862     return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
1863 }
1864
1865 bool
1866 Perl_is_utf8_ascii(pTHX_ const U8 *p)
1867 {
1868     dVAR;
1869
1870     PERL_ARGS_ASSERT_IS_UTF8_ASCII;
1871
1872     /* ASCII characters are the same whether in utf8 or not.  So the macro
1873      * works on both utf8 and non-utf8 representations. */
1874     return isASCII(*p);
1875 }
1876
1877 bool
1878 Perl_is_utf8_space(pTHX_ const U8 *p)
1879 {
1880     dVAR;
1881
1882     PERL_ARGS_ASSERT_IS_UTF8_SPACE;
1883
1884     return is_utf8_common(p, &PL_utf8_space, "IsXPerlSpace");
1885 }
1886
1887 bool
1888 Perl_is_utf8_perl_space(pTHX_ const U8 *p)
1889 {
1890     dVAR;
1891
1892     PERL_ARGS_ASSERT_IS_UTF8_PERL_SPACE;
1893
1894     /* Only true if is an ASCII space-like character, and ASCII is invariant
1895      * under utf8, so can just use the macro */
1896     return isSPACE_A(*p);
1897 }
1898
1899 bool
1900 Perl_is_utf8_perl_word(pTHX_ const U8 *p)
1901 {
1902     dVAR;
1903
1904     PERL_ARGS_ASSERT_IS_UTF8_PERL_WORD;
1905
1906     /* Only true if is an ASCII word character, and ASCII is invariant
1907      * under utf8, so can just use the macro */
1908     return isWORDCHAR_A(*p);
1909 }
1910
1911 bool
1912 Perl_is_utf8_digit(pTHX_ const U8 *p)
1913 {
1914     dVAR;
1915
1916     PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
1917
1918     return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
1919 }
1920
1921 bool
1922 Perl_is_utf8_posix_digit(pTHX_ const U8 *p)
1923 {
1924     dVAR;
1925
1926     PERL_ARGS_ASSERT_IS_UTF8_POSIX_DIGIT;
1927
1928     /* Only true if is an ASCII digit character, and ASCII is invariant
1929      * under utf8, so can just use the macro */
1930     return isDIGIT_A(*p);
1931 }
1932
1933 bool
1934 Perl_is_utf8_upper(pTHX_ const U8 *p)
1935 {
1936     dVAR;
1937
1938     PERL_ARGS_ASSERT_IS_UTF8_UPPER;
1939
1940     return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
1941 }
1942
1943 bool
1944 Perl_is_utf8_lower(pTHX_ const U8 *p)
1945 {
1946     dVAR;
1947
1948     PERL_ARGS_ASSERT_IS_UTF8_LOWER;
1949
1950     return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
1951 }
1952
1953 bool
1954 Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1955 {
1956     dVAR;
1957
1958     PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
1959
1960     if (isASCII(*p)) {
1961         return isCNTRL_A(*p);
1962     }
1963
1964     /* All controls are in Latin1 */
1965     if (! UTF8_IS_DOWNGRADEABLE_START(*p)) {
1966         return 0;
1967     }
1968     return isCNTRL_L1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)));
1969 }
1970
1971 bool
1972 Perl_is_utf8_graph(pTHX_ const U8 *p)
1973 {
1974     dVAR;
1975
1976     PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
1977
1978     return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
1979 }
1980
1981 bool
1982 Perl_is_utf8_print(pTHX_ const U8 *p)
1983 {
1984     dVAR;
1985
1986     PERL_ARGS_ASSERT_IS_UTF8_PRINT;
1987
1988     return is_utf8_common(p, &PL_utf8_print, "IsPrint");
1989 }
1990
1991 bool
1992 Perl_is_utf8_punct(pTHX_ const U8 *p)
1993 {
1994     dVAR;
1995
1996     PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
1997
1998     return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
1999 }
2000
2001 bool
2002 Perl_is_utf8_xdigit(pTHX_ const U8 *p)
2003 {
2004     dVAR;
2005
2006     PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
2007
2008     return is_utf8_common(p, &PL_utf8_xdigit, "IsXDigit");
2009 }
2010
2011 bool
2012 Perl_is_utf8_mark(pTHX_ const U8 *p)
2013 {
2014     dVAR;
2015
2016     PERL_ARGS_ASSERT_IS_UTF8_MARK;
2017
2018     return is_utf8_common(p, &PL_utf8_mark, "IsM");
2019 }
2020
2021 bool
2022 Perl_is_utf8_X_begin(pTHX_ const U8 *p)
2023 {
2024     dVAR;
2025
2026     PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN;
2027
2028     return is_utf8_common(p, &PL_utf8_X_begin, "_X_Begin");
2029 }
2030
2031 bool
2032 Perl_is_utf8_X_extend(pTHX_ const U8 *p)
2033 {
2034     dVAR;
2035
2036     PERL_ARGS_ASSERT_IS_UTF8_X_EXTEND;
2037
2038     return is_utf8_common(p, &PL_utf8_X_extend, "_X_Extend");
2039 }
2040
2041 bool
2042 Perl_is_utf8_X_prepend(pTHX_ const U8 *p)
2043 {
2044     dVAR;
2045
2046     PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND;
2047
2048     return is_utf8_common(p, &PL_utf8_X_prepend, "GCB=Prepend");
2049 }
2050
2051 bool
2052 Perl_is_utf8_X_non_hangul(pTHX_ const U8 *p)
2053 {
2054     dVAR;
2055
2056     PERL_ARGS_ASSERT_IS_UTF8_X_NON_HANGUL;
2057
2058     return is_utf8_common(p, &PL_utf8_X_non_hangul, "HST=Not_Applicable");
2059 }
2060
2061 bool
2062 Perl_is_utf8_X_L(pTHX_ const U8 *p)
2063 {
2064     dVAR;
2065
2066     PERL_ARGS_ASSERT_IS_UTF8_X_L;
2067
2068     return is_utf8_common(p, &PL_utf8_X_L, "GCB=L");
2069 }
2070
2071 bool
2072 Perl_is_utf8_X_LV(pTHX_ const U8 *p)
2073 {
2074     dVAR;
2075
2076     PERL_ARGS_ASSERT_IS_UTF8_X_LV;
2077
2078     return is_utf8_common(p, &PL_utf8_X_LV, "GCB=LV");
2079 }
2080
2081 bool
2082 Perl_is_utf8_X_LVT(pTHX_ const U8 *p)
2083 {
2084     dVAR;
2085
2086     PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
2087
2088     return is_utf8_common(p, &PL_utf8_X_LVT, "GCB=LVT");
2089 }
2090
2091 bool
2092 Perl_is_utf8_X_T(pTHX_ const U8 *p)
2093 {
2094     dVAR;
2095
2096     PERL_ARGS_ASSERT_IS_UTF8_X_T;
2097
2098     return is_utf8_common(p, &PL_utf8_X_T, "GCB=T");
2099 }
2100
2101 bool
2102 Perl_is_utf8_X_V(pTHX_ const U8 *p)
2103 {
2104     dVAR;
2105
2106     PERL_ARGS_ASSERT_IS_UTF8_X_V;
2107
2108     return is_utf8_common(p, &PL_utf8_X_V, "GCB=V");
2109 }
2110
2111 bool
2112 Perl_is_utf8_X_LV_LVT_V(pTHX_ const U8 *p)
2113 {
2114     dVAR;
2115
2116     PERL_ARGS_ASSERT_IS_UTF8_X_LV_LVT_V;
2117
2118     return is_utf8_common(p, &PL_utf8_X_LV_LVT_V, "_X_LV_LVT_V");
2119 }
2120
2121 bool
2122 Perl__is_utf8_quotemeta(pTHX_ const U8 *p)
2123 {
2124     /* For exclusive use of pp_quotemeta() */
2125
2126     dVAR;
2127
2128     PERL_ARGS_ASSERT__IS_UTF8_QUOTEMETA;
2129
2130     return is_utf8_common(p, &PL_utf8_quotemeta, "_Perl_Quotemeta");
2131 }
2132
2133 /*
2134 =for apidoc to_utf8_case
2135
2136 The C<p> contains the pointer to the UTF-8 string encoding
2137 the character that is being converted.  This routine assumes that the character
2138 at C<p> is well-formed.
2139
2140 The C<ustrp> is a pointer to the character buffer to put the
2141 conversion result to.  The C<lenp> is a pointer to the length
2142 of the result.
2143
2144 The C<swashp> is a pointer to the swash to use.
2145
2146 Both the special and normal mappings are stored in F<lib/unicore/To/Foo.pl>,
2147 and loaded by SWASHNEW, using F<lib/utf8_heavy.pl>.  The C<special> (usually,
2148 but not always, a multicharacter mapping), is tried first.
2149
2150 The C<special> is a string like "utf8::ToSpecLower", which means the
2151 hash %utf8::ToSpecLower.  The access to the hash is through
2152 Perl_to_utf8_case().
2153
2154 The C<normal> is a string like "ToLower" which means the swash
2155 %utf8::ToLower.
2156
2157 =cut */
2158
2159 UV
2160 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
2161                         SV **swashp, const char *normal, const char *special)
2162 {
2163     dVAR;
2164     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
2165     STRLEN len = 0;
2166     const UV uv0 = valid_utf8_to_uvchr(p, NULL);
2167     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
2168      * are necessary in EBCDIC, they are redundant no-ops
2169      * in ASCII-ish platforms, and hopefully optimized away. */
2170     const UV uv1 = NATIVE_TO_UNI(uv0);
2171
2172     PERL_ARGS_ASSERT_TO_UTF8_CASE;
2173
2174     /* Note that swash_fetch() doesn't output warnings for these because it
2175      * assumes we will */
2176     if (uv1 >= UNICODE_SURROGATE_FIRST) {
2177         if (uv1 <= UNICODE_SURROGATE_LAST) {
2178             if (ckWARN_d(WARN_SURROGATE)) {
2179                 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
2180                 Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
2181                     "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1);
2182             }
2183         }
2184         else if (UNICODE_IS_SUPER(uv1)) {
2185             if (ckWARN_d(WARN_NON_UNICODE)) {
2186                 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
2187                 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
2188                     "Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1);
2189             }
2190         }
2191
2192         /* Note that non-characters are perfectly legal, so no warning should
2193          * be given */
2194     }
2195
2196     uvuni_to_utf8(tmpbuf, uv1);
2197
2198     if (!*swashp) /* load on-demand */
2199          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
2200
2201     if (special) {
2202          /* It might be "special" (sometimes, but not always,
2203           * a multicharacter mapping) */
2204          HV * const hv = get_hv(special, 0);
2205          SV **svp;
2206
2207          if (hv &&
2208              (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
2209              (*svp)) {
2210              const char *s;
2211
2212               s = SvPV_const(*svp, len);
2213               if (len == 1)
2214                    len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2215               else {
2216 #ifdef EBCDIC
2217                    /* If we have EBCDIC we need to remap the characters
2218                     * since any characters in the low 256 are Unicode
2219                     * code points, not EBCDIC. */
2220                    U8 *t = (U8*)s, *tend = t + len, *d;
2221
2222                    d = tmpbuf;
2223                    if (SvUTF8(*svp)) {
2224                         STRLEN tlen = 0;
2225
2226                         while (t < tend) {
2227                              const UV c = utf8_to_uvchr_buf(t, tend, &tlen);
2228                              if (tlen > 0) {
2229                                   d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
2230                                   t += tlen;
2231                              }
2232                              else
2233                                   break;
2234                         }
2235                    }
2236                    else {
2237                         while (t < tend) {
2238                              d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
2239                              t++;
2240                         }
2241                    }
2242                    len = d - tmpbuf;
2243                    Copy(tmpbuf, ustrp, len, U8);
2244 #else
2245                    Copy(s, ustrp, len, U8);
2246 #endif
2247               }
2248          }
2249     }
2250
2251     if (!len && *swashp) {
2252         const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
2253
2254          if (uv2) {
2255               /* It was "normal" (a single character mapping). */
2256               const UV uv3 = UNI_TO_NATIVE(uv2);
2257               len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2258          }
2259     }
2260
2261     if (!len) /* Neither: just copy.  In other words, there was no mapping
2262                  defined, which means that the code point maps to itself */
2263          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
2264
2265     if (lenp)
2266          *lenp = len;
2267
2268     return len ? valid_utf8_to_uvchr(ustrp, 0) : 0;
2269 }
2270
2271 STATIC UV
2272 S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
2273 {
2274     /* This is called when changing the case of a utf8-encoded character above
2275      * the Latin1 range, and the operation is in locale.  If the result
2276      * contains a character that crosses the 255/256 boundary, disallow the
2277      * change, and return the original code point.  See L<perlfunc/lc> for why;
2278      *
2279      * p        points to the original string whose case was changed; assumed
2280      *          by this routine to be well-formed
2281      * result   the code point of the first character in the changed-case string
2282      * ustrp    points to the changed-case string (<result> represents its first char)
2283      * lenp     points to the length of <ustrp> */
2284
2285     UV original;    /* To store the first code point of <p> */
2286
2287     PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
2288
2289     assert(! UTF8_IS_INVARIANT(*p) && ! UTF8_IS_DOWNGRADEABLE_START(*p));
2290
2291     /* We know immediately if the first character in the string crosses the
2292      * boundary, so can skip */
2293     if (result > 255) {
2294
2295         /* Look at every character in the result; if any cross the
2296         * boundary, the whole thing is disallowed */
2297         U8* s = ustrp + UTF8SKIP(ustrp);
2298         U8* e = ustrp + *lenp;
2299         while (s < e) {
2300             if (UTF8_IS_INVARIANT(*s) || UTF8_IS_DOWNGRADEABLE_START(*s))
2301             {
2302                 goto bad_crossing;
2303             }
2304             s += UTF8SKIP(s);
2305         }
2306
2307         /* Here, no characters crossed, result is ok as-is */
2308         return result;
2309     }
2310
2311 bad_crossing:
2312
2313     /* Failed, have to return the original */
2314     original = valid_utf8_to_uvchr(p, lenp);
2315     Copy(p, ustrp, *lenp, char);
2316     return original;
2317 }
2318
2319 /*
2320 =for apidoc to_utf8_upper
2321
2322 Convert the UTF-8 encoded character at C<p> to its uppercase version and
2323 store that in UTF-8 in C<ustrp> and its length in bytes in C<lenp>.  Note
2324 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
2325 the uppercase version may be longer than the original character.
2326
2327 The first character of the uppercased version is returned
2328 (but note, as explained above, that there may be more.)
2329
2330 The character at C<p> is assumed by this routine to be well-formed.
2331
2332 =cut */
2333
2334 /* Not currently externally documented, and subject to change:
2335  * <flags> is set iff locale semantics are to be used for code points < 256
2336  * <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
2337  *               were used in the calculation; otherwise unchanged. */
2338
2339 UV
2340 Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
2341 {
2342     dVAR;
2343
2344     UV result;
2345
2346     PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
2347
2348     if (UTF8_IS_INVARIANT(*p)) {
2349         if (flags) {
2350             result = toUPPER_LC(*p);
2351         }
2352         else {
2353             return _to_upper_title_latin1(*p, ustrp, lenp, 'S');
2354         }
2355     }
2356     else if UTF8_IS_DOWNGRADEABLE_START(*p) {
2357         if (flags) {
2358             result = toUPPER_LC(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)));
2359         }
2360         else {
2361             return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)),
2362                                           ustrp, lenp, 'S');
2363         }
2364     }
2365     else {  /* utf8, ord above 255 */
2366         result = CALL_UPPER_CASE(p, ustrp, lenp);
2367
2368         if (flags) {
2369             result = check_locale_boundary_crossing(p, result, ustrp, lenp);
2370         }
2371         return result;
2372     }
2373
2374     /* Here, used locale rules.  Convert back to utf8 */
2375     if (UTF8_IS_INVARIANT(result)) {
2376         *ustrp = (U8) result;
2377         *lenp = 1;
2378     }
2379     else {
2380         *ustrp = UTF8_EIGHT_BIT_HI(result);
2381         *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
2382         *lenp = 2;
2383     }
2384
2385     if (tainted_ptr) {
2386         *tainted_ptr = TRUE;
2387     }
2388     return result;
2389 }
2390
2391 /*
2392 =for apidoc to_utf8_title
2393
2394 Convert the UTF-8 encoded character at C<p> to its titlecase version and
2395 store that in UTF-8 in C<ustrp> and its length in bytes in C<lenp>.  Note
2396 that the C<ustrp> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
2397 titlecase version may be longer than the original character.
2398
2399 The first character of the titlecased version is returned
2400 (but note, as explained above, that there may be more.)
2401
2402 The character at C<p> is assumed by this routine to be well-formed.
2403
2404 =cut */
2405
2406 /* Not currently externally documented, and subject to change:
2407  * <flags> is set iff locale semantics are to be used for code points < 256
2408  *         Since titlecase is not defined in POSIX, uppercase is used instead
2409  *         for these/
2410  * <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
2411  *               were used in the calculation; otherwise unchanged. */
2412
2413 UV
2414 Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
2415 {
2416     dVAR;
2417
2418     UV result;
2419
2420     PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
2421
2422     if (UTF8_IS_INVARIANT(*p)) {
2423         if (flags) {
2424             result = toUPPER_LC(*p);
2425         }
2426         else {
2427             return _to_upper_title_latin1(*p, ustrp, lenp, 's');
2428         }
2429     }
2430     else if UTF8_IS_DOWNGRADEABLE_START(*p) {
2431         if (flags) {
2432             result = toUPPER_LC(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)));
2433         }
2434         else {
2435             return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)),
2436                                           ustrp, lenp, 's');
2437         }
2438     }
2439     else {  /* utf8, ord above 255 */
2440         result = CALL_TITLE_CASE(p, ustrp, lenp);
2441
2442         if (flags) {
2443             result = check_locale_boundary_crossing(p, result, ustrp, lenp);
2444         }
2445         return result;
2446     }
2447
2448     /* Here, used locale rules.  Convert back to utf8 */
2449     if (UTF8_IS_INVARIANT(result)) {
2450         *ustrp = (U8) result;
2451         *lenp = 1;
2452     }
2453     else {
2454         *ustrp = UTF8_EIGHT_BIT_HI(result);
2455         *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
2456         *lenp = 2;
2457     }
2458
2459     if (tainted_ptr) {
2460         *tainted_ptr = TRUE;
2461     }
2462     return result;
2463 }
2464
2465 /*
2466 =for apidoc to_utf8_lower
2467
2468 Convert the UTF-8 encoded character at C<p> to its lowercase version and
2469 store that in UTF-8 in ustrp and its length in bytes in C<lenp>.  Note
2470 that the C<ustrp> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
2471 lowercase version may be longer than the original character.
2472
2473 The first character of the lowercased version is returned
2474 (but note, as explained above, that there may be more.)
2475
2476 The character at C<p> is assumed by this routine to be well-formed.
2477
2478 =cut */
2479
2480 /* Not currently externally documented, and subject to change:
2481  * <flags> is set iff locale semantics are to be used for code points < 256
2482  * <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
2483  *               were used in the calculation; otherwise unchanged. */
2484
2485 UV
2486 Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
2487 {
2488     UV result;
2489
2490     dVAR;
2491
2492     PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
2493
2494     if (UTF8_IS_INVARIANT(*p)) {
2495         if (flags) {
2496             result = toLOWER_LC(*p);
2497         }
2498         else {
2499             return to_lower_latin1(*p, ustrp, lenp);
2500         }
2501     }
2502     else if UTF8_IS_DOWNGRADEABLE_START(*p) {
2503         if (flags) {
2504             result = toLOWER_LC(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)));
2505         }
2506         else {
2507             return to_lower_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)),
2508                                    ustrp, lenp);
2509         }
2510     }
2511     else {  /* utf8, ord above 255 */
2512         result = CALL_LOWER_CASE(p, ustrp, lenp);
2513
2514         if (flags) {
2515             result = check_locale_boundary_crossing(p, result, ustrp, lenp);
2516         }
2517
2518         return result;
2519     }
2520
2521     /* Here, used locale rules.  Convert back to utf8 */
2522     if (UTF8_IS_INVARIANT(result)) {
2523         *ustrp = (U8) result;
2524         *lenp = 1;
2525     }
2526     else {
2527         *ustrp = UTF8_EIGHT_BIT_HI(result);
2528         *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
2529         *lenp = 2;
2530     }
2531
2532     if (tainted_ptr) {
2533         *tainted_ptr = TRUE;
2534     }
2535     return result;
2536 }
2537
2538 /*
2539 =for apidoc to_utf8_fold
2540
2541 Convert the UTF-8 encoded character at C<p> to its foldcase version and
2542 store that in UTF-8 in C<ustrp> and its length in bytes in C<lenp>.  Note
2543 that the C<ustrp> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
2544 foldcase version may be longer than the original character (up to
2545 three characters).
2546
2547 The first character of the foldcased version is returned
2548 (but note, as explained above, that there may be more.)
2549
2550 The character at C<p> is assumed by this routine to be well-formed.
2551
2552 =cut */
2553
2554 /* Not currently externally documented, and subject to change,
2555  * in <flags>
2556  *      bit FOLD_FLAGS_LOCALE is set iff locale semantics are to be used for code
2557  *                            points < 256.  Since foldcase is not defined in
2558  *                            POSIX, lowercase is used instead
2559  *      bit FOLD_FLAGS_FULL   is set iff full case folds are to be used;
2560  *                            otherwise simple folds
2561  * <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
2562  *               were used in the calculation; otherwise unchanged. */
2563
2564 UV
2565 Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, bool* tainted_ptr)
2566 {
2567     dVAR;
2568
2569     UV result;
2570
2571     PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
2572
2573     if (UTF8_IS_INVARIANT(*p)) {
2574         if (flags & FOLD_FLAGS_LOCALE) {
2575             result = toLOWER_LC(*p);
2576         }
2577         else {
2578             return _to_fold_latin1(*p, ustrp, lenp,
2579                                    cBOOL(flags & FOLD_FLAGS_FULL));
2580         }
2581     }
2582     else if UTF8_IS_DOWNGRADEABLE_START(*p) {
2583         if (flags & FOLD_FLAGS_LOCALE) {
2584             result = toLOWER_LC(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)));
2585         }
2586         else {
2587             return _to_fold_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)),
2588                                    ustrp, lenp, cBOOL(flags & FOLD_FLAGS_FULL));
2589         }
2590     }
2591     else {  /* utf8, ord above 255 */
2592         result = CALL_FOLD_CASE(p, ustrp, lenp, flags);
2593
2594         if ((flags & FOLD_FLAGS_LOCALE)) {
2595             result = check_locale_boundary_crossing(p, result, ustrp, lenp);
2596         }
2597
2598         return result;
2599     }
2600
2601     /* Here, used locale rules.  Convert back to utf8 */
2602     if (UTF8_IS_INVARIANT(result)) {
2603         *ustrp = (U8) result;
2604         *lenp = 1;
2605     }
2606     else {
2607         *ustrp = UTF8_EIGHT_BIT_HI(result);
2608         *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
2609         *lenp = 2;
2610     }
2611
2612     if (tainted_ptr) {
2613         *tainted_ptr = TRUE;
2614     }
2615     return result;
2616 }
2617
2618 /* Note:
2619  * Returns a "swash" which is a hash described in utf8.c:Perl_swash_fetch().
2620  * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
2621  * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
2622  */
2623
2624 SV*
2625 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
2626 {
2627     PERL_ARGS_ASSERT_SWASH_INIT;
2628
2629     /* Returns a copy of a swash initiated by the called function.  This is the
2630      * public interface, and returning a copy prevents others from doing
2631      * mischief on the original */
2632
2633     return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none, FALSE, NULL, FALSE));
2634 }
2635
2636 SV*
2637 Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none, bool return_if_undef, SV* invlist, bool passed_in_invlist_has_user_defined_property)
2638 {
2639     /* Initialize and return a swash, creating it if necessary.  It does this
2640      * by calling utf8_heavy.pl in the general case.
2641      *
2642      * This interface should only be used by functions that won't destroy or
2643      * adversely change the swash, as doing so affects all other uses of the
2644      * swash in the program; the general public should use 'Perl_swash_init'
2645      * instead.
2646      *
2647      * pkg  is the name of the package that <name> should be in.
2648      * name is the name of the swash to find.  Typically it is a Unicode
2649      *      property name, including user-defined ones
2650      * listsv is a string to initialize the swash with.  It must be of the form
2651      *      documented as the subroutine return value in
2652      *      L<perlunicode/User-Defined Character Properties>
2653      * minbits is the number of bits required to represent each data element.
2654      *      It is '1' for binary properties.
2655      * none I (khw) do not understand this one, but it is used only in tr///.
2656      * return_if_undef is TRUE if the routine shouldn't croak if it can't find
2657      *      the requested property
2658      * invlist is an inversion list to initialize the swash with (or NULL)
2659      * has_user_defined_property is TRUE if <invlist> has some component that
2660      *      came from a user-defined property
2661      *
2662      * Thus there are three possible inputs to find the swash: <name>,
2663      * <listsv>, and <invlist>.  At least one must be specified.  The result
2664      * will be the union of the specified ones, although <listsv>'s various
2665      * actions can intersect, etc. what <name> gives.
2666      *
2667      * <invlist> is only valid for binary properties */
2668
2669     dVAR;
2670     SV* retval = &PL_sv_undef;
2671
2672     assert(listsv != &PL_sv_undef || strNE(name, "") || invlist);
2673     assert(! invlist || minbits == 1);
2674
2675     /* If data was passed in to go out to utf8_heavy to find the swash of, do
2676      * so */
2677     if (listsv != &PL_sv_undef || strNE(name, "")) {
2678         dSP;
2679         const size_t pkg_len = strlen(pkg);
2680         const size_t name_len = strlen(name);
2681         HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
2682         SV* errsv_save;
2683         GV *method;
2684
2685         PERL_ARGS_ASSERT__CORE_SWASH_INIT;
2686
2687         PUSHSTACKi(PERLSI_MAGIC);
2688         ENTER;
2689         SAVEHINTS();
2690         save_re_context();
2691         if (PL_parser && PL_parser->error_count)
2692             SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
2693         method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
2694         if (!method) {  /* demand load utf8 */
2695             ENTER;
2696             errsv_save = newSVsv(ERRSV);
2697             /* It is assumed that callers of this routine are not passing in
2698              * any user derived data.  */
2699             /* Need to do this after save_re_context() as it will set
2700              * PL_tainted to 1 while saving $1 etc (see the code after getrx:
2701              * in Perl_magic_get).  Even line to create errsv_save can turn on
2702              * PL_tainted.  */
2703             SAVEBOOL(PL_tainted);
2704             PL_tainted = 0;
2705             Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
2706                              NULL);
2707             if (!SvTRUE(ERRSV))
2708                 sv_setsv(ERRSV, errsv_save);
2709             SvREFCNT_dec(errsv_save);
2710             LEAVE;
2711         }
2712         SPAGAIN;
2713         PUSHMARK(SP);
2714         EXTEND(SP,5);
2715         mPUSHp(pkg, pkg_len);
2716         mPUSHp(name, name_len);
2717         PUSHs(listsv);
2718         mPUSHi(minbits);
2719         mPUSHi(none);
2720         PUTBACK;
2721         errsv_save = newSVsv(ERRSV);
2722         /* If we already have a pointer to the method, no need to use
2723          * call_method() to repeat the lookup.  */
2724         if (method ? call_sv(MUTABLE_SV(method), G_SCALAR)
2725             : call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR | G_METHOD))
2726         {
2727             retval = *PL_stack_sp--;
2728             SvREFCNT_inc(retval);
2729         }
2730         if (!SvTRUE(ERRSV))
2731             sv_setsv(ERRSV, errsv_save);
2732         SvREFCNT_dec(errsv_save);
2733         LEAVE;
2734         POPSTACK;
2735         if (IN_PERL_COMPILETIME) {
2736             CopHINTS_set(PL_curcop, PL_hints);
2737         }
2738         if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
2739             if (SvPOK(retval))
2740
2741                 /* If caller wants to handle missing properties, let them */
2742                 if (return_if_undef) {
2743                     return NULL;
2744                 }
2745                 Perl_croak(aTHX_
2746                            "Can't find Unicode property definition \"%"SVf"\"",
2747                            SVfARG(retval));
2748             Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
2749         }
2750     } /* End of calling the module to find the swash */
2751
2752     /* Make sure there is an inversion list for binary properties */
2753     if (minbits == 1) {
2754         SV** swash_invlistsvp = NULL;
2755         SV* swash_invlist = NULL;
2756         bool invlist_in_swash_is_valid = FALSE;
2757         HV* swash_hv = NULL;
2758
2759         /* If this operation fetched a swash, get its already existing
2760          * inversion list or create one for it */
2761         if (retval != &PL_sv_undef) {
2762             swash_hv = MUTABLE_HV(SvRV(retval));
2763
2764             swash_invlistsvp = hv_fetchs(swash_hv, "INVLIST", FALSE);
2765             if (swash_invlistsvp) {
2766                 swash_invlist = *swash_invlistsvp;
2767                 invlist_in_swash_is_valid = TRUE;
2768             }
2769             else {
2770                 swash_invlist = _swash_to_invlist(retval);
2771             }
2772         }
2773
2774         /* If an inversion list was passed in, have to include it */
2775         if (invlist) {
2776
2777             /* Any fetched swash will by now have an inversion list in it;
2778              * otherwise <swash_invlist>  will be NULL, indicating that we
2779              * didn't fetch a swash */
2780             if (swash_invlist) {
2781
2782                 /* Add the passed-in inversion list, which invalidates the one
2783                  * already stored in the swash */
2784                 invlist_in_swash_is_valid = FALSE;
2785                 _invlist_union(invlist, swash_invlist, &swash_invlist);
2786             }
2787             else {
2788
2789                 /* Here, there is no swash already.  Set up a minimal one */
2790                 swash_hv = newHV();
2791                 retval = newRV_inc(MUTABLE_SV(swash_hv));
2792                 swash_invlist = invlist;
2793             }
2794
2795             if (passed_in_invlist_has_user_defined_property) {
2796                 if (! hv_stores(swash_hv, "USER_DEFINED", newSVuv(1))) {
2797                     Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
2798                 }
2799             }
2800         }
2801
2802         /* Here, we have computed the union of all the passed-in data.  It may
2803          * be that there was an inversion list in the swash which didn't get
2804          * touched; otherwise save the one computed one */
2805         if (! invlist_in_swash_is_valid) {
2806             if (! hv_stores(MUTABLE_HV(SvRV(retval)), "INVLIST", swash_invlist))
2807             {
2808                 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
2809             }
2810         }
2811     }
2812
2813     return retval;
2814 }
2815
2816
2817 /* This API is wrong for special case conversions since we may need to
2818  * return several Unicode characters for a single Unicode character
2819  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
2820  * the lower-level routine, and it is similarly broken for returning
2821  * multiple values.  --jhi
2822  * For those, you should use to_utf8_case() instead */
2823 /* Now SWASHGET is recasted into S_swatch_get in this file. */
2824
2825 /* Note:
2826  * Returns the value of property/mapping C<swash> for the first character
2827  * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
2828  * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
2829  * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
2830  *
2831  * A "swash" is a hash which contains initially the keys/values set up by
2832  * SWASHNEW.  The purpose is to be able to completely represent a Unicode
2833  * property for all possible code points.  Things are stored in a compact form
2834  * (see utf8_heavy.pl) so that calculation is required to find the actual
2835  * property value for a given code point.  As code points are looked up, new
2836  * key/value pairs are added to the hash, so that the calculation doesn't have
2837  * to ever be re-done.  Further, each calculation is done, not just for the
2838  * desired one, but for a whole block of code points adjacent to that one.
2839  * For binary properties on ASCII machines, the block is usually for 64 code
2840  * points, starting with a code point evenly divisible by 64.  Thus if the
2841  * property value for code point 257 is requested, the code goes out and
2842  * calculates the property values for all 64 code points between 256 and 319,
2843  * and stores these as a single 64-bit long bit vector, called a "swatch",
2844  * under the key for code point 256.  The key is the UTF-8 encoding for code
2845  * point 256, minus the final byte.  Thus, if the length of the UTF-8 encoding
2846  * for a code point is 13 bytes, the key will be 12 bytes long.  If the value
2847  * for code point 258 is then requested, this code realizes that it would be
2848  * stored under the key for 256, and would find that value and extract the
2849  * relevant bit, offset from 256.
2850  *
2851  * Non-binary properties are stored in as many bits as necessary to represent
2852  * their values (32 currently, though the code is more general than that), not
2853  * as single bits, but the principal is the same: the value for each key is a
2854  * vector that encompasses the property values for all code points whose UTF-8
2855  * representations are represented by the key.  That is, for all code points
2856  * whose UTF-8 representations are length N bytes, and the key is the first N-1
2857  * bytes of that.
2858  */
2859 UV
2860 Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
2861 {
2862     dVAR;
2863     HV *const hv = MUTABLE_HV(SvRV(swash));
2864     U32 klen;
2865     U32 off;
2866     STRLEN slen;
2867     STRLEN needents;
2868     const U8 *tmps = NULL;
2869     U32 bit;
2870     SV *swatch;
2871     U8 tmputf8[2];
2872     const UV c = NATIVE_TO_ASCII(*ptr);
2873
2874     PERL_ARGS_ASSERT_SWASH_FETCH;
2875
2876     /* Convert to utf8 if not already */
2877     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
2878         tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
2879         tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
2880         ptr = tmputf8;
2881     }
2882     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
2883      * then the "swatch" is a vec() for all the chars which start
2884      * with 0xAA..0xYY
2885      * So the key in the hash (klen) is length of encoded char -1
2886      */
2887     klen = UTF8SKIP(ptr) - 1;
2888     off  = ptr[klen];
2889
2890     if (klen == 0) {
2891       /* If char is invariant then swatch is for all the invariant chars
2892        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
2893        */
2894         needents = UTF_CONTINUATION_MARK;
2895         off      = NATIVE_TO_UTF(ptr[klen]);
2896     }
2897     else {
2898       /* If char is encoded then swatch is for the prefix */
2899         needents = (1 << UTF_ACCUMULATION_SHIFT);
2900         off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
2901     }
2902
2903     /*
2904      * This single-entry cache saves about 1/3 of the utf8 overhead in test
2905      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
2906      * it's nothing to sniff at.)  Pity we usually come through at least
2907      * two function calls to get here...
2908      *
2909      * NB: this code assumes that swatches are never modified, once generated!
2910      */
2911
2912     if (hv   == PL_last_swash_hv &&
2913         klen == PL_last_swash_klen &&
2914         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
2915     {
2916         tmps = PL_last_swash_tmps;
2917         slen = PL_last_swash_slen;
2918     }
2919     else {
2920         /* Try our second-level swatch cache, kept in a hash. */
2921         SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
2922
2923         /* If not cached, generate it via swatch_get */
2924         if (!svp || !SvPOK(*svp)
2925                  || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
2926             /* We use utf8n_to_uvuni() as we want an index into
2927                Unicode tables, not a native character number.
2928              */
2929             const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
2930                                            ckWARN(WARN_UTF8) ?
2931                                            0 : UTF8_ALLOW_ANY);
2932             swatch = swatch_get(swash,
2933                     /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
2934                                 (klen) ? (code_point & ~((UV)needents - 1)) : 0,
2935                                 needents);
2936
2937             if (IN_PERL_COMPILETIME)
2938                 CopHINTS_set(PL_curcop, PL_hints);
2939
2940             svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
2941
2942             if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
2943                      || (slen << 3) < needents)
2944                 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch, "
2945                            "svp=%p, tmps=%p, slen=%"UVuf", needents=%"UVuf,
2946                            svp, tmps, (UV)slen, (UV)needents);
2947         }
2948
2949         PL_last_swash_hv = hv;
2950         assert(klen <= sizeof(PL_last_swash_key));
2951         PL_last_swash_klen = (U8)klen;
2952         /* FIXME change interpvar.h?  */
2953         PL_last_swash_tmps = (U8 *) tmps;
2954         PL_last_swash_slen = slen;
2955         if (klen)
2956             Copy(ptr, PL_last_swash_key, klen, U8);
2957     }
2958
2959     if (UTF8_IS_SUPER(ptr) && ckWARN_d(WARN_NON_UNICODE)) {
2960         SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
2961
2962         /* This outputs warnings for binary properties only, assuming that
2963          * to_utf8_case() will output any for non-binary.  Also, surrogates
2964          * aren't checked for, as that would warn on things like /\p{Gc=Cs}/ */
2965
2966         if (! bitssvp || SvUV(*bitssvp) == 1) {
2967             /* User-defined properties can silently match above-Unicode */
2968             SV** const user_defined_svp = hv_fetchs(hv, "USER_DEFINED", FALSE);
2969             if (! user_defined_svp || ! SvUV(*user_defined_svp)) {
2970                 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0, 0);
2971                 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
2972                     "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", code_point);
2973             }
2974         }
2975     }
2976
2977     switch ((int)((slen << 3) / needents)) {
2978     case 1:
2979         bit = 1 << (off & 7);
2980         off >>= 3;
2981         return (tmps[off] & bit) != 0;
2982     case 8:
2983         return tmps[off];
2984     case 16:
2985         off <<= 1;
2986         return (tmps[off] << 8) + tmps[off + 1] ;
2987     case 32:
2988         off <<= 2;
2989         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
2990     }
2991     Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, "
2992                "slen=%"UVuf", needents=%"UVuf, (UV)slen, (UV)needents);
2993     NORETURN_FUNCTION_END;
2994 }
2995
2996 /* Read a single line of the main body of the swash input text.  These are of
2997  * the form:
2998  * 0053 0056    0073
2999  * where each number is hex.  The first two numbers form the minimum and
3000  * maximum of a range, and the third is the value associated with the range.
3001  * Not all swashes should have a third number
3002  *
3003  * On input: l    points to the beginning of the line to be examined; it points
3004  *                to somewhere in the string of the whole input text, and is
3005  *                terminated by a \n or the null string terminator.
3006  *           lend   points to the null terminator of that string
3007  *           wants_value    is non-zero if the swash expects a third number
3008  *           typestr is the name of the swash's mapping, like 'ToLower'
3009  * On output: *min, *max, and *val are set to the values read from the line.
3010  *            returns a pointer just beyond the line examined.  If there was no
3011  *            valid min number on the line, returns lend+1
3012  */
3013
3014 STATIC U8*
3015 S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
3016                              const bool wants_value, const U8* const typestr)
3017 {
3018     const int  typeto  = typestr[0] == 'T' && typestr[1] == 'o';
3019     STRLEN numlen;          /* Length of the number */
3020     I32 flags = PERL_SCAN_SILENT_ILLDIGIT
3021                 | PERL_SCAN_DISALLOW_PREFIX
3022                 | PERL_SCAN_SILENT_NON_PORTABLE;
3023
3024     /* nl points to the next \n in the scan */
3025     U8* const nl = (U8*)memchr(l, '\n', lend - l);
3026
3027     /* Get the first number on the line: the range minimum */
3028     numlen = lend - l;
3029     *min = grok_hex((char *)l, &numlen, &flags, NULL);
3030     if (numlen)     /* If found a hex number, position past it */
3031         l += numlen;
3032     else if (nl) {          /* Else, go handle next line, if any */
3033         return nl + 1;  /* 1 is length of "\n" */
3034     }
3035     else {              /* Else, no next line */
3036         return lend + 1;        /* to LIST's end at which \n is not found */
3037     }
3038
3039     /* The max range value follows, separated by a BLANK */
3040     if (isBLANK(*l)) {
3041         ++l;
3042         flags = PERL_SCAN_SILENT_ILLDIGIT
3043                 | PERL_SCAN_DISALLOW_PREFIX
3044                 | PERL_SCAN_SILENT_NON_PORTABLE;
3045         numlen = lend - l;
3046         *max = grok_hex((char *)l, &numlen, &flags, NULL);
3047         if (numlen)
3048             l += numlen;
3049         else    /* If no value here, it is a single element range */
3050             *max = *min;
3051
3052         /* Non-binary tables have a third entry: what the first element of the
3053          * range maps to */
3054         if (wants_value) {
3055             if (isBLANK(*l)) {
3056                 ++l;
3057
3058                 /* The ToLc, etc table mappings are not in hex, and must be
3059                  * corrected by adding the code point to them */
3060                 if (typeto) {
3061                     char *after_strtol = (char *) lend;
3062                     *val = Strtol((char *)l, &after_strtol, 10);
3063                     l = (U8 *) after_strtol;
3064                 }
3065                 else { /* Other tables are in hex, and are the correct result
3066                           without tweaking */
3067                     flags = PERL_SCAN_SILENT_ILLDIGIT
3068                         | PERL_SCAN_DISALLOW_PREFIX
3069                         | PERL_SCAN_SILENT_NON_PORTABLE;
3070                     numlen = lend - l;
3071                     *val = grok_hex((char *)l, &numlen, &flags, NULL);
3072                     if (numlen)
3073                         l += numlen;
3074                     else
3075                         *val = 0;
3076                 }
3077             }
3078             else {
3079                 *val = 0;
3080                 if (typeto) {
3081                     /* diag_listed_as: To%s: illegal mapping '%s' */
3082                     Perl_croak(aTHX_ "%s: illegal mapping '%s'",
3083                                      typestr, l);
3084                 }
3085             }
3086         }
3087         else
3088             *val = 0; /* bits == 1, then any val should be ignored */
3089     }
3090     else { /* Nothing following range min, should be single element with no
3091               mapping expected */
3092         *max = *min;
3093         if (wants_value) {
3094             *val = 0;
3095             if (typeto) {
3096                 /* diag_listed_as: To%s: illegal mapping '%s' */
3097                 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
3098             }
3099         }
3100         else
3101             *val = 0; /* bits == 1, then val should be ignored */
3102     }
3103
3104     /* Position to next line if any, or EOF */
3105     if (nl)
3106         l = nl + 1;
3107     else
3108         l = lend;
3109
3110     return l;
3111 }
3112
3113 /* Note:
3114  * Returns a swatch (a bit vector string) for a code point sequence
3115  * that starts from the value C<start> and comprises the number C<span>.
3116  * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
3117  * Should be used via swash_fetch, which will cache the swatch in C<swash>.
3118  */
3119 STATIC SV*
3120 S_swatch_get(pTHX_ SV* swash, UV start, UV span)
3121 {
3122     SV *swatch;
3123     U8 *l, *lend, *x, *xend, *s, *send;
3124     STRLEN lcur, xcur, scur;
3125     HV *const hv = MUTABLE_HV(SvRV(swash));
3126     SV** const invlistsvp = hv_fetchs(hv, "INVLIST", FALSE);
3127
3128     SV** listsvp = NULL; /* The string containing the main body of the table */
3129     SV** extssvp = NULL;
3130     SV** invert_it_svp = NULL;
3131     U8* typestr = NULL;
3132     STRLEN bits;
3133     STRLEN octets; /* if bits == 1, then octets == 0 */
3134     UV  none;
3135     UV  end = start + span;
3136
3137     if (invlistsvp == NULL) {
3138         SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
3139         SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
3140         SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
3141         extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
3142         listsvp = hv_fetchs(hv, "LIST", FALSE);
3143         invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
3144
3145         bits  = SvUV(*bitssvp);
3146         none  = SvUV(*nonesvp);
3147         typestr = (U8*)SvPV_nolen(*typesvp);
3148     }
3149     else {
3150         bits = 1;
3151         none = 0;
3152     }
3153     octets = bits >> 3; /* if bits == 1, then octets == 0 */
3154
3155     PERL_ARGS_ASSERT_SWATCH_GET;
3156
3157     if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
3158         Perl_croak(aTHX_ "panic: swatch_get doesn't expect bits %"UVuf,
3159                                                  (UV)bits);
3160     }
3161
3162     /* If overflowed, use the max possible */
3163     if (end < start) {
3164         end = UV_MAX;
3165         span = end - start;
3166     }
3167
3168     /* create and initialize $swatch */
3169     scur   = octets ? (span * octets) : (span + 7) / 8;
3170     swatch = newSV(scur);
3171     SvPOK_on(swatch);
3172     s = (U8*)SvPVX(swatch);
3173     if (octets && none) {
3174         const U8* const e = s + scur;
3175         while (s < e) {
3176             if (bits == 8)
3177                 *s++ = (U8)(none & 0xff);
3178             else if (bits == 16) {
3179                 *s++ = (U8)((none >>  8) & 0xff);
3180                 *s++ = (U8)( none        & 0xff);
3181             }
3182             else if (bits == 32) {
3183                 *s++ = (U8)((none >> 24) & 0xff);
3184                 *s++ = (U8)((none >> 16) & 0xff);
3185                 *s++ = (U8)((none >>  8) & 0xff);
3186                 *s++ = (U8)( none        & 0xff);
3187             }
3188         }
3189         *s = '\0';
3190     }
3191     else {
3192         (void)memzero((U8*)s, scur + 1);
3193     }
3194     SvCUR_set(swatch, scur);
3195     s = (U8*)SvPVX(swatch);
3196
3197     if (invlistsvp) {   /* If has an inversion list set up use that */
3198         _invlist_populate_swatch(*invlistsvp, start, end, s);
3199         return swatch;
3200     }
3201
3202     /* read $swash->{LIST} */
3203     l = (U8*)SvPV(*listsvp, lcur);
3204     lend = l + lcur;
3205     while (l < lend) {
3206         UV min, max, val, upper;
3207         l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
3208                                          cBOOL(octets), typestr);
3209         if (l > lend) {
3210             break;
3211         }
3212
3213         /* If looking for something beyond this range, go try the next one */
3214         if (max < start)
3215             continue;
3216
3217         /* <end> is generally 1 beyond where we want to set things, but at the
3218          * platform's infinity, where we can't go any higher, we want to
3219          * include the code point at <end> */
3220         upper = (max < end)
3221                 ? max
3222                 : (max != UV_MAX || end != UV_MAX)
3223                   ? end - 1
3224                   : end;
3225
3226         if (octets) {
3227             UV key;
3228             if (min < start) {
3229                 if (!none || val < none) {
3230                     val += start - min;
3231                 }
3232                 min = start;
3233             }
3234             for (key = min; key <= upper; key++) {
3235                 STRLEN offset;
3236                 /* offset must be non-negative (start <= min <= key < end) */
3237                 offset = octets * (key - start);
3238                 if (bits == 8)
3239                     s[offset] = (U8)(val & 0xff);
3240                 else if (bits == 16) {
3241                     s[offset    ] = (U8)((val >>  8) & 0xff);
3242                     s[offset + 1] = (U8)( val        & 0xff);
3243                 }
3244                 else if (bits == 32) {
3245                     s[offset    ] = (U8)((val >> 24) & 0xff);
3246                     s[offset + 1] = (U8)((val >> 16) & 0xff);
3247                     s[offset + 2] = (U8)((val >>  8) & 0xff);
3248                     s[offset + 3] = (U8)( val        & 0xff);
3249                 }
3250
3251                 if (!none || val < none)
3252                     ++val;
3253             }
3254         }
3255         else { /* bits == 1, then val should be ignored */
3256             UV key;
3257             if (min < start)
3258                 min = start;
3259
3260             for (key = min; key <= upper; key++) {
3261                 const STRLEN offset = (STRLEN)(key - start);
3262                 s[offset >> 3] |= 1 << (offset & 7);
3263             }
3264         }
3265     } /* while */
3266
3267     /* Invert if the data says it should be.  Assumes that bits == 1 */
3268     if (invert_it_svp && SvUV(*invert_it_svp)) {
3269
3270         /* Unicode properties should come with all bits above PERL_UNICODE_MAX
3271          * be 0, and their inversion should also be 0, as we don't succeed any
3272          * Unicode property matches for non-Unicode code points */
3273         if (start <= PERL_UNICODE_MAX) {
3274
3275             /* The code below assumes that we never cross the
3276              * Unicode/above-Unicode boundary in a range, as otherwise we would
3277              * have to figure out where to stop flipping the bits.  Since this
3278              * boundary is divisible by a large power of 2, and swatches comes
3279              * in small powers of 2, this should be a valid assumption */
3280             assert(start + span - 1 <= PERL_UNICODE_MAX);
3281
3282             send = s + scur;
3283             while (s < send) {
3284                 *s = ~(*s);
3285                 s++;
3286             }
3287         }
3288     }
3289
3290     /* read $swash->{EXTRAS}
3291      * This code also copied to swash_to_invlist() below */
3292     x = (U8*)SvPV(*extssvp, xcur);
3293     xend = x + xcur;
3294     while (x < xend) {
3295         STRLEN namelen;
3296         U8 *namestr;
3297         SV** othersvp;
3298         HV* otherhv;
3299         STRLEN otherbits;
3300         SV **otherbitssvp, *other;
3301         U8 *s, *o, *nl;
3302         STRLEN slen, olen;
3303
3304         const U8 opc = *x++;
3305         if (opc == '\n')
3306             continue;
3307
3308         nl = (U8*)memchr(x, '\n', xend - x);
3309
3310         if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
3311             if (nl) {
3312                 x = nl + 1; /* 1 is length of "\n" */
3313                 continue;
3314             }
3315             else {
3316                 x = xend; /* to EXTRAS' end at which \n is not found */
3317                 break;
3318             }
3319         }
3320
3321         namestr = x;
3322         if (nl) {
3323             namelen = nl - namestr;
3324             x = nl + 1;
3325         }
3326         else {
3327             namelen = xend - namestr;
3328             x = xend;
3329         }
3330
3331         othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
3332         otherhv = MUTABLE_HV(SvRV(*othersvp));
3333         otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
3334         otherbits = (STRLEN)SvUV(*otherbitssvp);
3335         if (bits < otherbits)
3336             Perl_croak(aTHX_ "panic: swatch_get found swatch size mismatch, "
3337                        "bits=%"UVuf", otherbits=%"UVuf, (UV)bits, (UV)otherbits);
3338
3339         /* The "other" swatch must be destroyed after. */
3340         other = swatch_get(*othersvp, start, span);
3341         o = (U8*)SvPV(other, olen);
3342
3343         if (!olen)
3344             Perl_croak(aTHX_ "panic: swatch_get got improper swatch");
3345
3346         s = (U8*)SvPV(swatch, slen);
3347         if (bits == 1 && otherbits == 1) {
3348             if (slen != olen)
3349                 Perl_croak(aTHX_ "panic: swatch_get found swatch length "
3350                            "mismatch, slen=%"UVuf", olen=%"UVuf,
3351                            (UV)slen, (UV)olen);
3352
3353             switch (opc) {
3354             case '+':
3355                 while (slen--)
3356                     *s++ |= *o++;
3357                 break;
3358             case '!':
3359                 while (slen--)
3360                     *s++ |= ~*o++;
3361                 break;
3362             case '-':
3363                 while (slen--)
3364                     *s++ &= ~*o++;
3365                 break;
3366             case '&':
3367                 while (slen--)
3368                     *s++ &= *o++;
3369                 break;
3370             default:
3371                 break;
3372             }
3373         }
3374         else {
3375             STRLEN otheroctets = otherbits >> 3;
3376             STRLEN offset = 0;
3377             U8* const send = s + slen;
3378
3379             while (s < send) {
3380                 UV otherval = 0;
3381
3382                 if (otherbits == 1) {
3383                     otherval = (o[offset >> 3] >> (offset & 7)) & 1;
3384                     ++offset;
3385                 }
3386                 else {
3387                     STRLEN vlen = otheroctets;
3388                     otherval = *o++;
3389                     while (--vlen) {
3390                         otherval <<= 8;
3391                         otherval |= *o++;
3392                     }
3393                 }
3394
3395                 if (opc == '+' && otherval)
3396                     NOOP;   /* replace with otherval */
3397                 else if (opc == '!' && !otherval)
3398                     otherval = 1;
3399                 else if (opc == '-' && otherval)
3400                     otherval = 0;
3401                 else if (opc == '&' && !otherval)
3402                     otherval = 0;
3403                 else {
3404                     s += octets; /* no replacement */
3405                     continue;
3406                 }
3407
3408                 if (bits == 8)
3409                     *s++ = (U8)( otherval & 0xff);
3410                 else if (bits == 16) {
3411                     *s++ = (U8)((otherval >>  8) & 0xff);
3412                     *s++ = (U8)( otherval        & 0xff);
3413                 }
3414                 else if (bits == 32) {
3415                     *s++ = (U8)((otherval >> 24) & 0xff);
3416                     *s++ = (U8)((otherval >> 16) & 0xff);
3417                     *s++ = (U8)((otherval >>  8) & 0xff);
3418                     *s++ = (U8)( otherval        & 0xff);
3419                 }
3420             }
3421         }
3422         sv_free(other); /* through with it! */
3423     } /* while */
3424     return swatch;
3425 }
3426
3427 HV*
3428 Perl__swash_inversion_hash(pTHX_ SV* const swash)
3429 {
3430
3431    /* Subject to change or removal.  For use only in one place in regcomp.c.
3432     * Can't be used on a property that is subject to user override, as it
3433     * relies on the value of SPECIALS in the swash which would be set by
3434     * utf8_heavy.pl to the hash in the non-overriden file, and hence is not set
3435     * for overridden properties
3436     *
3437     * Returns a hash which is the inversion and closure of a swash mapping.
3438     * For example, consider the input lines:
3439     * 004B              006B
3440     * 004C              006C
3441     * 212A              006B
3442     *
3443     * The returned hash would have two keys, the utf8 for 006B and the utf8 for
3444     * 006C.  The value for each key is an array.  For 006C, the array would
3445     * have a two elements, the utf8 for itself, and for 004C.  For 006B, there
3446     * would be three elements in its array, the utf8 for 006B, 004B and 212A.
3447     *
3448     * Essentially, for any code point, it gives all the code points that map to
3449     * it, or the list of 'froms' for that point.
3450     *
3451     * Currently it ignores any additions or deletions from other swashes,
3452     * looking at just the main body of the swash, and if there are SPECIALS
3453     * in the swash, at that hash
3454     *
3455     * The specials hash can be extra code points, and most likely consists of
3456     * maps from single code points to multiple ones (each expressed as a string
3457     * of utf8 characters).   This function currently returns only 1-1 mappings.
3458     * However consider this possible input in the specials hash:
3459     * "\xEF\xAC\x85" => "\x{0073}\x{0074}",         # U+FB05 => 0073 0074
3460     * "\xEF\xAC\x86" => "\x{0073}\x{0074}",         # U+FB06 => 0073 0074
3461     *
3462     * Both FB05 and FB06 map to the same multi-char sequence, which we don't
3463     * currently handle.  But it also means that FB05 and FB06 are equivalent in
3464     * a 1-1 mapping which we should handle, and this relationship may not be in
3465     * the main table.  Therefore this function examines all the multi-char
3466     * sequences and adds the 1-1 mappings that come out of that.  */
3467
3468     U8 *l, *lend;
3469     STRLEN lcur;
3470     HV *const hv = MUTABLE_HV(SvRV(swash));
3471
3472     /* The string containing the main body of the table */
3473     SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
3474
3475     SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
3476     SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
3477     SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
3478     /*SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);*/
3479     const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
3480     const STRLEN bits  = SvUV(*bitssvp);
3481     const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
3482     const UV     none  = SvUV(*nonesvp);
3483     SV **specials_p = hv_fetchs(hv, "SPECIALS", 0);
3484
3485     HV* ret = newHV();
3486
3487     PERL_ARGS_ASSERT__SWASH_INVERSION_HASH;
3488
3489     /* Must have at least 8 bits to get the mappings */
3490     if (bits != 8 && bits != 16 && bits != 32) {
3491         Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %"UVuf,
3492                                                  (UV)bits);
3493     }
3494
3495     if (specials_p) { /* It might be "special" (sometimes, but not always, a
3496                         mapping to more than one character */
3497
3498         /* Construct an inverse mapping hash for the specials */
3499         HV * const specials_hv = MUTABLE_HV(SvRV(*specials_p));
3500         HV * specials_inverse = newHV();
3501         char *char_from; /* the lhs of the map */
3502         I32 from_len;   /* its byte length */
3503         char *char_to;  /* the rhs of the map */
3504         I32 to_len;     /* its byte length */
3505         SV *sv_to;      /* and in a sv */
3506         AV* from_list;  /* list of things that map to each 'to' */
3507
3508         hv_iterinit(specials_hv);
3509
3510         /* The keys are the characters (in utf8) that map to the corresponding
3511          * utf8 string value.  Iterate through the list creating the inverse
3512          * list. */
3513         while ((sv_to = hv_iternextsv(specials_hv, &char_from, &from_len))) {
3514             SV** listp;
3515             if (! SvPOK(sv_to)) {
3516                 Perl_croak(aTHX_ "panic: value returned from hv_iternextsv() "
3517                            "unexpectedly is not a string, flags=%lu",
3518                            (unsigned long)SvFLAGS(sv_to));
3519             }
3520             /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %"UVXf", First char of to is %"UVXf"\n", valid_utf8_to_uvchr((U8*) char_from, 0), valid_utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/
3521
3522             /* Each key in the inverse list is a mapped-to value, and the key's
3523              * hash value is a list of the strings (each in utf8) that map to
3524              * it.  Those strings are all one character long */
3525             if ((listp = hv_fetch(specials_inverse,
3526                                     SvPVX(sv_to),
3527                                     SvCUR(sv_to), 0)))
3528             {
3529                 from_list = (AV*) *listp;
3530             }
3531             else { /* No entry yet for it: create one */
3532                 from_list = newAV();
3533                 if (! hv_store(specials_inverse,
3534                                 SvPVX(sv_to),
3535                                 SvCUR(sv_to),
3536                                 (SV*) from_list, 0))
3537                 {
3538                     Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3539                 }
3540             }
3541
3542             /* Here have the list associated with this 'to' (perhaps newly
3543              * created and empty).  Just add to it.  Note that we ASSUME that
3544              * the input is guaranteed to not have duplications, so we don't
3545              * check for that.  Duplications just slow down execution time. */
3546             av_push(from_list, newSVpvn_utf8(char_from, from_len, TRUE));
3547         }
3548
3549         /* Here, 'specials_inverse' contains the inverse mapping.  Go through
3550          * it looking for cases like the FB05/FB06 examples above.  There would
3551          * be an entry in the hash like
3552         *       'st' => [ FB05, FB06 ]
3553         * In this example we will create two lists that get stored in the
3554         * returned hash, 'ret':
3555         *       FB05 => [ FB05, FB06 ]
3556         *       FB06 => [ FB05, FB06 ]
3557         *
3558         * Note that there is nothing to do if the array only has one element.
3559         * (In the normal 1-1 case handled below, we don't have to worry about
3560         * two lists, as everything gets tied to the single list that is
3561         * generated for the single character 'to'.  But here, we are omitting
3562         * that list, ('st' in the example), so must have multiple lists.) */
3563         while ((from_list = (AV *) hv_iternextsv(specials_inverse,
3564                                                  &char_to, &to_len)))
3565         {
3566             if (av_len(from_list) > 0) {
3567                 int i;
3568
3569                 /* We iterate over all combinations of i,j to place each code
3570                  * point on each list */
3571                 for (i = 0; i <= av_len(from_list); i++) {
3572                     int j;
3573                     AV* i_list = newAV();
3574                     SV** entryp = av_fetch(from_list, i, FALSE);
3575                     if (entryp == NULL) {
3576                         Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
3577                     }
3578                     if (hv_fetch(ret, SvPVX(*entryp), SvCUR(*entryp), FALSE)) {
3579                         Perl_croak(aTHX_ "panic: unexpected entry for %s", SvPVX(*entryp));
3580                     }
3581                     if (! hv_store(ret, SvPVX(*entryp), SvCUR(*entryp),
3582                                    (SV*) i_list, FALSE))
3583                     {
3584                         Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3585                     }
3586
3587                     /* For debugging: UV u = valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/
3588                     for (j = 0; j <= av_len(from_list); j++) {
3589                         entryp = av_fetch(from_list, j, FALSE);
3590                         if (entryp == NULL) {
3591                             Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
3592                         }
3593
3594                         /* When i==j this adds itself to the list */
3595                         av_push(i_list, newSVuv(utf8_to_uvchr_buf(
3596                                         (U8*) SvPVX(*entryp),
3597                                         (U8*) SvPVX(*entryp) + SvCUR(*entryp),
3598                                         0)));
3599                         /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Adding %"UVXf" to list for %"UVXf"\n", valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0), u));*/
3600                     }
3601                 }
3602             }
3603         }
3604         SvREFCNT_dec(specials_inverse); /* done with it */
3605     } /* End of specials */
3606
3607     /* read $swash->{LIST} */
3608     l = (U8*)SvPV(*listsvp, lcur);
3609     lend = l + lcur;
3610
3611     /* Go through each input line */
3612     while (l < lend) {
3613         UV min, max, val;
3614         UV inverse;
3615         l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
3616                                          cBOOL(octets), typestr);
3617         if (l > lend) {
3618             break;
3619         }
3620
3621         /* Each element in the range is to be inverted */
3622         for (inverse = min; inverse <= max; inverse++) {
3623             AV* list;
3624             SV** listp;
3625             IV i;
3626             bool found_key = FALSE;
3627             bool found_inverse = FALSE;
3628
3629             /* The key is the inverse mapping */
3630             char key[UTF8_MAXBYTES+1];
3631             char* key_end = (char *) uvuni_to_utf8((U8*) key, val);
3632             STRLEN key_len = key_end - key;
3633
3634             /* Get the list for the map */
3635             if ((listp = hv_fetch(ret, key, key_len, FALSE))) {
3636                 list = (AV*) *listp;
3637             }
3638             else { /* No entry yet for it: create one */
3639                 list = newAV();
3640                 if (! hv_store(ret, key, key_len, (SV*) list, FALSE)) {
3641                     Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3642                 }
3643             }
3644
3645             /* Look through list to see if this inverse mapping already is
3646              * listed, or if there is a mapping to itself already */
3647             for (i = 0; i <= av_len(list); i++) {
3648                 SV** entryp = av_fetch(list, i, FALSE);
3649                 SV* entry;
3650                 if (entryp == NULL) {
3651                     Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
3652                 }
3653                 entry = *entryp;
3654                 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "list for %"UVXf" contains %"UVXf"\n", val, SvUV(entry)));*/
3655                 if (SvUV(entry) == val) {
3656                     found_key = TRUE;
3657                 }
3658                 if (SvUV(entry) == inverse) {
3659                     found_inverse = TRUE;
3660                 }
3661
3662                 /* No need to continue searching if found everything we are
3663                  * looking for */
3664                 if (found_key && found_inverse) {
3665                     break;
3666                 }
3667             }
3668
3669             /* Make sure there is a mapping to itself on the list */
3670             if (! found_key) {
3671                 av_push(list, newSVuv(val));
3672                 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Adding %"UVXf" to list for %"UVXf"\n", val, val));*/
3673             }
3674
3675
3676             /* Simply add the value to the list */
3677             if (! found_inverse) {
3678                 av_push(list, newSVuv(inverse));
3679                 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Adding %"UVXf" to list for %"UVXf"\n", inverse, val));*/
3680             }
3681
3682             /* swatch_get() increments the value of val for each element in the
3683              * range.  That makes more compact tables possible.  You can
3684              * express the capitalization, for example, of all consecutive
3685              * letters with a single line: 0061\t007A\t0041 This maps 0061 to
3686              * 0041, 0062 to 0042, etc.  I (khw) have never understood 'none',
3687              * and it's not documented; it appears to be used only in
3688              * implementing tr//; I copied the semantics from swatch_get(), just
3689              * in case */
3690             if (!none || val < none) {
3691                 ++val;
3692             }
3693         }
3694     }
3695
3696     return ret;
3697 }
3698
3699 SV*
3700 Perl__swash_to_invlist(pTHX_ SV* const swash)
3701 {
3702
3703    /* Subject to change or removal.  For use only in one place in regcomp.c */
3704
3705     U8 *l, *lend;
3706     char *loc;
3707     STRLEN lcur;
3708     HV *const hv = MUTABLE_HV(SvRV(swash));
3709     UV elements = 0;    /* Number of elements in the inversion list */
3710     U8 empty[] = "";
3711
3712     /* The string containing the main body of the table */
3713     SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
3714     SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
3715     SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
3716     SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
3717     SV** const invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
3718
3719     const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
3720     const STRLEN bits  = SvUV(*bitssvp);
3721     const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
3722     U8 *x, *xend;
3723     STRLEN xcur;
3724
3725     SV* invlist;
3726
3727     PERL_ARGS_ASSERT__SWASH_TO_INVLIST;
3728
3729     /* read $swash->{LIST} */
3730     if (SvPOK(*listsvp)) {
3731         l = (U8*)SvPV(*listsvp, lcur);
3732     }
3733     else {
3734         /* LIST legitimately doesn't contain a string during compilation phases
3735          * of Perl itself, before the Unicode tables are generated.  In this
3736          * case, just fake things up by creating an empty list */
3737         l = empty;
3738         lcur = 0;
3739     }
3740     loc = (char *) l;
3741     lend = l + lcur;
3742
3743     /* Scan the input to count the number of lines to preallocate array size
3744      * based on worst possible case, which is each line in the input creates 2
3745      * elements in the inversion list: 1) the beginning of a range in the list;
3746      * 2) the beginning of a range not in the list.  */
3747     while ((loc = (strchr(loc, '\n'))) != NULL) {
3748         elements += 2;
3749         loc++;
3750     }
3751
3752     /* If the ending is somehow corrupt and isn't a new line, add another
3753      * element for the final range that isn't in the inversion list */
3754     if (! (*lend == '\n'
3755         || (*lend == '\0' && (lcur == 0 || *(lend - 1) == '\n'))))
3756     {
3757         elements++;
3758     }
3759
3760     invlist = _new_invlist(elements);
3761
3762     /* Now go through the input again, adding each range to the list */
3763     while (l < lend) {
3764         UV start, end;
3765         UV val;         /* Not used by this function */
3766
3767         l = S_swash_scan_list_line(aTHX_ l, lend, &start, &end, &val,
3768                                          cBOOL(octets), typestr);
3769
3770         if (l > lend) {
3771             break;
3772         }
3773
3774         invlist = _add_range_to_invlist(invlist, start, end);
3775     }
3776
3777     /* Invert if the data says it should be */
3778     if (invert_it_svp && SvUV(*invert_it_svp)) {
3779         _invlist_invert_prop(invlist);
3780     }
3781
3782     /* This code is copied from swatch_get()
3783      * read $swash->{EXTRAS} */
3784     x = (U8*)SvPV(*extssvp, xcur);
3785     xend = x + xcur;
3786     while (x < xend) {
3787         STRLEN namelen;
3788         U8 *namestr;
3789         SV** othersvp;
3790         HV* otherhv;
3791         STRLEN otherbits;
3792         SV **otherbitssvp, *other;
3793         U8 *nl;
3794
3795         const U8 opc = *x++;
3796         if (opc == '\n')
3797             continue;
3798
3799         nl = (U8*)memchr(x, '\n', xend - x);
3800
3801         if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
3802             if (nl) {
3803                 x = nl + 1; /* 1 is length of "\n" */
3804                 continue;
3805             }
3806             else {
3807                 x = xend; /* to EXTRAS' end at which \n is not found */
3808                 break;
3809             }
3810         }
3811
3812         namestr = x;
3813         if (nl) {
3814             namelen = nl - namestr;
3815             x = nl + 1;
3816         }
3817         else {
3818             namelen = xend - namestr;
3819             x = xend;
3820         }
3821
3822         othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
3823         otherhv = MUTABLE_HV(SvRV(*othersvp));
3824         otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
3825         otherbits = (STRLEN)SvUV(*otherbitssvp);
3826
3827         if (bits != otherbits || bits != 1) {
3828             Perl_croak(aTHX_ "panic: _swash_to_invlist only operates on boolean "
3829                        "properties, bits=%"UVuf", otherbits=%"UVuf,
3830                        (UV)bits, (UV)otherbits);
3831         }
3832
3833         /* The "other" swatch must be destroyed after. */
3834         other = _swash_to_invlist((SV *)*othersvp);
3835
3836         /* End of code copied from swatch_get() */
3837         switch (opc) {
3838         case '+':
3839             _invlist_union(invlist, other, &invlist);
3840             break;
3841         case '!':
3842             _invlist_invert(other);
3843             _invlist_union(invlist, other, &invlist);
3844             break;
3845         case '-':
3846             _invlist_subtract(invlist, other, &invlist);
3847             break;
3848         case '&':
3849             _invlist_intersection(invlist, other, &invlist);
3850             break;
3851         default:
3852             break;
3853         }
3854         sv_free(other); /* through with it! */
3855     }
3856
3857     return invlist;
3858 }
3859
3860 /*
3861 =for apidoc uvchr_to_utf8
3862
3863 Adds the UTF-8 representation of the Native code point C<uv> to the end
3864 of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
3865 bytes available. The return value is the pointer to the byte after the
3866 end of the new character. In other words,
3867
3868     d = uvchr_to_utf8(d, uv);
3869
3870 is the recommended wide native character-aware way of saying
3871
3872     *(d++) = uv;
3873
3874 =cut
3875 */
3876
3877 /* On ASCII machines this is normally a macro but we want a
3878    real function in case XS code wants it
3879 */
3880 U8 *
3881 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
3882 {
3883     PERL_ARGS_ASSERT_UVCHR_TO_UTF8;
3884
3885     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
3886 }
3887
3888 U8 *
3889 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
3890 {
3891     PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS;
3892
3893     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
3894 }
3895
3896 /*
3897 =for apidoc utf8n_to_uvchr
3898
3899 Returns the native character value of the first character in the string
3900 C<s>
3901 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
3902 length, in bytes, of that character.
3903
3904 C<length> and C<flags> are the same as L</utf8n_to_uvuni>().
3905
3906 =cut
3907 */
3908 /* On ASCII machines this is normally a macro but we want
3909    a real function in case XS code wants it
3910 */
3911 UV
3912 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
3913 U32 flags)
3914 {
3915     const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
3916
3917     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
3918
3919     return UNI_TO_NATIVE(uv);
3920 }
3921
3922 bool
3923 Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len)
3924 {
3925     /* May change: warns if surrogates, non-character code points, or
3926      * non-Unicode code points are in s which has length len bytes.  Returns
3927      * TRUE if none found; FALSE otherwise.  The only other validity check is
3928      * to make sure that this won't exceed the string's length */
3929
3930     const U8* const e = s + len;
3931     bool ok = TRUE;
3932
3933     PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
3934
3935     while (s < e) {
3936         if (UTF8SKIP(s) > len) {
3937             Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
3938                            "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
3939             return FALSE;
3940         }
3941         if (UNLIKELY(*s >= UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE)) {
3942             STRLEN char_len;
3943             if (UTF8_IS_SUPER(s)) {
3944                 if (ckWARN_d(WARN_NON_UNICODE)) {
3945                     UV uv = utf8_to_uvchr_buf(s, e, &char_len);
3946                     Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
3947                         "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
3948                     ok = FALSE;
3949                 }
3950             }
3951             else if (UTF8_IS_SURROGATE(s)) {
3952                 if (ckWARN_d(WARN_SURROGATE)) {
3953                     UV uv = utf8_to_uvchr_buf(s, e, &char_len);
3954                     Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
3955                         "Unicode surrogate U+%04"UVXf" is illegal in UTF-8", uv);
3956                     ok = FALSE;
3957                 }
3958             }
3959             else if
3960                 ((UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s))
3961                  && (ckWARN_d(WARN_NONCHAR)))
3962             {
3963                 UV uv = utf8_to_uvchr_buf(s, e, &char_len);
3964                 Perl_warner(aTHX_ packWARN(WARN_NONCHAR),
3965                     "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv);
3966                 ok = FALSE;
3967             }
3968         }
3969         s += UTF8SKIP(s);
3970     }
3971
3972     return ok;
3973 }
3974
3975 /*
3976 =for apidoc pv_uni_display
3977
3978 Build to the scalar C<dsv> a displayable version of the string C<spv>,
3979 length C<len>, the displayable version being at most C<pvlim> bytes long
3980 (if longer, the rest is truncated and "..." will be appended).
3981
3982 The C<flags> argument can have UNI_DISPLAY_ISPRINT set to display
3983 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
3984 to display the \\[nrfta\\] as the backslashed versions (like '\n')
3985 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
3986 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
3987 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
3988
3989 The pointer to the PV of the C<dsv> is returned.
3990
3991 =cut */
3992 char *
3993 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
3994 {
3995     int truncated = 0;
3996     const char *s, *e;
3997
3998     PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
3999
4000     sv_setpvs(dsv, "");
4001     SvUTF8_off(dsv);
4002     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
4003          UV u;
4004           /* This serves double duty as a flag and a character to print after
4005              a \ when flags & UNI_DISPLAY_BACKSLASH is true.
4006           */
4007          char ok = 0;
4008
4009          if (pvlim && SvCUR(dsv) >= pvlim) {
4010               truncated++;
4011               break;
4012          }
4013          u = utf8_to_uvchr_buf((U8*)s, (U8*)e, 0);
4014          if (u < 256) {
4015              const unsigned char c = (unsigned char)u & 0xFF;
4016              if (flags & UNI_DISPLAY_BACKSLASH) {
4017                  switch (c) {
4018                  case '\n':
4019                      ok = 'n'; break;
4020                  case '\r':
4021                      ok = 'r'; break;
4022                  case '\t':
4023                      ok = 't'; break;
4024                  case '\f':
4025                      ok = 'f'; break;
4026                  case '\a':
4027                      ok = 'a'; break;
4028                  case '\\':
4029                      ok = '\\'; break;
4030                  default: break;
4031                  }
4032                  if (ok) {
4033                      const char string = ok;
4034                      sv_catpvs(dsv, "\\");
4035                      sv_catpvn(dsv, &string, 1);
4036                  }
4037              }
4038              /* isPRINT() is the locale-blind version. */
4039              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
4040                  const char string = c;
4041                  sv_catpvn(dsv, &string, 1);
4042                  ok = 1;
4043              }
4044          }
4045          if (!ok)
4046              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
4047     }
4048     if (truncated)
4049          sv_catpvs(dsv, "...");
4050
4051     return SvPVX(dsv);
4052 }
4053
4054 /*
4055 =for apidoc sv_uni_display
4056
4057 Build to the scalar C<dsv> a displayable version of the scalar C<sv>,
4058 the displayable version being at most C<pvlim> bytes long
4059 (if longer, the rest is truncated and "..." will be appended).
4060
4061 The C<flags> argument is as in L</pv_uni_display>().
4062
4063 The pointer to the PV of the C<dsv> is returned.
4064
4065 =cut
4066 */
4067 char *
4068 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
4069 {
4070     PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
4071
4072      return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
4073                                 SvCUR(ssv), pvlim, flags);
4074 }
4075
4076 /*
4077 =for apidoc foldEQ_utf8
4078
4079 Returns true if the leading portions of the strings C<s1> and C<s2> (either or both
4080 of which may be in UTF-8) are the same case-insensitively; false otherwise.
4081 How far into the strings to compare is determined by other input parameters.
4082
4083 If C<u1> is true, the string C<s1> is assumed to be in UTF-8-encoded Unicode;
4084 otherwise it is assumed to be in native 8-bit encoding.  Correspondingly for C<u2>
4085 with respect to C<s2>.
4086
4087 If the byte length C<l1> is non-zero, it says how far into C<s1> to check for fold
4088 equality.  In other words, C<s1>+C<l1> will be used as a goal to reach.  The
4089 scan will not be considered to be a match unless the goal is reached, and
4090 scanning won't continue past that goal.  Correspondingly for C<l2> with respect to
4091 C<s2>.
4092
4093 If C<pe1> is non-NULL and the pointer it points to is not NULL, that pointer is
4094 considered an end pointer beyond which scanning of C<s1> will not continue under
4095 any circumstances.  This means that if both C<l1> and C<pe1> are specified, and
4096 C<pe1>
4097 is less than C<s1>+C<l1>, the match will never be successful because it can
4098 never
4099 get as far as its goal (and in fact is asserted against).  Correspondingly for
4100 C<pe2> with respect to C<s2>.
4101
4102 At least one of C<s1> and C<s2> must have a goal (at least one of C<l1> and
4103 C<l2> must be non-zero), and if both do, both have to be
4104 reached for a successful match.   Also, if the fold of a character is multiple
4105 characters, all of them must be matched (see tr21 reference below for
4106 'folding').
4107
4108 Upon a successful match, if C<pe1> is non-NULL,
4109 it will be set to point to the beginning of the I<next> character of C<s1>
4110 beyond what was matched.  Correspondingly for C<pe2> and C<s2>.
4111
4112 For case-insensitiveness, the "casefolding" of Unicode is used
4113 instead of upper/lowercasing both the characters, see
4114 L<http://www.unicode.org/unicode/reports/tr21/> (Case Mappings).
4115
4116 =cut */
4117
4118 /* A flags parameter has been added which may change, and hence isn't
4119  * externally documented.  Currently it is:
4120  *  0 for as-documented above
4121  *  FOLDEQ_UTF8_NOMIX_ASCII meaning that if a non-ASCII character folds to an
4122                             ASCII one, to not match
4123  *  FOLDEQ_UTF8_LOCALE      meaning that locale rules are to be used for code
4124  *                          points below 256; unicode rules for above 255; and
4125  *                          folds that cross those boundaries are disallowed,
4126  *                          like the NOMIX_ASCII option
4127  *  FOLDEQ_S1_ALREADY_FOLDED s1 has already been folded before calling this
4128  *                           routine.  This allows that step to be skipped.
4129  *  FOLDEQ_S2_ALREADY_FOLDED   Similarly.
4130  */
4131 I32
4132 Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2, U32 flags)
4133 {
4134     dVAR;
4135     register const U8 *p1  = (const U8*)s1; /* Point to current char */
4136     register const U8 *p2  = (const U8*)s2;
4137     register const U8 *g1 = NULL;       /* goal for s1 */
4138     register const U8 *g2 = NULL;
4139     register const U8 *e1 = NULL;       /* Don't scan s1 past this */
4140     register U8 *f1 = NULL;             /* Point to current folded */
4141     register const U8 *e2 = NULL;
4142     register U8 *f2 = NULL;
4143     STRLEN n1 = 0, n2 = 0;              /* Number of bytes in current char */
4144     U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
4145     U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
4146
4147     PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
4148
4149     /* The algorithm requires that input with the flags on the first line of
4150      * the assert not be pre-folded. */
4151     assert( ! ((flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_UTF8_LOCALE))
4152         && (flags & (FOLDEQ_S1_ALREADY_FOLDED | FOLDEQ_S2_ALREADY_FOLDED))));
4153
4154     if (pe1) {
4155         e1 = *(U8**)pe1;
4156     }
4157
4158     if (l1) {
4159         g1 = (const U8*)s1 + l1;
4160     }
4161
4162     if (pe2) {
4163         e2 = *(U8**)pe2;
4164     }
4165
4166     if (l2) {
4167         g2 = (const U8*)s2 + l2;
4168     }
4169
4170     /* Must have at least one goal */
4171     assert(g1 || g2);
4172
4173     if (g1) {
4174
4175         /* Will never match if goal is out-of-bounds */
4176         assert(! e1  || e1 >= g1);
4177
4178         /* Here, there isn't an end pointer, or it is beyond the goal.  We
4179         * only go as far as the goal */
4180         e1 = g1;
4181     }
4182     else {
4183         assert(e1);    /* Must have an end for looking at s1 */
4184     }
4185
4186     /* Same for goal for s2 */
4187     if (g2) {
4188         assert(! e2  || e2 >= g2);
4189         e2 = g2;
4190     }
4191     else {
4192         assert(e2);
4193     }
4194
4195     /* If both operands are already folded, we could just do a memEQ on the
4196      * whole strings at once, but it would be better if the caller realized
4197      * this and didn't even call us */
4198
4199     /* Look through both strings, a character at a time */
4200     while (p1 < e1 && p2 < e2) {
4201
4202         /* If at the beginning of a new character in s1, get its fold to use
4203          * and the length of the fold.  (exception: locale rules just get the
4204          * character to a single byte) */
4205         if (n1 == 0) {
4206             if (flags & FOLDEQ_S1_ALREADY_FOLDED) {
4207                 f1 = (U8 *) p1;
4208                 n1 = UTF8SKIP(f1);
4209             }
4210
4211             else {
4212                 /* If in locale matching, we use two sets of rules, depending
4213                  * on if the code point is above or below 255.  Here, we test
4214                  * for and handle locale rules */
4215                 if ((flags & FOLDEQ_UTF8_LOCALE)
4216                     && (! u1 || UTF8_IS_INVARIANT(*p1)
4217                         || UTF8_IS_DOWNGRADEABLE_START(*p1)))
4218                 {
4219                     /* There is no mixing of code points above and below 255. */
4220                     if (u2 && (! UTF8_IS_INVARIANT(*p2)
4221                         && ! UTF8_IS_DOWNGRADEABLE_START(*p2)))
4222                     {
4223                         return 0;
4224                     }
4225
4226                     /* We handle locale rules by converting, if necessary, the
4227                      * code point to a single byte. */
4228                     if (! u1 || UTF8_IS_INVARIANT(*p1)) {
4229                         *foldbuf1 = *p1;
4230                     }
4231                     else {
4232                         *foldbuf1 = TWO_BYTE_UTF8_TO_UNI(*p1, *(p1 + 1));
4233                     }
4234                     n1 = 1;
4235                 }
4236                 else if (isASCII(*p1)) {    /* Note, that here won't be both
4237                                                ASCII and using locale rules */
4238
4239                     /* If trying to mix non- with ASCII, and not supposed to,
4240                      * fail */
4241                     if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) {
4242                         return 0;
4243                     }
4244                     n1 = 1;
4245                     *foldbuf1 = toLOWER(*p1);   /* Folds in the ASCII range are
4246                                                    just lowercased */
4247                 }
4248                 else if (u1) {
4249                     to_utf8_fold(p1, foldbuf1, &n1);
4250                 }
4251                 else {  /* Not utf8, get utf8 fold */
4252                     to_uni_fold(NATIVE_TO_UNI(*p1), foldbuf1, &n1);
4253                 }
4254                 f1 = foldbuf1;
4255             }
4256         }
4257
4258         if (n2 == 0) {    /* Same for s2 */
4259             if (flags & FOLDEQ_S2_ALREADY_FOLDED) {
4260                 f2 = (U8 *) p2;
4261                 n2 = UTF8SKIP(f2);
4262             }
4263             else {
4264                 if ((flags & FOLDEQ_UTF8_LOCALE)
4265                     && (! u2 || UTF8_IS_INVARIANT(*p2) || UTF8_IS_DOWNGRADEABLE_START(*p2)))
4266                 {
4267                     /* Here, the next char in s2 is < 256.  We've already
4268                      * worked on s1, and if it isn't also < 256, can't match */
4269                     if (u1 && (! UTF8_IS_INVARIANT(*p1)
4270                         && ! UTF8_IS_DOWNGRADEABLE_START(*p1)))
4271                     {
4272                         return 0;
4273                     }
4274                     if (! u2 || UTF8_IS_INVARIANT(*p2)) {
4275                         *foldbuf2 = *p2;
4276                     }
4277                     else {
4278                         *foldbuf2 = TWO_BYTE_UTF8_TO_UNI(*p2, *(p2 + 1));
4279                     }
4280
4281                     /* Use another function to handle locale rules.  We've made
4282                      * sure that both characters to compare are single bytes */
4283                     if (! foldEQ_locale((char *) f1, (char *) foldbuf2, 1)) {
4284                         return 0;
4285                     }
4286                     n1 = n2 = 0;
4287                 }
4288                 else if (isASCII(*p2)) {
4289                     if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p1)) {
4290                         return 0;
4291                     }
4292                     n2 = 1;
4293                     *foldbuf2 = toLOWER(*p2);
4294                 }
4295                 else if (u2) {
4296                     to_utf8_fold(p2, foldbuf2, &n2);
4297                 }
4298                 else {
4299                     to_uni_fold(NATIVE_TO_UNI(*p2), foldbuf2, &n2);
4300                 }
4301                 f2 = foldbuf2;
4302             }
4303         }
4304
4305         /* Here f1 and f2 point to the beginning of the strings to compare.
4306          * These strings are the folds of the next character from each input
4307          * string, stored in utf8. */
4308
4309         /* While there is more to look for in both folds, see if they
4310         * continue to match */
4311         while (n1 && n2) {
4312             U8 fold_length = UTF8SKIP(f1);
4313             if (fold_length != UTF8SKIP(f2)
4314                 || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE
4315                                                        function call for single
4316                                                        byte */
4317                 || memNE((char*)f1, (char*)f2, fold_length))
4318             {
4319                 return 0; /* mismatch */
4320             }
4321
4322             /* Here, they matched, advance past them */
4323             n1 -= fold_length;
4324             f1 += fold_length;
4325             n2 -= fold_length;
4326             f2 += fold_length;
4327         }
4328
4329         /* When reach the end of any fold, advance the input past it */
4330         if (n1 == 0) {
4331             p1 += u1 ? UTF8SKIP(p1) : 1;
4332         }
4333         if (n2 == 0) {
4334             p2 += u2 ? UTF8SKIP(p2) : 1;
4335         }
4336     } /* End of loop through both strings */
4337
4338     /* A match is defined by each scan that specified an explicit length
4339     * reaching its final goal, and the other not having matched a partial
4340     * character (which can happen when the fold of a character is more than one
4341     * character). */
4342     if (! ((g1 == 0 || p1 == g1) && (g2 == 0 || p2 == g2)) || n1 || n2) {
4343         return 0;
4344     }
4345
4346     /* Successful match.  Set output pointers */
4347     if (pe1) {
4348         *pe1 = (char*)p1;
4349     }
4350     if (pe2) {
4351         *pe2 = (char*)p2;
4352     }
4353     return 1;
4354 }
4355
4356 /*
4357  * Local variables:
4358  * c-indentation-style: bsd
4359  * c-basic-offset: 4
4360  * indent-tabs-mode: t
4361  * End:
4362  *
4363  * ex: set ts=8 sts=4 sw=4 noet:
4364  */