utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
   4  *    by Larry Wall and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  * 'What a fix!' said Sam.  'That's the one place in all the lands we've ever
  13  *  heard of that we don't want to see any closer; and that's the one place
  14  *  we're trying to get to!  And that's just where we can't get, nohow.'
  15  *
  16  *     [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
  17  *
  18  * 'Well do I understand your speech,' he answered in the same language;
  19  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  20  *  as is the custom in the West, if you wish to be answered?'
  21  *                           --Gandalf, addressing Théoden's door wardens
  22  *
  23  *     [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
  24  *
  25  * ...the travellers perceived that the floor was paved with stones of many
  26  * hues; branching runes and strange devices intertwined beneath their feet.
  27  *
  28  *     [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
  29  */
  30
  31 #include "EXTERN.h"
  32 #define PERL_IN_UTF8_C
  33 #include "perl.h"
  34
  35 #ifndef EBCDIC
  36 /* Separate prototypes needed because in ASCII systems these are
  37  * usually macros but they still are compiled as code, too. */
  38 PERL_CALLCONV UV        Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
  39 PERL_CALLCONV U8*       Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
  40 #endif
  41
  42 static const char unees[] =
  43     "Malformed UTF-8 character (unexpected end of string)";
  44
  45 /*
  46 =head1 Unicode Support
  47
  48 This file contains various utility functions for manipulating UTF8-encoded
  49 strings. For the uninitiated, this is a method of representing arbitrary
  50 Unicode characters as a variable number of bytes, in such a way that
  51 characters in the ASCII range are unmodified, and a zero byte never appears
  52 within non-zero characters.
  53
  54 =cut
  55 */
  56
  57 /*
  58 =for apidoc is_ascii_string
  59
  60 Returns true if the first C<len> bytes of the given string are the same whether
  61 or not the string is encoded in UTF-8 (or UTF-EBCDIC on EBCDIC machines).  That
  62 is, if they are invariant.  On ASCII-ish machines, only ASCII characters
  63 fit this definition, hence the function's name.
  64
  65 See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
  66
  67 =cut
  68 */
  69
  70 bool
  71 Perl_is_ascii_string(const U8 *s, STRLEN len)
  72 {
  73     const U8* const send = s + (len ? len : strlen((const char *)s));
  74     const U8* x = s;
  75
  76     PERL_ARGS_ASSERT_IS_ASCII_STRING;
  77
  78     for (; x < send; ++x) {
  79         if (!UTF8_IS_INVARIANT(*x))
  80             break;
  81     }
  82
  83     return x == send;
  84 }
  85
  86 /*
  87 =for apidoc uvuni_to_utf8_flags
  88
  89 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
  90 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
  91 bytes available. The return value is the pointer to the byte after the
  92 end of the new character. In other words,
  93
  94     d = uvuni_to_utf8_flags(d, uv, flags);
  95
  96 or, in most cases,
  97
  98     d = uvuni_to_utf8(d, uv);
  99
 100 (which is equivalent to)
 101
 102     d = uvuni_to_utf8_flags(d, uv, 0);
 103
 104 is the recommended Unicode-aware way of saying
 105
 106     *(d++) = uv;
 107
 108 =cut
 109 */
 110
 111 U8 *
 112 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
 113 {
 114     PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
 115
 116     if (ckWARN(WARN_UTF8)) {
 117          if (UNICODE_IS_SURROGATE(uv) &&
 118              !(flags & UNICODE_ALLOW_SURROGATE))
 119               Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
 120          else if (
 121                   ((uv >= 0xFDD0 && uv <= 0xFDEF &&
 122                     !(flags & UNICODE_ALLOW_FDD0))
 123                    ||
 124                    ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
 125                     !(flags & UNICODE_ALLOW_FFFF))) &&
 126                   /* UNICODE_ALLOW_SUPER includes
 127                    * FFFEs and FFFFs beyond 0x10FFFF. */
 128                   ((uv <= PERL_UNICODE_MAX) ||
 129                    !(flags & UNICODE_ALLOW_SUPER))
 130                   )
 131               Perl_warner(aTHX_ packWARN(WARN_UTF8),
 132                       "Unicode non-character 0x%04"UVxf" is illegal for interchange", uv);
 133     }
 134     if (UNI_IS_INVARIANT(uv)) {
 135         *d++ = (U8)UTF_TO_NATIVE(uv);
 136         return d;
 137     }
 138 #if defined(EBCDIC)
 139     else {
 140         STRLEN len  = UNISKIP(uv);
 141         U8 *p = d+len-1;
 142         while (p > d) {
 143             *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
 144             uv >>= UTF_ACCUMULATION_SHIFT;
 145         }
 146         *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
 147         return d+len;
 148     }
 149 #else /* Non loop style */
 150     if (uv < 0x800) {
 151         *d++ = (U8)(( uv >>  6)         | 0xc0);
 152         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 153         return d;
 154     }
 155     if (uv < 0x10000) {
 156         *d++ = (U8)(( uv >> 12)         | 0xe0);
 157         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 158         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 159         return d;
 160     }
 161     if (uv < 0x200000) {
 162         *d++ = (U8)(( uv >> 18)         | 0xf0);
 163         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 164         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 165         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 166         return d;
 167     }
 168     if (uv < 0x4000000) {
 169         *d++ = (U8)(( uv >> 24)         | 0xf8);
 170         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 171         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 172         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 173         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 174         return d;
 175     }
 176     if (uv < 0x80000000) {
 177         *d++ = (U8)(( uv >> 30)         | 0xfc);
 178         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 179         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 180         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 181         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 182         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 183         return d;
 184     }
 185 #ifdef HAS_QUAD
 186     if (uv < UTF8_QUAD_MAX)
 187 #endif
 188     {
 189         *d++ =                            0xfe; /* Can't match U+FEFF! */
 190         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 191         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 192         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 193         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 194         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 195         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 196         return d;
 197     }
 198 #ifdef HAS_QUAD
 199     {
 200         *d++ =                            0xff;         /* Can't match U+FFFE! */
 201         *d++ =                            0x80;         /* 6 Reserved bits */
 202         *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);        /* 2 Reserved bits */
 203         *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
 204         *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
 205         *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
 206         *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
 207         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 208         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 209         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 210         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 211         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 212         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 213         return d;
 214     }
 215 #endif
 216 #endif /* Loop style */
 217 }
 218
 219 /*
 220
 221 Tests if some arbitrary number of bytes begins in a valid UTF-8
 222 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 223 UTF-8 character.  The actual number of bytes in the UTF-8 character
 224 will be returned if it is valid, otherwise 0.
 225
 226 This is the "slow" version as opposed to the "fast" version which is
 227 the "unrolled" IS_UTF8_CHAR().  E.g. for t/uni/class.t the speed
 228 difference is a factor of 2 to 3.  For lengths (UTF8SKIP(s)) of four
 229 or less you should use the IS_UTF8_CHAR(), for lengths of five or more
 230 you should use the _slow().  In practice this means that the _slow()
 231 will be used very rarely, since the maximum Unicode code point (as of
 232 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes.  Only
 233 the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
 234 five bytes or more.
 235
 236 =cut */
 237 STATIC STRLEN
 238 S_is_utf8_char_slow(const U8 *s, const STRLEN len)
 239 {
 240     U8 u = *s;
 241     STRLEN slen;
 242     UV uv, ouv;
 243
 244     PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
 245
 246     if (UTF8_IS_INVARIANT(u))
 247         return 1;
 248
 249     if (!UTF8_IS_START(u))
 250         return 0;
 251
 252     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 253         return 0;
 254
 255     slen = len - 1;
 256     s++;
 257 #ifdef EBCDIC
 258     u = NATIVE_TO_UTF(u);
 259 #endif
 260     u &= UTF_START_MASK(len);
 261     uv  = u;
 262     ouv = uv;
 263     while (slen--) {
 264         if (!UTF8_IS_CONTINUATION(*s))
 265             return 0;
 266         uv = UTF8_ACCUMULATE(uv, *s);
 267         if (uv < ouv)
 268             return 0;
 269         ouv = uv;
 270         s++;
 271     }
 272
 273     if ((STRLEN)UNISKIP(uv) < len)
 274         return 0;
 275
 276     return len;
 277 }
 278
 279 /*
 280 =for apidoc is_utf8_char
 281
 282 Tests if some arbitrary number of bytes begins in a valid UTF-8
 283 character.  Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
 284 character is a valid UTF-8 character.  The actual number of bytes in the UTF-8
 285 character will be returned if it is valid, otherwise 0.
 286
 287 =cut */
 288 STRLEN
 289 Perl_is_utf8_char(const U8 *s)
 290 {
 291     const STRLEN len = UTF8SKIP(s);
 292
 293     PERL_ARGS_ASSERT_IS_UTF8_CHAR;
 294 #ifdef IS_UTF8_CHAR
 295     if (IS_UTF8_CHAR_FAST(len))
 296         return IS_UTF8_CHAR(s, len) ? len : 0;
 297 #endif /* #ifdef IS_UTF8_CHAR */
 298     return is_utf8_char_slow(s, len);
 299 }
 300
 301
 302 /*
 303 =for apidoc is_utf8_string
 304
 305 Returns true if first C<len> bytes of the given string form a valid
 306 UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
 307 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
 308 because a valid ASCII string is a valid UTF-8 string.
 309
 310 See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
 311
 312 =cut
 313 */
 314
 315 bool
 316 Perl_is_utf8_string(const U8 *s, STRLEN len)
 317 {
 318     const U8* const send = s + (len ? len : strlen((const char *)s));
 319     const U8* x = s;
 320
 321     PERL_ARGS_ASSERT_IS_UTF8_STRING;
 322
 323     while (x < send) {
 324         STRLEN c;
 325          /* Inline the easy bits of is_utf8_char() here for speed... */
 326          if (UTF8_IS_INVARIANT(*x))
 327               c = 1;
 328          else if (!UTF8_IS_START(*x))
 329              goto out;
 330          else {
 331               /* ... and call is_utf8_char() only if really needed. */
 332 #ifdef IS_UTF8_CHAR
 333              c = UTF8SKIP(x);
 334              if (IS_UTF8_CHAR_FAST(c)) {
 335                  if (!IS_UTF8_CHAR(x, c))
 336                      c = 0;
 337              }
 338              else
 339                 c = is_utf8_char_slow(x, c);
 340 #else
 341              c = is_utf8_char(x);
 342 #endif /* #ifdef IS_UTF8_CHAR */
 343               if (!c)
 344                   goto out;
 345          }
 346         x += c;
 347     }
 348
 349  out:
 350     if (x != send)
 351         return FALSE;
 352
 353     return TRUE;
 354 }
 355
 356 /*
 357 Implemented as a macro in utf8.h
 358
 359 =for apidoc is_utf8_string_loc
 360
 361 Like is_utf8_string() but stores the location of the failure (in the
 362 case of "utf8ness failure") or the location s+len (in the case of
 363 "utf8ness success") in the C<ep>.
 364
 365 See also is_utf8_string_loclen() and is_utf8_string().
 366
 367 =for apidoc is_utf8_string_loclen
 368
 369 Like is_utf8_string() but stores the location of the failure (in the
 370 case of "utf8ness failure") or the location s+len (in the case of
 371 "utf8ness success") in the C<ep>, and the number of UTF-8
 372 encoded characters in the C<el>.
 373
 374 See also is_utf8_string_loc() and is_utf8_string().
 375
 376 =cut
 377 */
 378
 379 bool
 380 Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
 381 {
 382     const U8* const send = s + (len ? len : strlen((const char *)s));
 383     const U8* x = s;
 384     STRLEN c;
 385     STRLEN outlen = 0;
 386
 387     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
 388
 389     while (x < send) {
 390          /* Inline the easy bits of is_utf8_char() here for speed... */
 391          if (UTF8_IS_INVARIANT(*x))
 392              c = 1;
 393          else if (!UTF8_IS_START(*x))
 394              goto out;
 395          else {
 396              /* ... and call is_utf8_char() only if really needed. */
 397 #ifdef IS_UTF8_CHAR
 398              c = UTF8SKIP(x);
 399              if (IS_UTF8_CHAR_FAST(c)) {
 400                  if (!IS_UTF8_CHAR(x, c))
 401                      c = 0;
 402              } else
 403                  c = is_utf8_char_slow(x, c);
 404 #else
 405              c = is_utf8_char(x);
 406 #endif /* #ifdef IS_UTF8_CHAR */
 407              if (!c)
 408                  goto out;
 409          }
 410          x += c;
 411          outlen++;
 412     }
 413
 414  out:
 415     if (el)
 416         *el = outlen;
 417
 418     if (ep)
 419         *ep = x;
 420     return (x == send);
 421 }
 422
 423 /*
 424
 425 =for apidoc utf8n_to_uvuni
 426
 427 Bottom level UTF-8 decode routine.
 428 Returns the Unicode code point value of the first character in the string C<s>
 429 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
 430 C<retlen> will be set to the length, in bytes, of that character.
 431
 432 If C<s> does not point to a well-formed UTF-8 character, the behaviour
 433 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 434 it is assumed that the caller will raise a warning, and this function
 435 will silently just set C<retlen> to C<-1> and return zero.  If the
 436 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 437 malformations will be given, C<retlen> will be set to the expected
 438 length of the UTF-8 character in bytes, and zero will be returned.
 439
 440 The C<flags> can also contain various flags to allow deviations from
 441 the strict UTF-8 encoding (see F<utf8.h>).
 442
 443 Most code should use utf8_to_uvchr() rather than call this directly.
 444
 445 =cut
 446 */
 447
 448 UV
 449 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 450 {
 451     dVAR;
 452     const U8 * const s0 = s;
 453     UV uv = *s, ouv = 0;
 454     STRLEN len = 1;
 455     const bool dowarn = ckWARN_d(WARN_UTF8);
 456     const UV startbyte = *s;
 457     STRLEN expectlen = 0;
 458     U32 warning = 0;
 459     SV* sv;
 460
 461     PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
 462
 463 /* This list is a superset of the UTF8_ALLOW_XXX.  BUT it isn't, eg SUPER missing XXX */
 464
 465 #define UTF8_WARN_EMPTY                          1
 466 #define UTF8_WARN_CONTINUATION                   2
 467 #define UTF8_WARN_NON_CONTINUATION               3
 468 #define UTF8_WARN_FE_FF                          4
 469 #define UTF8_WARN_SHORT                          5
 470 #define UTF8_WARN_OVERFLOW                       6
 471 #define UTF8_WARN_SURROGATE                      7
 472 #define UTF8_WARN_LONG                           8
 473 #define UTF8_WARN_FFFF                           9 /* Also FFFE. */
 474
 475     if (curlen == 0 &&
 476         !(flags & UTF8_ALLOW_EMPTY)) {
 477         warning = UTF8_WARN_EMPTY;
 478         goto malformed;
 479     }
 480
 481     if (UTF8_IS_INVARIANT(uv)) {
 482         if (retlen)
 483             *retlen = 1;
 484         return (UV) (NATIVE_TO_UTF(*s));
 485     }
 486
 487     if (UTF8_IS_CONTINUATION(uv) &&
 488         !(flags & UTF8_ALLOW_CONTINUATION)) {
 489         warning = UTF8_WARN_CONTINUATION;
 490         goto malformed;
 491     }
 492
 493     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 494         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 495         warning = UTF8_WARN_NON_CONTINUATION;
 496         goto malformed;
 497     }
 498
 499 #ifdef EBCDIC
 500     uv = NATIVE_TO_UTF(uv);
 501 #else
 502     if ((uv == 0xfe || uv == 0xff) &&
 503         !(flags & UTF8_ALLOW_FE_FF)) {
 504         warning = UTF8_WARN_FE_FF;
 505         goto malformed;
 506     }
 507 #endif
 508
 509     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 510     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 511     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 512     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 513 #ifdef EBCDIC
 514     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 515     else                        { len =  7; uv &= 0x01; }
 516 #else
 517     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 518     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 519     else                        { len = 13; uv = 0; } /* whoa! */
 520 #endif
 521
 522     if (retlen)
 523         *retlen = len;
 524
 525     expectlen = len;
 526
 527     if ((curlen < expectlen) &&
 528         !(flags & UTF8_ALLOW_SHORT)) {
 529         warning = UTF8_WARN_SHORT;
 530         goto malformed;
 531     }
 532
 533     len--;
 534     s++;
 535     ouv = uv;
 536
 537     while (len--) {
 538         if (!UTF8_IS_CONTINUATION(*s) &&
 539             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 540             s--;
 541             warning = UTF8_WARN_NON_CONTINUATION;
 542             goto malformed;
 543         }
 544         else
 545             uv = UTF8_ACCUMULATE(uv, *s);
 546         if (!(uv > ouv)) {
 547             /* These cannot be allowed. */
 548             if (uv == ouv) {
 549                 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
 550                     warning = UTF8_WARN_LONG;
 551                     goto malformed;
 552                 }
 553             }
 554             else { /* uv < ouv */
 555                 /* This cannot be allowed. */
 556                 warning = UTF8_WARN_OVERFLOW;
 557                 goto malformed;
 558             }
 559         }
 560         s++;
 561         ouv = uv;
 562     }
 563
 564     if (UNICODE_IS_SURROGATE(uv) &&
 565         !(flags & UTF8_ALLOW_SURROGATE)) {
 566         warning = UTF8_WARN_SURROGATE;
 567         goto malformed;
 568     } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
 569                !(flags & UTF8_ALLOW_LONG)) {
 570         warning = UTF8_WARN_LONG;
 571         goto malformed;
 572     } else if (UNICODE_IS_ILLEGAL(uv) &&
 573                !(flags & UTF8_ALLOW_FFFF)) {
 574         warning = UTF8_WARN_FFFF;
 575         goto malformed;
 576     }
 577
 578     return uv;
 579
 580 malformed:
 581
 582     if (flags & UTF8_CHECK_ONLY) {
 583         if (retlen)
 584             *retlen = ((STRLEN) -1);
 585         return 0;
 586     }
 587
 588     if (dowarn) {
 589         if (warning == UTF8_WARN_FFFF) {
 590             sv = newSVpvs_flags("Unicode non-character ", SVs_TEMP);
 591             Perl_sv_catpvf(aTHX_ sv, "0x%04"UVxf" is illegal for interchange", uv);
 592         }
 593         else {
 594             sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);
 595
 596             switch (warning) {
 597                 case 0: /* Intentionally empty. */ break;
 598                 case UTF8_WARN_EMPTY:
 599                     sv_catpvs(sv, "(empty string)");
 600                     break;
 601                 case UTF8_WARN_CONTINUATION:
 602                     Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 603                     break;
 604                 case UTF8_WARN_NON_CONTINUATION:
 605                     if (s == s0)
 606                         Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 607                                    (UV)s[1], startbyte);
 608                     else {
 609                         const int len = (int)(s-s0);
 610                         Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 611                                    (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
 612                     }
 613
 614                     break;
 615                 case UTF8_WARN_FE_FF:
 616                     Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 617                     break;
 618                 case UTF8_WARN_SHORT:
 619                     Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 620                                    (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
 621                     expectlen = curlen;         /* distance for caller to skip */
 622                     break;
 623                 case UTF8_WARN_OVERFLOW:
 624                     Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 625                                    ouv, *s, startbyte);
 626                     break;
 627                 case UTF8_WARN_SURROGATE:
 628                     Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 629                     break;
 630                 case UTF8_WARN_LONG:
 631                     Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 632                                    (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 633                     break;
 634                 default:
 635                     sv_catpvs(sv, "(unknown reason)");
 636                     break;
 637             }
 638         }
 639
 640         if (warning) {
 641             const char * const s = SvPVX_const(sv);
 642
 643             if (PL_op)
 644                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 645                             "%s in %s", s,  OP_DESC(PL_op));
 646             else
 647                 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
 648         }
 649     }
 650
 651     if (retlen)
 652         *retlen = expectlen ? expectlen : len;
 653
 654     return 0;
 655 }
 656
 657 /*
 658 =for apidoc utf8_to_uvchr
 659
 660 Returns the native character value of the first character in the string C<s>
 661 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 662 length, in bytes, of that character.
 663
 664 If C<s> does not point to a well-formed UTF-8 character, zero is
 665 returned and retlen is set, if possible, to -1.
 666
 667 =cut
 668 */
 669
 670 UV
 671 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
 672 {
 673     PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
 674
 675     return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
 676                           ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 677 }
 678
 679 /*
 680 =for apidoc utf8_to_uvuni
 681
 682 Returns the Unicode code point of the first character in the string C<s>
 683 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 684 length, in bytes, of that character.
 685
 686 This function should only be used when the returned UV is considered
 687 an index into the Unicode semantic tables (e.g. swashes).
 688
 689 If C<s> does not point to a well-formed UTF-8 character, zero is
 690 returned and retlen is set, if possible, to -1.
 691
 692 =cut
 693 */
 694
 695 UV
 696 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
 697 {
 698     PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
 699
 700     /* Call the low level routine asking for checks */
 701     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
 702                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 703 }
 704
 705 /*
 706 =for apidoc utf8_length
 707
 708 Return the length of the UTF-8 char encoded string C<s> in characters.
 709 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 710 up past C<e>, croaks.
 711
 712 =cut
 713 */
 714
 715 STRLEN
 716 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
 717 {
 718     dVAR;
 719     STRLEN len = 0;
 720
 721     PERL_ARGS_ASSERT_UTF8_LENGTH;
 722
 723     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 724      * the bitops (especially ~) can create illegal UTF-8.
 725      * In other words: in Perl UTF-8 is not just for Unicode. */
 726
 727     if (e < s)
 728         goto warn_and_return;
 729     while (s < e) {
 730         if (!UTF8_IS_INVARIANT(*s))
 731             s += UTF8SKIP(s);
 732         else
 733             s++;
 734         len++;
 735     }
 736
 737     if (e != s) {
 738         len--;
 739         warn_and_return:
 740         if (PL_op)
 741             Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
 742                              "%s in %s", unees, OP_DESC(PL_op));
 743         else
 744             Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), unees);
 745     }
 746
 747     return len;
 748 }
 749
 750 /*
 751 =for apidoc utf8_distance
 752
 753 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
 754 and C<b>.
 755
 756 WARNING: use only if you *know* that the pointers point inside the
 757 same UTF-8 buffer.
 758
 759 =cut
 760 */
 761
 762 IV
 763 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
 764 {
 765     PERL_ARGS_ASSERT_UTF8_DISTANCE;
 766
 767     return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
 768 }
 769
 770 /*
 771 =for apidoc utf8_hop
 772
 773 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 774 forward or backward.
 775
 776 WARNING: do not use the following unless you *know* C<off> is within
 777 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 778 on the first byte of character or just after the last byte of a character.
 779
 780 =cut
 781 */
 782
 783 U8 *
 784 Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
 785 {
 786     PERL_ARGS_ASSERT_UTF8_HOP;
 787
 788     PERL_UNUSED_CONTEXT;
 789     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 790      * the bitops (especially ~) can create illegal UTF-8.
 791      * In other words: in Perl UTF-8 is not just for Unicode. */
 792
 793     if (off >= 0) {
 794         while (off--)
 795             s += UTF8SKIP(s);
 796     }
 797     else {
 798         while (off++) {
 799             s--;
 800             while (UTF8_IS_CONTINUATION(*s))
 801                 s--;
 802         }
 803     }
 804     return (U8 *)s;
 805 }
 806
 807 /*
 808 =for apidoc utf8_to_bytes
 809
 810 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
 811 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 812 updates len to contain the new length.
 813 Returns zero on failure, setting C<len> to -1.
 814
 815 If you need a copy of the string, see C<bytes_from_utf8>.
 816
 817 =cut
 818 */
 819
 820 U8 *
 821 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 822 {
 823     U8 * const save = s;
 824     U8 * const send = s + *len;
 825     U8 *d;
 826
 827     PERL_ARGS_ASSERT_UTF8_TO_BYTES;
 828
 829     /* ensure valid UTF-8 and chars < 256 before updating string */
 830     while (s < send) {
 831         U8 c = *s++;
 832
 833         if (!UTF8_IS_INVARIANT(c) &&
 834             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 835              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 836             *len = ((STRLEN) -1);
 837             return 0;
 838         }
 839     }
 840
 841     d = s = save;
 842     while (s < send) {
 843         STRLEN ulen;
 844         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 845         s += ulen;
 846     }
 847     *d = '\0';
 848     *len = d - save;
 849     return save;
 850 }
 851
 852 /*
 853 =for apidoc bytes_from_utf8
 854
 855 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
 856 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 857 the newly-created string, and updates C<len> to contain the new
 858 length.  Returns the original string if no conversion occurs, C<len>
 859 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 860 0 if C<s> is converted or consisted entirely of characters that are invariant
 861 in utf8 (i.e., US-ASCII on non-EBCDIC machines).
 862
 863 =cut
 864 */
 865
 866 U8 *
 867 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
 868 {
 869     U8 *d;
 870     const U8 *start = s;
 871     const U8 *send;
 872     I32 count = 0;
 873
 874     PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
 875
 876     PERL_UNUSED_CONTEXT;
 877     if (!*is_utf8)
 878         return (U8 *)start;
 879
 880     /* ensure valid UTF-8 and chars < 256 before converting string */
 881     for (send = s + *len; s < send;) {
 882         U8 c = *s++;
 883         if (!UTF8_IS_INVARIANT(c)) {
 884             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 885                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 886                 count++;
 887             else
 888                 return (U8 *)start;
 889         }
 890     }
 891
 892     *is_utf8 = FALSE;
 893
 894     Newx(d, (*len) - count + 1, U8);
 895     s = start; start = d;
 896     while (s < send) {
 897         U8 c = *s++;
 898         if (!UTF8_IS_INVARIANT(c)) {
 899             /* Then it is two-byte encoded */
 900             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 901             c = ASCII_TO_NATIVE(c);
 902         }
 903         *d++ = c;
 904     }
 905     *d = '\0';
 906     *len = d - start;
 907     return (U8 *)start;
 908 }
 909
 910 /*
 911 =for apidoc bytes_to_utf8
 912
 913 Converts a string C<s> of length C<len> from the native encoding into UTF-8.
 914 Returns a pointer to the newly-created string, and sets C<len> to
 915 reflect the new length.
 916
 917 A NUL character will be written after the end of the string.
 918
 919 If you want to convert to UTF-8 from encodings other than
 920 the native (Latin1 or EBCDIC),
 921 see sv_recode_to_utf8().
 922
 923 =cut
 924 */
 925
 926 U8*
 927 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
 928 {
 929     const U8 * const send = s + (*len);
 930     U8 *d;
 931     U8 *dst;
 932
 933     PERL_ARGS_ASSERT_BYTES_TO_UTF8;
 934     PERL_UNUSED_CONTEXT;
 935
 936     Newx(d, (*len) * 2 + 1, U8);
 937     dst = d;
 938
 939     while (s < send) {
 940         const UV uv = NATIVE_TO_ASCII(*s++);
 941         if (UNI_IS_INVARIANT(uv))
 942             *d++ = (U8)UTF_TO_NATIVE(uv);
 943         else {
 944             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
 945             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
 946         }
 947     }
 948     *d = '\0';
 949     *len = d-dst;
 950     return dst;
 951 }
 952
 953 /*
 954  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 955  *
 956  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 957  * We optimize for native, for obvious reasons. */
 958
 959 U8*
 960 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 961 {
 962     U8* pend;
 963     U8* dstart = d;
 964
 965     PERL_ARGS_ASSERT_UTF16_TO_UTF8;
 966
 967     if (bytelen & 1)
 968         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
 969
 970     pend = p + bytelen;
 971
 972     while (p < pend) {
 973         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 974         p += 2;
 975         if (uv < 0x80) {
 976 #ifdef EBCDIC
 977             *d++ = UNI_TO_NATIVE(uv);
 978 #else
 979             *d++ = (U8)uv;
 980 #endif
 981             continue;
 982         }
 983         if (uv < 0x800) {
 984             *d++ = (U8)(( uv >>  6)         | 0xc0);
 985             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 986             continue;
 987         }
 988         if (uv >= 0xd800 && uv <= 0xdbff) {     /* surrogates */
 989             if (p >= pend) {
 990                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 991             } else {
 992                 UV low = (p[0] << 8) + p[1];
 993                 p += 2;
 994                 if (low < 0xdc00 || low > 0xdfff)
 995                     Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 996                 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 997             }
 998         } else if (uv >= 0xdc00 && uv <= 0xdfff) {
 999             Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
1000         }
1001         if (uv < 0x10000) {
1002             *d++ = (U8)(( uv >> 12)         | 0xe0);
1003             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
1004             *d++ = (U8)(( uv        & 0x3f) | 0x80);
1005             continue;
1006         }
1007         else {
1008             *d++ = (U8)(( uv >> 18)         | 0xf0);
1009             *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
1010             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
1011             *d++ = (U8)(( uv        & 0x3f) | 0x80);
1012             continue;
1013         }
1014     }
1015     *newlen = d - dstart;
1016     return d;
1017 }
1018
1019 /* Note: this one is slightly destructive of the source. */
1020
1021 U8*
1022 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
1023 {
1024     U8* s = (U8*)p;
1025     U8* const send = s + bytelen;
1026
1027     PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
1028
1029     if (bytelen & 1)
1030         Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf,
1031                    (UV)bytelen);
1032
1033     while (s < send) {
1034         const U8 tmp = s[0];
1035         s[0] = s[1];
1036         s[1] = tmp;
1037         s += 2;
1038     }
1039     return utf16_to_utf8(p, d, bytelen, newlen);
1040 }
1041
1042 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
1043
1044 bool
1045 Perl_is_uni_alnum(pTHX_ UV c)
1046 {
1047     U8 tmpbuf[UTF8_MAXBYTES+1];
1048     uvchr_to_utf8(tmpbuf, c);
1049     return is_utf8_alnum(tmpbuf);
1050 }
1051
1052 bool
1053 Perl_is_uni_idfirst(pTHX_ UV c)
1054 {
1055     U8 tmpbuf[UTF8_MAXBYTES+1];
1056     uvchr_to_utf8(tmpbuf, c);
1057     return is_utf8_idfirst(tmpbuf);
1058 }
1059
1060 bool
1061 Perl_is_uni_alpha(pTHX_ UV c)
1062 {
1063     U8 tmpbuf[UTF8_MAXBYTES+1];
1064     uvchr_to_utf8(tmpbuf, c);
1065     return is_utf8_alpha(tmpbuf);
1066 }
1067
1068 bool
1069 Perl_is_uni_ascii(pTHX_ UV c)
1070 {
1071     U8 tmpbuf[UTF8_MAXBYTES+1];
1072     uvchr_to_utf8(tmpbuf, c);
1073     return is_utf8_ascii(tmpbuf);
1074 }
1075
1076 bool
1077 Perl_is_uni_space(pTHX_ UV c)
1078 {
1079     U8 tmpbuf[UTF8_MAXBYTES+1];
1080     uvchr_to_utf8(tmpbuf, c);
1081     return is_utf8_space(tmpbuf);
1082 }
1083
1084 bool
1085 Perl_is_uni_digit(pTHX_ UV c)
1086 {
1087     U8 tmpbuf[UTF8_MAXBYTES+1];
1088     uvchr_to_utf8(tmpbuf, c);
1089     return is_utf8_digit(tmpbuf);
1090 }
1091
1092 bool
1093 Perl_is_uni_upper(pTHX_ UV c)
1094 {
1095     U8 tmpbuf[UTF8_MAXBYTES+1];
1096     uvchr_to_utf8(tmpbuf, c);
1097     return is_utf8_upper(tmpbuf);
1098 }
1099
1100 bool
1101 Perl_is_uni_lower(pTHX_ UV c)
1102 {
1103     U8 tmpbuf[UTF8_MAXBYTES+1];
1104     uvchr_to_utf8(tmpbuf, c);
1105     return is_utf8_lower(tmpbuf);
1106 }
1107
1108 bool
1109 Perl_is_uni_cntrl(pTHX_ UV c)
1110 {
1111     U8 tmpbuf[UTF8_MAXBYTES+1];
1112     uvchr_to_utf8(tmpbuf, c);
1113     return is_utf8_cntrl(tmpbuf);
1114 }
1115
1116 bool
1117 Perl_is_uni_graph(pTHX_ UV c)
1118 {
1119     U8 tmpbuf[UTF8_MAXBYTES+1];
1120     uvchr_to_utf8(tmpbuf, c);
1121     return is_utf8_graph(tmpbuf);
1122 }
1123
1124 bool
1125 Perl_is_uni_print(pTHX_ UV c)
1126 {
1127     U8 tmpbuf[UTF8_MAXBYTES+1];
1128     uvchr_to_utf8(tmpbuf, c);
1129     return is_utf8_print(tmpbuf);
1130 }
1131
1132 bool
1133 Perl_is_uni_punct(pTHX_ UV c)
1134 {
1135     U8 tmpbuf[UTF8_MAXBYTES+1];
1136     uvchr_to_utf8(tmpbuf, c);
1137     return is_utf8_punct(tmpbuf);
1138 }
1139
1140 bool
1141 Perl_is_uni_xdigit(pTHX_ UV c)
1142 {
1143     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1144     uvchr_to_utf8(tmpbuf, c);
1145     return is_utf8_xdigit(tmpbuf);
1146 }
1147
1148 UV
1149 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1150 {
1151     PERL_ARGS_ASSERT_TO_UNI_UPPER;
1152
1153     uvchr_to_utf8(p, c);
1154     return to_utf8_upper(p, p, lenp);
1155 }
1156
1157 UV
1158 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1159 {
1160     PERL_ARGS_ASSERT_TO_UNI_TITLE;
1161
1162     uvchr_to_utf8(p, c);
1163     return to_utf8_title(p, p, lenp);
1164 }
1165
1166 UV
1167 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1168 {
1169     PERL_ARGS_ASSERT_TO_UNI_LOWER;
1170
1171     uvchr_to_utf8(p, c);
1172     return to_utf8_lower(p, p, lenp);
1173 }
1174
1175 UV
1176 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1177 {
1178     PERL_ARGS_ASSERT_TO_UNI_FOLD;
1179
1180     uvchr_to_utf8(p, c);
1181     return to_utf8_fold(p, p, lenp);
1182 }
1183
1184 /* for now these all assume no locale info available for Unicode > 255 */
1185
1186 bool
1187 Perl_is_uni_alnum_lc(pTHX_ UV c)
1188 {
1189     return is_uni_alnum(c);     /* XXX no locale support yet */
1190 }
1191
1192 bool
1193 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1194 {
1195     return is_uni_idfirst(c);   /* XXX no locale support yet */
1196 }
1197
1198 bool
1199 Perl_is_uni_alpha_lc(pTHX_ UV c)
1200 {
1201     return is_uni_alpha(c);     /* XXX no locale support yet */
1202 }
1203
1204 bool
1205 Perl_is_uni_ascii_lc(pTHX_ UV c)
1206 {
1207     return is_uni_ascii(c);     /* XXX no locale support yet */
1208 }
1209
1210 bool
1211 Perl_is_uni_space_lc(pTHX_ UV c)
1212 {
1213     return is_uni_space(c);     /* XXX no locale support yet */
1214 }
1215
1216 bool
1217 Perl_is_uni_digit_lc(pTHX_ UV c)
1218 {
1219     return is_uni_digit(c);     /* XXX no locale support yet */
1220 }
1221
1222 bool
1223 Perl_is_uni_upper_lc(pTHX_ UV c)
1224 {
1225     return is_uni_upper(c);     /* XXX no locale support yet */
1226 }
1227
1228 bool
1229 Perl_is_uni_lower_lc(pTHX_ UV c)
1230 {
1231     return is_uni_lower(c);     /* XXX no locale support yet */
1232 }
1233
1234 bool
1235 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1236 {
1237     return is_uni_cntrl(c);     /* XXX no locale support yet */
1238 }
1239
1240 bool
1241 Perl_is_uni_graph_lc(pTHX_ UV c)
1242 {
1243     return is_uni_graph(c);     /* XXX no locale support yet */
1244 }
1245
1246 bool
1247 Perl_is_uni_print_lc(pTHX_ UV c)
1248 {
1249     return is_uni_print(c);     /* XXX no locale support yet */
1250 }
1251
1252 bool
1253 Perl_is_uni_punct_lc(pTHX_ UV c)
1254 {
1255     return is_uni_punct(c);     /* XXX no locale support yet */
1256 }
1257
1258 bool
1259 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1260 {
1261     return is_uni_xdigit(c);    /* XXX no locale support yet */
1262 }
1263
1264 U32
1265 Perl_to_uni_upper_lc(pTHX_ U32 c)
1266 {
1267     /* XXX returns only the first character -- do not use XXX */
1268     /* XXX no locale support yet */
1269     STRLEN len;
1270     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1271     return (U32)to_uni_upper(c, tmpbuf, &len);
1272 }
1273
1274 U32
1275 Perl_to_uni_title_lc(pTHX_ U32 c)
1276 {
1277     /* XXX returns only the first character XXX -- do not use XXX */
1278     /* XXX no locale support yet */
1279     STRLEN len;
1280     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1281     return (U32)to_uni_title(c, tmpbuf, &len);
1282 }
1283
1284 U32
1285 Perl_to_uni_lower_lc(pTHX_ U32 c)
1286 {
1287     /* XXX returns only the first character -- do not use XXX */
1288     /* XXX no locale support yet */
1289     STRLEN len;
1290     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1291     return (U32)to_uni_lower(c, tmpbuf, &len);
1292 }
1293
1294 static bool
1295 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
1296                  const char *const swashname)
1297 {
1298     dVAR;
1299
1300     PERL_ARGS_ASSERT_IS_UTF8_COMMON;
1301
1302     if (!is_utf8_char(p))
1303         return FALSE;
1304     if (!*swash)
1305         *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
1306     return swash_fetch(*swash, p, TRUE) != 0;
1307 }
1308
1309 bool
1310 Perl_is_utf8_alnum(pTHX_ const U8 *p)
1311 {
1312     dVAR;
1313
1314     PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
1315
1316     /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1317      * descendant of isalnum(3), in other words, it doesn't
1318      * contain the '_'. --jhi */
1319     return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
1320 }
1321
1322 bool
1323 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1324 {
1325     dVAR;
1326
1327     PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
1328
1329     if (*p == '_')
1330         return TRUE;
1331     /* is_utf8_idstart would be more logical. */
1332     return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
1333 }
1334
1335 bool
1336 Perl_is_utf8_idcont(pTHX_ const U8 *p)
1337 {
1338     dVAR;
1339
1340     PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
1341
1342     if (*p == '_')
1343         return TRUE;
1344     return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
1345 }
1346
1347 bool
1348 Perl_is_utf8_alpha(pTHX_ const U8 *p)
1349 {
1350     dVAR;
1351
1352     PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
1353
1354     return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
1355 }
1356
1357 bool
1358 Perl_is_utf8_ascii(pTHX_ const U8 *p)
1359 {
1360     dVAR;
1361
1362     PERL_ARGS_ASSERT_IS_UTF8_ASCII;
1363
1364     return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
1365 }
1366
1367 bool
1368 Perl_is_utf8_space(pTHX_ const U8 *p)
1369 {
1370     dVAR;
1371
1372     PERL_ARGS_ASSERT_IS_UTF8_SPACE;
1373
1374     return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
1375 }
1376
1377 bool
1378 Perl_is_utf8_perl_space(pTHX_ const U8 *p)
1379 {
1380     dVAR;
1381
1382     PERL_ARGS_ASSERT_IS_UTF8_PERL_SPACE;
1383
1384     return is_utf8_common(p, &PL_utf8_perl_space, "IsPerlSpace");
1385 }
1386
1387 bool
1388 Perl_is_utf8_perl_word(pTHX_ const U8 *p)
1389 {
1390     dVAR;
1391
1392     PERL_ARGS_ASSERT_IS_UTF8_PERL_WORD;
1393
1394     return is_utf8_common(p, &PL_utf8_perl_word, "IsPerlWord");
1395 }
1396
1397 bool
1398 Perl_is_utf8_digit(pTHX_ const U8 *p)
1399 {
1400     dVAR;
1401
1402     PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
1403
1404     return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
1405 }
1406
1407 bool
1408 Perl_is_utf8_posix_digit(pTHX_ const U8 *p)
1409 {
1410     dVAR;
1411
1412     PERL_ARGS_ASSERT_IS_UTF8_POSIX_DIGIT;
1413
1414     return is_utf8_common(p, &PL_utf8_posix_digit, "IsPosixDigit");
1415 }
1416
1417 bool
1418 Perl_is_utf8_upper(pTHX_ const U8 *p)
1419 {
1420     dVAR;
1421
1422     PERL_ARGS_ASSERT_IS_UTF8_UPPER;
1423
1424     return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
1425 }
1426
1427 bool
1428 Perl_is_utf8_lower(pTHX_ const U8 *p)
1429 {
1430     dVAR;
1431
1432     PERL_ARGS_ASSERT_IS_UTF8_LOWER;
1433
1434     return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
1435 }
1436
1437 bool
1438 Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1439 {
1440     dVAR;
1441
1442     PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
1443
1444     return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
1445 }
1446
1447 bool
1448 Perl_is_utf8_graph(pTHX_ const U8 *p)
1449 {
1450     dVAR;
1451
1452     PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
1453
1454     return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
1455 }
1456
1457 bool
1458 Perl_is_utf8_print(pTHX_ const U8 *p)
1459 {
1460     dVAR;
1461
1462     PERL_ARGS_ASSERT_IS_UTF8_PRINT;
1463
1464     return is_utf8_common(p, &PL_utf8_print, "IsPrint");
1465 }
1466
1467 bool
1468 Perl_is_utf8_punct(pTHX_ const U8 *p)
1469 {
1470     dVAR;
1471
1472     PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
1473
1474     return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
1475 }
1476
1477 bool
1478 Perl_is_utf8_xdigit(pTHX_ const U8 *p)
1479 {
1480     dVAR;
1481
1482     PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
1483
1484     return is_utf8_common(p, &PL_utf8_xdigit, "IsXDigit");
1485 }
1486
1487 bool
1488 Perl_is_utf8_mark(pTHX_ const U8 *p)
1489 {
1490     dVAR;
1491
1492     PERL_ARGS_ASSERT_IS_UTF8_MARK;
1493
1494     return is_utf8_common(p, &PL_utf8_mark, "IsM");
1495 }
1496
1497 bool
1498 Perl_is_utf8_X_begin(pTHX_ const U8 *p)
1499 {
1500     dVAR;
1501
1502     PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN;
1503
1504     return is_utf8_common(p, &PL_utf8_X_begin, "_X_Begin");
1505 }
1506
1507 bool
1508 Perl_is_utf8_X_extend(pTHX_ const U8 *p)
1509 {
1510     dVAR;
1511
1512     PERL_ARGS_ASSERT_IS_UTF8_X_EXTEND;
1513
1514     return is_utf8_common(p, &PL_utf8_X_extend, "_X_Extend");
1515 }
1516
1517 bool
1518 Perl_is_utf8_X_prepend(pTHX_ const U8 *p)
1519 {
1520     dVAR;
1521
1522     PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND;
1523
1524     return is_utf8_common(p, &PL_utf8_X_prepend, "GCB=Prepend");
1525 }
1526
1527 bool
1528 Perl_is_utf8_X_non_hangul(pTHX_ const U8 *p)
1529 {
1530     dVAR;
1531
1532     PERL_ARGS_ASSERT_IS_UTF8_X_NON_HANGUL;
1533
1534     return is_utf8_common(p, &PL_utf8_X_non_hangul, "HST=Not_Applicable");
1535 }
1536
1537 bool
1538 Perl_is_utf8_X_L(pTHX_ const U8 *p)
1539 {
1540     dVAR;
1541
1542     PERL_ARGS_ASSERT_IS_UTF8_X_L;
1543
1544     return is_utf8_common(p, &PL_utf8_X_L, "GCB=L");
1545 }
1546
1547 bool
1548 Perl_is_utf8_X_LV(pTHX_ const U8 *p)
1549 {
1550     dVAR;
1551
1552     PERL_ARGS_ASSERT_IS_UTF8_X_LV;
1553
1554     return is_utf8_common(p, &PL_utf8_X_LV, "GCB=LV");
1555 }
1556
1557 bool
1558 Perl_is_utf8_X_LVT(pTHX_ const U8 *p)
1559 {
1560     dVAR;
1561
1562     PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
1563
1564     return is_utf8_common(p, &PL_utf8_X_LVT, "GCB=LVT");
1565 }
1566
1567 bool
1568 Perl_is_utf8_X_T(pTHX_ const U8 *p)
1569 {
1570     dVAR;
1571
1572     PERL_ARGS_ASSERT_IS_UTF8_X_T;
1573
1574     return is_utf8_common(p, &PL_utf8_X_T, "GCB=T");
1575 }
1576
1577 bool
1578 Perl_is_utf8_X_V(pTHX_ const U8 *p)
1579 {
1580     dVAR;
1581
1582     PERL_ARGS_ASSERT_IS_UTF8_X_V;
1583
1584     return is_utf8_common(p, &PL_utf8_X_V, "GCB=V");
1585 }
1586
1587 bool
1588 Perl_is_utf8_X_LV_LVT_V(pTHX_ const U8 *p)
1589 {
1590     dVAR;
1591
1592     PERL_ARGS_ASSERT_IS_UTF8_X_LV_LVT_V;
1593
1594     return is_utf8_common(p, &PL_utf8_X_LV_LVT_V, "_X_LV_LVT_V");
1595 }
1596
1597 /*
1598 =for apidoc to_utf8_case
1599
1600 The "p" contains the pointer to the UTF-8 string encoding
1601 the character that is being converted.
1602
1603 The "ustrp" is a pointer to the character buffer to put the
1604 conversion result to.  The "lenp" is a pointer to the length
1605 of the result.
1606
1607 The "swashp" is a pointer to the swash to use.
1608
1609 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1610 and loaded by SWASHNEW, using lib/utf8_heavy.pl.  The special (usually,
1611 but not always, a multicharacter mapping), is tried first.
1612
1613 The "special" is a string like "utf8::ToSpecLower", which means the
1614 hash %utf8::ToSpecLower.  The access to the hash is through
1615 Perl_to_utf8_case().
1616
1617 The "normal" is a string like "ToLower" which means the swash
1618 %utf8::ToLower.
1619
1620 =cut */
1621
1622 UV
1623 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1624                         SV **swashp, const char *normal, const char *special)
1625 {
1626     dVAR;
1627     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1628     STRLEN len = 0;
1629     const UV uv0 = utf8_to_uvchr(p, NULL);
1630     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1631      * are necessary in EBCDIC, they are redundant no-ops
1632      * in ASCII-ish platforms, and hopefully optimized away. */
1633     const UV uv1 = NATIVE_TO_UNI(uv0);
1634
1635     PERL_ARGS_ASSERT_TO_UTF8_CASE;
1636
1637     uvuni_to_utf8(tmpbuf, uv1);
1638
1639     if (!*swashp) /* load on-demand */
1640          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1641     /* This is the beginnings of a skeleton of code to read the info section
1642      * that is in all the swashes in case we ever want to do that, so one can
1643      * read things whose maps aren't code points, and whose default if missing
1644      * is not to the code point itself.  This was just to see if it actually
1645      * worked.  Details on what the possibilities are are in perluniprops.pod
1646         HV * const hv = get_hv("utf8::SwashInfo", 0);
1647         if (hv) {
1648          SV **svp;
1649          svp = hv_fetch(hv, (const char*)normal, strlen(normal), FALSE);
1650              const char *s;
1651
1652               HV * const this_hash = SvRV(*svp);
1653                 svp = hv_fetch(this_hash, "type", strlen("type"), FALSE);
1654               s = SvPV_const(*svp, len);
1655         }
1656     }*/
1657
1658     if (special) {
1659          /* It might be "special" (sometimes, but not always,
1660           * a multicharacter mapping) */
1661          HV * const hv = get_hv(special, 0);
1662          SV **svp;
1663
1664          if (hv &&
1665              (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1666              (*svp)) {
1667              const char *s;
1668
1669               s = SvPV_const(*svp, len);
1670               if (len == 1)
1671                    len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1672               else {
1673 #ifdef EBCDIC
1674                    /* If we have EBCDIC we need to remap the characters
1675                     * since any characters in the low 256 are Unicode
1676                     * code points, not EBCDIC. */
1677                    U8 *t = (U8*)s, *tend = t + len, *d;
1678
1679                    d = tmpbuf;
1680                    if (SvUTF8(*svp)) {
1681                         STRLEN tlen = 0;
1682
1683                         while (t < tend) {
1684                              const UV c = utf8_to_uvchr(t, &tlen);
1685                              if (tlen > 0) {
1686                                   d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1687                                   t += tlen;
1688                              }
1689                              else
1690                                   break;
1691                         }
1692                    }
1693                    else {
1694                         while (t < tend) {
1695                              d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1696                              t++;
1697                         }
1698                    }
1699                    len = d - tmpbuf;
1700                    Copy(tmpbuf, ustrp, len, U8);
1701 #else
1702                    Copy(s, ustrp, len, U8);
1703 #endif
1704               }
1705          }
1706     }
1707
1708     if (!len && *swashp) {
1709         const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1710
1711          if (uv2) {
1712               /* It was "normal" (a single character mapping). */
1713               const UV uv3 = UNI_TO_NATIVE(uv2);
1714               len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1715          }
1716     }
1717
1718     if (!len) /* Neither: just copy.  In other words, there was no mapping
1719                  defined, which means that the code point maps to itself */
1720          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1721
1722     if (lenp)
1723          *lenp = len;
1724
1725     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1726 }
1727
1728 /*
1729 =for apidoc to_utf8_upper
1730
1731 Convert the UTF-8 encoded character at p to its uppercase version and
1732 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1733 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1734 the uppercase version may be longer than the original character.
1735
1736 The first character of the uppercased version is returned
1737 (but note, as explained above, that there may be more.)
1738
1739 =cut */
1740
1741 UV
1742 Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1743 {
1744     dVAR;
1745
1746     PERL_ARGS_ASSERT_TO_UTF8_UPPER;
1747
1748     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1749                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1750 }
1751
1752 /*
1753 =for apidoc to_utf8_title
1754
1755 Convert the UTF-8 encoded character at p to its titlecase version and
1756 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1757 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1758 titlecase version may be longer than the original character.
1759
1760 The first character of the titlecased version is returned
1761 (but note, as explained above, that there may be more.)
1762
1763 =cut */
1764
1765 UV
1766 Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1767 {
1768     dVAR;
1769
1770     PERL_ARGS_ASSERT_TO_UTF8_TITLE;
1771
1772     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1773                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1774 }
1775
1776 /*
1777 =for apidoc to_utf8_lower
1778
1779 Convert the UTF-8 encoded character at p to its lowercase version and
1780 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1781 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1782 lowercase version may be longer than the original character.
1783
1784 The first character of the lowercased version is returned
1785 (but note, as explained above, that there may be more.)
1786
1787 =cut */
1788
1789 UV
1790 Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1791 {
1792     dVAR;
1793
1794     PERL_ARGS_ASSERT_TO_UTF8_LOWER;
1795
1796     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1797                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1798 }
1799
1800 /*
1801 =for apidoc to_utf8_fold
1802
1803 Convert the UTF-8 encoded character at p to its foldcase version and
1804 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1805 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1806 foldcase version may be longer than the original character (up to
1807 three characters).
1808
1809 The first character of the foldcased version is returned
1810 (but note, as explained above, that there may be more.)
1811
1812 =cut */
1813
1814 UV
1815 Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1816 {
1817     dVAR;
1818
1819     PERL_ARGS_ASSERT_TO_UTF8_FOLD;
1820
1821     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1822                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1823 }
1824
1825 /* Note:
1826  * A "swash" is a swatch hash.
1827  * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1828  * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1829  * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1830  */
1831 SV*
1832 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
1833 {
1834     dVAR;
1835     SV* retval;
1836     dSP;
1837     const size_t pkg_len = strlen(pkg);
1838     const size_t name_len = strlen(name);
1839     HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
1840     SV* errsv_save;
1841
1842     PERL_ARGS_ASSERT_SWASH_INIT;
1843
1844     PUSHSTACKi(PERLSI_MAGIC);
1845     ENTER;
1846     SAVEHINTS();
1847     save_re_context();
1848     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1849         ENTER;
1850         errsv_save = newSVsv(ERRSV);
1851         /* It is assumed that callers of this routine are not passing in any
1852            user derived data.  */
1853         /* Need to do this after save_re_context() as it will set PL_tainted to
1854            1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1855            Even line to create errsv_save can turn on PL_tainted.  */
1856         SAVEBOOL(PL_tainted);
1857         PL_tainted = 0;
1858         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1859                          NULL);
1860         if (!SvTRUE(ERRSV))
1861             sv_setsv(ERRSV, errsv_save);
1862         SvREFCNT_dec(errsv_save);
1863         LEAVE;
1864     }
1865     SPAGAIN;
1866     PUSHMARK(SP);
1867     EXTEND(SP,5);
1868     mPUSHp(pkg, pkg_len);
1869     mPUSHp(name, name_len);
1870     PUSHs(listsv);
1871     mPUSHi(minbits);
1872     mPUSHi(none);
1873     PUTBACK;
1874     errsv_save = newSVsv(ERRSV);
1875     if (call_method("SWASHNEW", G_SCALAR))
1876         retval = newSVsv(*PL_stack_sp--);
1877     else
1878         retval = &PL_sv_undef;
1879     if (!SvTRUE(ERRSV))
1880         sv_setsv(ERRSV, errsv_save);
1881     SvREFCNT_dec(errsv_save);
1882     LEAVE;
1883     POPSTACK;
1884     if (IN_PERL_COMPILETIME) {
1885         CopHINTS_set(PL_curcop, PL_hints);
1886     }
1887     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1888         if (SvPOK(retval))
1889             Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1890                        SVfARG(retval));
1891         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1892     }
1893     return retval;
1894 }
1895
1896
1897 /* This API is wrong for special case conversions since we may need to
1898  * return several Unicode characters for a single Unicode character
1899  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1900  * the lower-level routine, and it is similarly broken for returning
1901  * multiple values.  --jhi */
1902 /* Now SWASHGET is recasted into S_swash_get in this file. */
1903
1904 /* Note:
1905  * Returns the value of property/mapping C<swash> for the first character
1906  * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1907  * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1908  * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1909  */
1910 UV
1911 Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
1912 {
1913     dVAR;
1914     HV *const hv = MUTABLE_HV(SvRV(swash));
1915     U32 klen;
1916     U32 off;
1917     STRLEN slen;
1918     STRLEN needents;
1919     const U8 *tmps = NULL;
1920     U32 bit;
1921     SV *swatch;
1922     U8 tmputf8[2];
1923     const UV c = NATIVE_TO_ASCII(*ptr);
1924
1925     PERL_ARGS_ASSERT_SWASH_FETCH;
1926
1927     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1928         tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1929         tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1930         ptr = tmputf8;
1931     }
1932     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1933      * then the "swatch" is a vec() for all the chars which start
1934      * with 0xAA..0xYY
1935      * So the key in the hash (klen) is length of encoded char -1
1936      */
1937     klen = UTF8SKIP(ptr) - 1;
1938     off  = ptr[klen];
1939
1940     if (klen == 0) {
1941       /* If char is invariant then swatch is for all the invariant chars
1942        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1943        */
1944         needents = UTF_CONTINUATION_MARK;
1945         off      = NATIVE_TO_UTF(ptr[klen]);
1946     }
1947     else {
1948       /* If char is encoded then swatch is for the prefix */
1949         needents = (1 << UTF_ACCUMULATION_SHIFT);
1950         off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1951     }
1952
1953     /*
1954      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1955      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1956      * it's nothing to sniff at.)  Pity we usually come through at least
1957      * two function calls to get here...
1958      *
1959      * NB: this code assumes that swatches are never modified, once generated!
1960      */
1961
1962     if (hv   == PL_last_swash_hv &&
1963         klen == PL_last_swash_klen &&
1964         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1965     {
1966         tmps = PL_last_swash_tmps;
1967         slen = PL_last_swash_slen;
1968     }
1969     else {
1970         /* Try our second-level swatch cache, kept in a hash. */
1971         SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1972
1973         /* If not cached, generate it via swash_get */
1974         if (!svp || !SvPOK(*svp)
1975                  || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
1976             /* We use utf8n_to_uvuni() as we want an index into
1977                Unicode tables, not a native character number.
1978              */
1979             const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1980                                            ckWARN(WARN_UTF8) ?
1981                                            0 : UTF8_ALLOW_ANY);
1982             swatch = swash_get(swash,
1983                     /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1984                                 (klen) ? (code_point & ~(needents - 1)) : 0,
1985                                 needents);
1986
1987             if (IN_PERL_COMPILETIME)
1988                 CopHINTS_set(PL_curcop, PL_hints);
1989
1990             svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
1991
1992             if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1993                      || (slen << 3) < needents)
1994                 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
1995         }
1996
1997         PL_last_swash_hv = hv;
1998         assert(klen <= sizeof(PL_last_swash_key));
1999         PL_last_swash_klen = (U8)klen;
2000         /* FIXME change interpvar.h?  */
2001         PL_last_swash_tmps = (U8 *) tmps;
2002         PL_last_swash_slen = slen;
2003         if (klen)
2004             Copy(ptr, PL_last_swash_key, klen, U8);
2005     }
2006
2007     switch ((int)((slen << 3) / needents)) {
2008     case 1:
2009         bit = 1 << (off & 7);
2010         off >>= 3;
2011         return (tmps[off] & bit) != 0;
2012     case 8:
2013         return tmps[off];
2014     case 16:
2015         off <<= 1;
2016         return (tmps[off] << 8) + tmps[off + 1] ;
2017     case 32:
2018         off <<= 2;
2019         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
2020     }
2021     Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
2022     NORETURN_FUNCTION_END;
2023 }
2024
2025 /* Note:
2026  * Returns a swatch (a bit vector string) for a code point sequence
2027  * that starts from the value C<start> and comprises the number C<span>.
2028  * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
2029  * Should be used via swash_fetch, which will cache the swatch in C<swash>.
2030  */
2031 STATIC SV*
2032 S_swash_get(pTHX_ SV* swash, UV start, UV span)
2033 {
2034     SV *swatch;
2035     U8 *l, *lend, *x, *xend, *s;
2036     STRLEN lcur, xcur, scur;
2037     HV *const hv = MUTABLE_HV(SvRV(swash));
2038     SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
2039     SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
2040     SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
2041     SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
2042     SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
2043     const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
2044     const int  typeto  = typestr[0] == 'T' && typestr[1] == 'o';
2045     const STRLEN bits  = SvUV(*bitssvp);
2046     const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
2047     const UV     none  = SvUV(*nonesvp);
2048     const UV     end   = start + span;
2049
2050     PERL_ARGS_ASSERT_SWASH_GET;
2051
2052     if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
2053         Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
2054                                                  (UV)bits);
2055     }
2056
2057     /* create and initialize $swatch */
2058     scur   = octets ? (span * octets) : (span + 7) / 8;
2059     swatch = newSV(scur);
2060     SvPOK_on(swatch);
2061     s = (U8*)SvPVX(swatch);
2062     if (octets && none) {
2063         const U8* const e = s + scur;
2064         while (s < e) {
2065             if (bits == 8)
2066                 *s++ = (U8)(none & 0xff);
2067             else if (bits == 16) {
2068                 *s++ = (U8)((none >>  8) & 0xff);
2069                 *s++ = (U8)( none        & 0xff);
2070             }
2071             else if (bits == 32) {
2072                 *s++ = (U8)((none >> 24) & 0xff);
2073                 *s++ = (U8)((none >> 16) & 0xff);
2074                 *s++ = (U8)((none >>  8) & 0xff);
2075                 *s++ = (U8)( none        & 0xff);
2076             }
2077         }
2078         *s = '\0';
2079     }
2080     else {
2081         (void)memzero((U8*)s, scur + 1);
2082     }
2083     SvCUR_set(swatch, scur);
2084     s = (U8*)SvPVX(swatch);
2085
2086     /* read $swash->{LIST} */
2087     l = (U8*)SvPV(*listsvp, lcur);
2088     lend = l + lcur;
2089     while (l < lend) {
2090         UV min, max, val;
2091         STRLEN numlen;
2092         I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
2093
2094         U8* const nl = (U8*)memchr(l, '\n', lend - l);
2095
2096         numlen = lend - l;
2097         min = grok_hex((char *)l, &numlen, &flags, NULL);
2098         if (numlen)
2099             l += numlen;
2100         else if (nl) {
2101             l = nl + 1; /* 1 is length of "\n" */
2102             continue;
2103         }
2104         else {
2105             l = lend; /* to LIST's end at which \n is not found */
2106             break;
2107         }
2108
2109         if (isBLANK(*l)) {
2110             ++l;
2111             flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
2112             numlen = lend - l;
2113             max = grok_hex((char *)l, &numlen, &flags, NULL);
2114             if (numlen)
2115                 l += numlen;
2116             else
2117                 max = min;
2118
2119             if (octets) {
2120                 if (isBLANK(*l)) {
2121                     ++l;
2122                     flags = PERL_SCAN_SILENT_ILLDIGIT |
2123                             PERL_SCAN_DISALLOW_PREFIX;
2124                     numlen = lend - l;
2125                     val = grok_hex((char *)l, &numlen, &flags, NULL);
2126                     if (numlen)
2127                         l += numlen;
2128                     else
2129                         val = 0;
2130                 }
2131                 else {
2132                     val = 0;
2133                     if (typeto) {
2134                         Perl_croak(aTHX_ "%s: illegal mapping '%s'",
2135                                          typestr, l);
2136                     }
2137                 }
2138             }
2139             else
2140                 val = 0; /* bits == 1, then val should be ignored */
2141         }
2142         else {
2143             max = min;
2144             if (octets) {
2145                 val = 0;
2146                 if (typeto) {
2147                     Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
2148                 }
2149             }
2150             else
2151                 val = 0; /* bits == 1, then val should be ignored */
2152         }
2153
2154         if (nl)
2155             l = nl + 1;
2156         else
2157             l = lend;
2158
2159         if (max < start)
2160             continue;
2161
2162         if (octets) {
2163             UV key;
2164             if (min < start) {
2165                 if (!none || val < none) {
2166                     val += start - min;
2167                 }
2168                 min = start;
2169             }
2170             for (key = min; key <= max; key++) {
2171                 STRLEN offset;
2172                 if (key >= end)
2173                     goto go_out_list;
2174                 /* offset must be non-negative (start <= min <= key < end) */
2175                 offset = octets * (key - start);
2176                 if (bits == 8)
2177                     s[offset] = (U8)(val & 0xff);
2178                 else if (bits == 16) {
2179                     s[offset    ] = (U8)((val >>  8) & 0xff);
2180                     s[offset + 1] = (U8)( val        & 0xff);
2181                 }
2182                 else if (bits == 32) {
2183                     s[offset    ] = (U8)((val >> 24) & 0xff);
2184                     s[offset + 1] = (U8)((val >> 16) & 0xff);
2185                     s[offset + 2] = (U8)((val >>  8) & 0xff);
2186                     s[offset + 3] = (U8)( val        & 0xff);
2187                 }
2188
2189                 if (!none || val < none)
2190                     ++val;
2191             }
2192         }
2193         else { /* bits == 1, then val should be ignored */
2194             UV key;
2195             if (min < start)
2196                 min = start;
2197             for (key = min; key <= max; key++) {
2198                 const STRLEN offset = (STRLEN)(key - start);
2199                 if (key >= end)
2200                     goto go_out_list;
2201                 s[offset >> 3] |= 1 << (offset & 7);
2202             }
2203         }
2204     } /* while */
2205   go_out_list:
2206
2207     /* read $swash->{EXTRAS} */
2208     x = (U8*)SvPV(*extssvp, xcur);
2209     xend = x + xcur;
2210     while (x < xend) {
2211         STRLEN namelen;
2212         U8 *namestr;
2213         SV** othersvp;
2214         HV* otherhv;
2215         STRLEN otherbits;
2216         SV **otherbitssvp, *other;
2217         U8 *s, *o, *nl;
2218         STRLEN slen, olen;
2219
2220         const U8 opc = *x++;
2221         if (opc == '\n')
2222             continue;
2223
2224         nl = (U8*)memchr(x, '\n', xend - x);
2225
2226         if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
2227             if (nl) {
2228                 x = nl + 1; /* 1 is length of "\n" */
2229                 continue;
2230             }
2231             else {
2232                 x = xend; /* to EXTRAS' end at which \n is not found */
2233                 break;
2234             }
2235         }
2236
2237         namestr = x;
2238         if (nl) {
2239             namelen = nl - namestr;
2240             x = nl + 1;
2241         }
2242         else {
2243             namelen = xend - namestr;
2244             x = xend;
2245         }
2246
2247         othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
2248         otherhv = MUTABLE_HV(SvRV(*othersvp));
2249         otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
2250         otherbits = (STRLEN)SvUV(*otherbitssvp);
2251         if (bits < otherbits)
2252             Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
2253
2254         /* The "other" swatch must be destroyed after. */
2255         other = swash_get(*othersvp, start, span);
2256         o = (U8*)SvPV(other, olen);
2257
2258         if (!olen)
2259             Perl_croak(aTHX_ "panic: swash_get got improper swatch");
2260
2261         s = (U8*)SvPV(swatch, slen);
2262         if (bits == 1 && otherbits == 1) {
2263             if (slen != olen)
2264                 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
2265
2266             switch (opc) {
2267             case '+':
2268                 while (slen--)
2269                     *s++ |= *o++;
2270                 break;
2271             case '!':
2272                 while (slen--)
2273                     *s++ |= ~*o++;
2274                 break;
2275             case '-':
2276                 while (slen--)
2277                     *s++ &= ~*o++;
2278                 break;
2279             case '&':
2280                 while (slen--)
2281                     *s++ &= *o++;
2282                 break;
2283             default:
2284                 break;
2285             }
2286         }
2287         else {
2288             STRLEN otheroctets = otherbits >> 3;
2289             STRLEN offset = 0;
2290             U8* const send = s + slen;
2291
2292             while (s < send) {
2293                 UV otherval = 0;
2294
2295                 if (otherbits == 1) {
2296                     otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2297                     ++offset;
2298                 }
2299                 else {
2300                     STRLEN vlen = otheroctets;
2301                     otherval = *o++;
2302                     while (--vlen) {
2303                         otherval <<= 8;
2304                         otherval |= *o++;
2305                     }
2306                 }
2307
2308                 if (opc == '+' && otherval)
2309                     NOOP;   /* replace with otherval */
2310                 else if (opc == '!' && !otherval)
2311                     otherval = 1;
2312                 else if (opc == '-' && otherval)
2313                     otherval = 0;
2314                 else if (opc == '&' && !otherval)
2315                     otherval = 0;
2316                 else {
2317                     s += octets; /* no replacement */
2318                     continue;
2319                 }
2320
2321                 if (bits == 8)
2322                     *s++ = (U8)( otherval & 0xff);
2323                 else if (bits == 16) {
2324                     *s++ = (U8)((otherval >>  8) & 0xff);
2325                     *s++ = (U8)( otherval        & 0xff);
2326                 }
2327                 else if (bits == 32) {
2328                     *s++ = (U8)((otherval >> 24) & 0xff);
2329                     *s++ = (U8)((otherval >> 16) & 0xff);
2330                     *s++ = (U8)((otherval >>  8) & 0xff);
2331                     *s++ = (U8)( otherval        & 0xff);
2332                 }
2333             }
2334         }
2335         sv_free(other); /* through with it! */
2336     } /* while */
2337     return swatch;
2338 }
2339
2340 /*
2341 =for apidoc uvchr_to_utf8
2342
2343 Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2344 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2345 bytes available. The return value is the pointer to the byte after the
2346 end of the new character. In other words,
2347
2348     d = uvchr_to_utf8(d, uv);
2349
2350 is the recommended wide native character-aware way of saying
2351
2352     *(d++) = uv;
2353
2354 =cut
2355 */
2356
2357 /* On ASCII machines this is normally a macro but we want a
2358    real function in case XS code wants it
2359 */
2360 U8 *
2361 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2362 {
2363     PERL_ARGS_ASSERT_UVCHR_TO_UTF8;
2364
2365     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2366 }
2367
2368 U8 *
2369 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2370 {
2371     PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS;
2372
2373     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2374 }
2375
2376 /*
2377 =for apidoc utf8n_to_uvchr
2378 flags
2379
2380 Returns the native character value of the first character in the string
2381 C<s>
2382 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2383 length, in bytes, of that character.
2384
2385 Allows length and flags to be passed to low level routine.
2386
2387 =cut
2388 */
2389 /* On ASCII machines this is normally a macro but we want
2390    a real function in case XS code wants it
2391 */
2392 UV
2393 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2394 U32 flags)
2395 {
2396     const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2397
2398     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
2399
2400     return UNI_TO_NATIVE(uv);
2401 }
2402
2403 /*
2404 =for apidoc pv_uni_display
2405
2406 Build to the scalar dsv a displayable version of the string spv,
2407 length len, the displayable version being at most pvlim bytes long
2408 (if longer, the rest is truncated and "..." will be appended).
2409
2410 The flags argument can have UNI_DISPLAY_ISPRINT set to display
2411 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
2412 to display the \\[nrfta\\] as the backslashed versions (like '\n')
2413 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2414 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2415 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2416
2417 The pointer to the PV of the dsv is returned.
2418
2419 =cut */
2420 char *
2421 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
2422 {
2423     int truncated = 0;
2424     const char *s, *e;
2425
2426     PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
2427
2428     sv_setpvs(dsv, "");
2429     SvUTF8_off(dsv);
2430     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
2431          UV u;
2432           /* This serves double duty as a flag and a character to print after
2433              a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2434           */
2435          char ok = 0;
2436
2437          if (pvlim && SvCUR(dsv) >= pvlim) {
2438               truncated++;
2439               break;
2440          }
2441          u = utf8_to_uvchr((U8*)s, 0);
2442          if (u < 256) {
2443              const unsigned char c = (unsigned char)u & 0xFF;
2444              if (flags & UNI_DISPLAY_BACKSLASH) {
2445                  switch (c) {
2446                  case '\n':
2447                      ok = 'n'; break;
2448                  case '\r':
2449                      ok = 'r'; break;
2450                  case '\t':
2451                      ok = 't'; break;
2452                  case '\f':
2453                      ok = 'f'; break;
2454                  case '\a':
2455                      ok = 'a'; break;
2456                  case '\\':
2457                      ok = '\\'; break;
2458                  default: break;
2459                  }
2460                  if (ok) {
2461                      const char string = ok;
2462                      sv_catpvs(dsv, "\\");
2463                      sv_catpvn(dsv, &string, 1);
2464                  }
2465              }
2466              /* isPRINT() is the locale-blind version. */
2467              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2468                  const char string = c;
2469                  sv_catpvn(dsv, &string, 1);
2470                  ok = 1;
2471              }
2472          }
2473          if (!ok)
2474              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
2475     }
2476     if (truncated)
2477          sv_catpvs(dsv, "...");
2478
2479     return SvPVX(dsv);
2480 }
2481
2482 /*
2483 =for apidoc sv_uni_display
2484
2485 Build to the scalar dsv a displayable version of the scalar sv,
2486 the displayable version being at most pvlim bytes long
2487 (if longer, the rest is truncated and "..." will be appended).
2488
2489 The flags argument is as in pv_uni_display().
2490
2491 The pointer to the PV of the dsv is returned.
2492
2493 =cut
2494 */
2495 char *
2496 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2497 {
2498     PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
2499
2500      return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2501                                 SvCUR(ssv), pvlim, flags);
2502 }
2503
2504 /*
2505 =for apidoc foldEQ_utf8
2506
2507 Returns true if the leading portions of the strings s1 and s2 (either or both
2508 of which may be in UTF-8) are the same case-insensitively; false otherwise.
2509 How far into the strings to compare is determined by other input parameters.
2510
2511 If u1 is true, the string s1 is assumed to be in UTF-8-encoded Unicode;
2512 otherwise it is assumed to be in native 8-bit encoding.  Correspondingly for u2
2513 with respect to s2.
2514
2515 If the byte length l1 is non-zero, it says how far into s1 to check for fold
2516 equality.  In other words, s1+l1 will be used as a goal to reach.  The
2517 scan will not be considered to be a match unless the goal is reached, and
2518 scanning won't continue past that goal.  Correspondingly for l2 with respect to
2519 s2.
2520
2521 If pe1 is non-NULL and the pointer it points to is not NULL, that pointer is
2522 considered an end pointer beyond which scanning of s1 will not continue under
2523 any circumstances.  This means that if both l1 and pe1 are specified, and pe1
2524 is less than s1+l1, the match will never be successful because it can never
2525 get as far as its goal (and in fact is asserted against).  Correspondingly for
2526 pe2 with respect to s2.
2527
2528 At least one of s1 and s2 must have a goal (at least one of l1 and l2 must be
2529 non-zero), and if both do, both have to be
2530 reached for a successful match.   Also, if the fold of a character is multiple
2531 characters, all of them must be matched (see tr21 reference below for
2532 'folding').
2533
2534 Upon a successful match, if pe1 is non-NULL,
2535 it will be set to point to the beginning of the I<next> character of s1 beyond
2536 what was matched.  Correspondingly for pe2 and s2.
2537
2538 For case-insensitiveness, the "casefolding" of Unicode is used
2539 instead of upper/lowercasing both the characters, see
2540 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2541
2542 =cut */
2543 I32
2544 Perl_foldEQ_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
2545 {
2546     dVAR;
2547     register const U8 *p1  = (const U8*)s1; /* Point to current char */
2548     register const U8 *p2  = (const U8*)s2;
2549     register const U8 *g1 = NULL;       /* goal for s1 */
2550     register const U8 *g2 = NULL;
2551     register const U8 *e1 = NULL;       /* Don't scan s1 past this */
2552     register U8 *f1 = NULL;             /* Point to current folded */
2553     register const U8 *e2 = NULL;
2554     register U8 *f2 = NULL;
2555     STRLEN n1 = 0, n2 = 0;              /* Number of bytes in current char */
2556     U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2557     U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
2558     U8 natbuf[2];               /* Holds native 8-bit char converted to utf8;
2559                                    these always fit in 2 bytes */
2560
2561     PERL_ARGS_ASSERT_FOLDEQ_UTF8;
2562
2563     if (pe1) {
2564         e1 = *(U8**)pe1;
2565     }
2566
2567     if (l1) {
2568         g1 = (const U8*)s1 + l1;
2569     }
2570
2571     if (pe2) {
2572         e2 = *(U8**)pe2;
2573     }
2574
2575     if (l2) {
2576         g2 = (const U8*)s2 + l2;
2577     }
2578
2579     /* Must have at least one goal */
2580     assert(g1 || g2);
2581
2582     if (g1) {
2583
2584         /* Will never match if goal is out-of-bounds */
2585         assert(! e1  || e1 >= g1);
2586
2587         /* Here, there isn't an end pointer, or it is beyond the goal.  We
2588         * only go as far as the goal */
2589         e1 = g1;
2590     }
2591     else {
2592         assert(e1);    /* Must have an end for looking at s1 */
2593     }
2594
2595     /* Same for goal for s2 */
2596     if (g2) {
2597         assert(! e2  || e2 >= g2);
2598         e2 = g2;
2599     }
2600     else {
2601         assert(e2);
2602     }
2603
2604     /* Look through both strings, a character at a time */
2605     while (p1 < e1 && p2 < e2) {
2606
2607         /* If at the beginning of a new character in s1, get its fold to use
2608          * and the length of the fold */
2609         if (n1 == 0) {
2610             if (u1) {
2611                 to_utf8_fold(p1, foldbuf1, &n1);
2612             }
2613             else {  /* Not utf8, convert to it first and then get fold */
2614                 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
2615                 to_utf8_fold(natbuf, foldbuf1, &n1);
2616             }
2617             f1 = foldbuf1;
2618         }
2619
2620         if (n2 == 0) {    /* Same for s2 */
2621             if (u2) {
2622                 to_utf8_fold(p2, foldbuf2, &n2);
2623             }
2624             else {
2625                 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
2626                 to_utf8_fold(natbuf, foldbuf2, &n2);
2627             }
2628             f2 = foldbuf2;
2629         }
2630
2631         /* While there is more to look for in both folds, see if they
2632         * continue to match */
2633         while (n1 && n2) {
2634             U8 fold_length = UTF8SKIP(f1);
2635             if (fold_length != UTF8SKIP(f2)
2636                 || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE
2637                                                        function call for single
2638                                                        character */
2639                 || memNE((char*)f1, (char*)f2, fold_length))
2640             {
2641                 return 0; /* mismatch */
2642             }
2643
2644             /* Here, they matched, advance past them */
2645             n1 -= fold_length;
2646             f1 += fold_length;
2647             n2 -= fold_length;
2648             f2 += fold_length;
2649         }
2650
2651         /* When reach the end of any fold, advance the input past it */
2652         if (n1 == 0) {
2653             p1 += u1 ? UTF8SKIP(p1) : 1;
2654         }
2655         if (n2 == 0) {
2656             p2 += u2 ? UTF8SKIP(p2) : 1;
2657         }
2658     } /* End of loop through both strings */
2659
2660     /* A match is defined by each scan that specified an explicit length
2661     * reaching its final goal, and the other not having matched a partial
2662     * character (which can happen when the fold of a character is more than one
2663     * character). */
2664     if (! ((g1 == 0 || p1 == g1) && (g2 == 0 || p2 == g2)) || n1 || n2) {
2665         return 0;
2666     }
2667
2668     /* Successful match.  Set output pointers */
2669     if (pe1) {
2670         *pe1 = (char*)p1;
2671     }
2672     if (pe2) {
2673         *pe2 = (char*)p2;
2674     }
2675     return 1;
2676 }
2677
2678 /*
2679  * Local variables:
2680  * c-indentation-style: bsd
2681  * c-basic-offset: 4
2682  * indent-tabs-mode: t
2683  * End:
2684  *
2685  * ex: set ts=8 sts=4 sw=4 noet:
2686  */