utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (c) 1998-2002, Larry Wall
   4  *
   5  *    You may distribute under the terms of either the GNU General Public
   6  *    License or the Artistic License, as specified in the README file.
   7  *
   8  */
   9
  10 /*
  11  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  12  * heard of that we don't want to see any closer; and that's the one place
  13  * we're trying to get to!  And that's just where we can't get, nohow.'
  14  *
  15  * 'Well do I understand your speech,' he answered in the same language;
  16  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  17  * as is the custom in the West, if you wish to be answered?'
  18  *
  19  * ...the travellers perceived that the floor was paved with stones of many
  20  * hues; branching runes and strange devices intertwined beneath their feet.
  21  */
  22
  23 #include "EXTERN.h"
  24 #define PERL_IN_UTF8_C
  25 #include "perl.h"
  26
  27 static char unees[] = "Malformed UTF-8 character (unexpected end of string)";
  28
  29 /*
  30 =head1 Unicode Support
  31
  32 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
  33
  34 Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
  35 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
  36 bytes available. The return value is the pointer to the byte after the
  37 end of the new character. In other words,
  38
  39     d = uvuni_to_utf8_flags(d, uv, flags);
  40
  41 or, in most cases,
  42
  43     d = uvuni_to_utf8(d, uv);
  44
  45 (which is equivalent to)
  46
  47     d = uvuni_to_utf8_flags(d, uv, 0);
  48
  49 is the recommended Unicode-aware way of saying
  50
  51     *(d++) = uv;
  52
  53 =cut
  54 */
  55
  56 U8 *
  57 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  58 {
  59     if (ckWARN(WARN_UTF8)) {
  60          if (UNICODE_IS_SURROGATE(uv) &&
  61              !(flags & UNICODE_ALLOW_SURROGATE))
  62               Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
  63          else if (
  64                   ((uv >= 0xFDD0 && uv <= 0xFDEF &&
  65                     !(flags & UNICODE_ALLOW_FDD0))
  66                    ||
  67                    ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
  68                     !(flags & UNICODE_ALLOW_FFFF))) &&
  69                   /* UNICODE_ALLOW_SUPER includes
  70                    * FFFEs and FFFFs beyond 0x10FFFF. */
  71                   ((uv <= PERL_UNICODE_MAX) ||
  72                    !(flags & UNICODE_ALLOW_SUPER))
  73                   )
  74               Perl_warner(aTHX_ packWARN(WARN_UTF8),
  75                          "Unicode character 0x%04"UVxf" is illegal", uv);
  76     }
  77     if (UNI_IS_INVARIANT(uv)) {
  78         *d++ = (U8)UTF_TO_NATIVE(uv);
  79         return d;
  80     }
  81 #if defined(EBCDIC)
  82     else {
  83         STRLEN len  = UNISKIP(uv);
  84         U8 *p = d+len-1;
  85         while (p > d) {
  86             *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
  87             uv >>= UTF_ACCUMULATION_SHIFT;
  88         }
  89         *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
  90         return d+len;
  91     }
  92 #else /* Non loop style */
  93     if (uv < 0x800) {
  94         *d++ = (U8)(( uv >>  6)         | 0xc0);
  95         *d++ = (U8)(( uv        & 0x3f) | 0x80);
  96         return d;
  97     }
  98     if (uv < 0x10000) {
  99         *d++ = (U8)(( uv >> 12)         | 0xe0);
 100         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 101         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 102         return d;
 103     }
 104     if (uv < 0x200000) {
 105         *d++ = (U8)(( uv >> 18)         | 0xf0);
 106         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 107         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 108         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 109         return d;
 110     }
 111     if (uv < 0x4000000) {
 112         *d++ = (U8)(( uv >> 24)         | 0xf8);
 113         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 114         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 115         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 116         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 117         return d;
 118     }
 119     if (uv < 0x80000000) {
 120         *d++ = (U8)(( uv >> 30)         | 0xfc);
 121         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 122         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 123         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 124         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 125         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 126         return d;
 127     }
 128 #ifdef HAS_QUAD
 129     if (uv < UTF8_QUAD_MAX)
 130 #endif
 131     {
 132         *d++ =                            0xfe; /* Can't match U+FEFF! */
 133         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 134         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 135         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 136         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 137         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 138         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 139         return d;
 140     }
 141 #ifdef HAS_QUAD
 142     {
 143         *d++ =                            0xff;         /* Can't match U+FFFE! */
 144         *d++ =                            0x80;         /* 6 Reserved bits */
 145         *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);        /* 2 Reserved bits */
 146         *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
 147         *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
 148         *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
 149         *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
 150         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 151         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 152         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 153         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 154         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 155         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 156         return d;
 157     }
 158 #endif
 159 #endif /* Loop style */
 160 }
 161
 162 U8 *
 163 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
 164 {
 165     return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
 166 }
 167
 168
 169 /*
 170 =for apidoc A|STRLEN|is_utf8_char|U8 *s
 171
 172 Tests if some arbitrary number of bytes begins in a valid UTF-8
 173 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 174 UTF-8 character.  The actual number of bytes in the UTF-8 character
 175 will be returned if it is valid, otherwise 0.
 176
 177 =cut */
 178 STRLEN
 179 Perl_is_utf8_char(pTHX_ U8 *s)
 180 {
 181     U8 u = *s;
 182     STRLEN slen, len;
 183     UV uv, ouv;
 184
 185     if (UTF8_IS_INVARIANT(u))
 186         return 1;
 187
 188     if (!UTF8_IS_START(u))
 189         return 0;
 190
 191     len = UTF8SKIP(s);
 192
 193     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 194         return 0;
 195
 196     slen = len - 1;
 197     s++;
 198     u &= UTF_START_MASK(len);
 199     uv  = u;
 200     ouv = uv;
 201     while (slen--) {
 202         if (!UTF8_IS_CONTINUATION(*s))
 203             return 0;
 204         uv = UTF8_ACCUMULATE(uv, *s);
 205         if (uv < ouv)
 206             return 0;
 207         ouv = uv;
 208         s++;
 209     }
 210
 211     if ((STRLEN)UNISKIP(uv) < len)
 212         return 0;
 213
 214     return len;
 215 }
 216
 217 /*
 218 =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
 219
 220 Returns true if first C<len> bytes of the given string form a valid UTF8
 221 string, false otherwise.  Note that 'a valid UTF8 string' does not mean
 222 'a string that contains UTF8' because a valid ASCII string is a valid
 223 UTF8 string.
 224
 225 =cut
 226 */
 227
 228 bool
 229 Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
 230 {
 231     U8* x = s;
 232     U8* send;
 233     STRLEN c;
 234
 235     if (!len)
 236         len = strlen((char *)s);
 237     send = s + len;
 238
 239     while (x < send) {
 240         c = is_utf8_char(x);
 241         if (!c)
 242             return FALSE;
 243         x += c;
 244     }
 245     if (x != send)
 246         return FALSE;
 247
 248     return TRUE;
 249 }
 250
 251 /*
 252 =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 253
 254 Bottom level UTF-8 decode routine.
 255 Returns the unicode code point value of the first character in the string C<s>
 256 which is assumed to be in UTF8 encoding and no longer than C<curlen>;
 257 C<retlen> will be set to the length, in bytes, of that character.
 258
 259 If C<s> does not point to a well-formed UTF8 character, the behaviour
 260 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 261 it is assumed that the caller will raise a warning, and this function
 262 will silently just set C<retlen> to C<-1> and return zero.  If the
 263 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 264 malformations will be given, C<retlen> will be set to the expected
 265 length of the UTF-8 character in bytes, and zero will be returned.
 266
 267 The C<flags> can also contain various flags to allow deviations from
 268 the strict UTF-8 encoding (see F<utf8.h>).
 269
 270 Most code should use utf8_to_uvchr() rather than call this directly.
 271
 272 =cut
 273 */
 274
 275 UV
 276 Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 277 {
 278     U8 *s0 = s;
 279     UV uv = *s, ouv = 0;
 280     STRLEN len = 1;
 281     bool dowarn = ckWARN_d(WARN_UTF8);
 282     UV startbyte = *s;
 283     STRLEN expectlen = 0;
 284     U32 warning = 0;
 285
 286 /* This list is a superset of the UTF8_ALLOW_XXX. */
 287
 288 #define UTF8_WARN_EMPTY                          1
 289 #define UTF8_WARN_CONTINUATION                   2
 290 #define UTF8_WARN_NON_CONTINUATION               3
 291 #define UTF8_WARN_FE_FF                          4
 292 #define UTF8_WARN_SHORT                          5
 293 #define UTF8_WARN_OVERFLOW                       6
 294 #define UTF8_WARN_SURROGATE                      7
 295 #define UTF8_WARN_LONG                           8
 296 #define UTF8_WARN_FFFF                           9 /* Also FFFE. */
 297
 298     if (curlen == 0 &&
 299         !(flags & UTF8_ALLOW_EMPTY)) {
 300         warning = UTF8_WARN_EMPTY;
 301         goto malformed;
 302     }
 303
 304     if (UTF8_IS_INVARIANT(uv)) {
 305         if (retlen)
 306             *retlen = 1;
 307         return (UV) (NATIVE_TO_UTF(*s));
 308     }
 309
 310     if (UTF8_IS_CONTINUATION(uv) &&
 311         !(flags & UTF8_ALLOW_CONTINUATION)) {
 312         warning = UTF8_WARN_CONTINUATION;
 313         goto malformed;
 314     }
 315
 316     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 317         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 318         warning = UTF8_WARN_NON_CONTINUATION;
 319         goto malformed;
 320     }
 321
 322 #ifdef EBCDIC
 323     uv = NATIVE_TO_UTF(uv);
 324 #else
 325     if ((uv == 0xfe || uv == 0xff) &&
 326         !(flags & UTF8_ALLOW_FE_FF)) {
 327         warning = UTF8_WARN_FE_FF;
 328         goto malformed;
 329     }
 330 #endif
 331
 332     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 333     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 334     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 335     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 336 #ifdef EBCDIC
 337     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 338     else                        { len =  7; uv &= 0x01; }
 339 #else
 340     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 341     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 342     else                        { len = 13; uv = 0; } /* whoa! */
 343 #endif
 344
 345     if (retlen)
 346         *retlen = len;
 347
 348     expectlen = len;
 349
 350     if ((curlen < expectlen) &&
 351         !(flags & UTF8_ALLOW_SHORT)) {
 352         warning = UTF8_WARN_SHORT;
 353         goto malformed;
 354     }
 355
 356     len--;
 357     s++;
 358     ouv = uv;
 359
 360     while (len--) {
 361         if (!UTF8_IS_CONTINUATION(*s) &&
 362             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 363             s--;
 364             warning = UTF8_WARN_NON_CONTINUATION;
 365             goto malformed;
 366         }
 367         else
 368             uv = UTF8_ACCUMULATE(uv, *s);
 369         if (!(uv > ouv)) {
 370             /* These cannot be allowed. */
 371             if (uv == ouv) {
 372                 if (!(flags & UTF8_ALLOW_LONG)) {
 373                     warning = UTF8_WARN_LONG;
 374                     goto malformed;
 375                 }
 376             }
 377             else { /* uv < ouv */
 378                 /* This cannot be allowed. */
 379                 warning = UTF8_WARN_OVERFLOW;
 380                 goto malformed;
 381             }
 382         }
 383         s++;
 384         ouv = uv;
 385     }
 386
 387     if (UNICODE_IS_SURROGATE(uv) &&
 388         !(flags & UTF8_ALLOW_SURROGATE)) {
 389         warning = UTF8_WARN_SURROGATE;
 390         goto malformed;
 391     } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
 392                !(flags & UTF8_ALLOW_LONG)) {
 393         warning = UTF8_WARN_LONG;
 394         goto malformed;
 395     } else if (UNICODE_IS_ILLEGAL(uv) &&
 396                !(flags & UTF8_ALLOW_FFFF)) {
 397         warning = UTF8_WARN_FFFF;
 398         goto malformed;
 399     }
 400
 401     return uv;
 402
 403 malformed:
 404
 405     if (flags & UTF8_CHECK_ONLY) {
 406         if (retlen)
 407             *retlen = -1;
 408         return 0;
 409     }
 410
 411     if (dowarn) {
 412         SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
 413
 414         switch (warning) {
 415         case 0: /* Intentionally empty. */ break;
 416         case UTF8_WARN_EMPTY:
 417             Perl_sv_catpvf(aTHX_ sv, "(empty string)");
 418             break;
 419         case UTF8_WARN_CONTINUATION:
 420             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 421             break;
 422         case UTF8_WARN_NON_CONTINUATION:
 423             if (s == s0)
 424                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 425                            (UV)s[1], startbyte);
 426             else
 427                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 428                            (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
 429
 430             break;
 431         case UTF8_WARN_FE_FF:
 432             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 433             break;
 434         case UTF8_WARN_SHORT:
 435             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 436                            curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
 437             expectlen = curlen;         /* distance for caller to skip */
 438             break;
 439         case UTF8_WARN_OVERFLOW:
 440             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 441                            ouv, *s, startbyte);
 442             break;
 443         case UTF8_WARN_SURROGATE:
 444             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 445             break;
 446         case UTF8_WARN_LONG:
 447             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 448                            expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 449             break;
 450         case UTF8_WARN_FFFF:
 451             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 452             break;
 453         default:
 454             Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
 455             break;
 456         }
 457
 458         if (warning) {
 459             char *s = SvPVX(sv);
 460
 461             if (PL_op)
 462                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 463                             "%s in %s", s,  OP_DESC(PL_op));
 464             else
 465                 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
 466         }
 467     }
 468
 469     if (retlen)
 470         *retlen = expectlen ? expectlen : len;
 471
 472     return 0;
 473 }
 474
 475 /*
 476 =for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
 477
 478 Returns the native character value of the first character in the string C<s>
 479 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 480 length, in bytes, of that character.
 481
 482 If C<s> does not point to a well-formed UTF8 character, zero is
 483 returned and retlen is set, if possible, to -1.
 484
 485 =cut
 486 */
 487
 488 UV
 489 Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
 490 {
 491     return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen,
 492                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 493 }
 494
 495 /*
 496 =for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
 497
 498 Returns the Unicode code point of the first character in the string C<s>
 499 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 500 length, in bytes, of that character.
 501
 502 This function should only be used when returned UV is considered
 503 an index into the Unicode semantic tables (e.g. swashes).
 504
 505 If C<s> does not point to a well-formed UTF8 character, zero is
 506 returned and retlen is set, if possible, to -1.
 507
 508 =cut
 509 */
 510
 511 UV
 512 Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
 513 {
 514     /* Call the low level routine asking for checks */
 515     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen,
 516                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 517 }
 518
 519 /*
 520 =for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
 521
 522 Return the length of the UTF-8 char encoded string C<s> in characters.
 523 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 524 up past C<e>, croaks.
 525
 526 =cut
 527 */
 528
 529 STRLEN
 530 Perl_utf8_length(pTHX_ U8 *s, U8 *e)
 531 {
 532     STRLEN len = 0;
 533
 534     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 535      * the bitops (especially ~) can create illegal UTF-8.
 536      * In other words: in Perl UTF-8 is not just for Unicode. */
 537
 538     if (e < s) {
 539         if (ckWARN_d(WARN_UTF8)) {
 540             if (PL_op)
 541                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 542                             "%s in %s", unees, OP_DESC(PL_op));
 543             else
 544                 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 545         }
 546         return 0;
 547     }
 548     while (s < e) {
 549         U8 t = UTF8SKIP(s);
 550
 551         if (e - s < t) {
 552             if (ckWARN_d(WARN_UTF8)) {
 553                 if (PL_op)
 554                     Perl_warner(aTHX_ packWARN(WARN_UTF8),
 555                                 unees, OP_DESC(PL_op));
 556                 else
 557                     Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 558             }
 559             return len;
 560         }
 561         s += t;
 562         len++;
 563     }
 564
 565     return len;
 566 }
 567
 568 /*
 569 =for apidoc A|IV|utf8_distance|U8 *a|U8 *b
 570
 571 Returns the number of UTF8 characters between the UTF-8 pointers C<a>
 572 and C<b>.
 573
 574 WARNING: use only if you *know* that the pointers point inside the
 575 same UTF-8 buffer.
 576
 577 =cut
 578 */
 579
 580 IV
 581 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
 582 {
 583     IV off = 0;
 584
 585     /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
 586      * the bitops (especially ~) can create illegal UTF-8.
 587      * In other words: in Perl UTF-8 is not just for Unicode. */
 588
 589     if (a < b) {
 590         while (a < b) {
 591             U8 c = UTF8SKIP(a);
 592
 593             if (b - a < c) {
 594                 if (ckWARN_d(WARN_UTF8)) {
 595                     if (PL_op)
 596                         Perl_warner(aTHX_ packWARN(WARN_UTF8),
 597                                     "%s in %s", unees, OP_DESC(PL_op));
 598                     else
 599                         Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 600                 }
 601                 return off;
 602             }
 603             a += c;
 604             off--;
 605         }
 606     }
 607     else {
 608         while (b < a) {
 609             U8 c = UTF8SKIP(b);
 610
 611             if (a - b < c) {
 612                 if (ckWARN_d(WARN_UTF8)) {
 613                     if (PL_op)
 614                         Perl_warner(aTHX_ packWARN(WARN_UTF8),
 615                                     "%s in %s", unees, OP_DESC(PL_op));
 616                     else
 617                         Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 618                 }
 619                 return off;
 620             }
 621             b += c;
 622             off++;
 623         }
 624     }
 625
 626     return off;
 627 }
 628
 629 /*
 630 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
 631
 632 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 633 forward or backward.
 634
 635 WARNING: do not use the following unless you *know* C<off> is within
 636 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 637 on the first byte of character or just after the last byte of a character.
 638
 639 =cut
 640 */
 641
 642 U8 *
 643 Perl_utf8_hop(pTHX_ U8 *s, I32 off)
 644 {
 645     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 646      * the bitops (especially ~) can create illegal UTF-8.
 647      * In other words: in Perl UTF-8 is not just for Unicode. */
 648
 649     if (off >= 0) {
 650         while (off--)
 651             s += UTF8SKIP(s);
 652     }
 653     else {
 654         while (off++) {
 655             s--;
 656             while (UTF8_IS_CONTINUATION(*s))
 657                 s--;
 658         }
 659     }
 660     return s;
 661 }
 662
 663 /*
 664 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 665
 666 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 667 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 668 updates len to contain the new length.
 669 Returns zero on failure, setting C<len> to -1.
 670
 671 =cut
 672 */
 673
 674 U8 *
 675 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 676 {
 677     U8 *send;
 678     U8 *d;
 679     U8 *save = s;
 680
 681     /* ensure valid UTF8 and chars < 256 before updating string */
 682     for (send = s + *len; s < send; ) {
 683         U8 c = *s++;
 684
 685         if (!UTF8_IS_INVARIANT(c) &&
 686             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 687              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 688             *len = -1;
 689             return 0;
 690         }
 691     }
 692
 693     d = s = save;
 694     while (s < send) {
 695         STRLEN ulen;
 696         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 697         s += ulen;
 698     }
 699     *d = '\0';
 700     *len = d - save;
 701     return save;
 702 }
 703
 704 /*
 705 =for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
 706
 707 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 708 Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 709 the newly-created string, and updates C<len> to contain the new
 710 length.  Returns the original string if no conversion occurs, C<len>
 711 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 712 0 if C<s> is converted or contains all 7bit characters.
 713
 714 =cut
 715 */
 716
 717 U8 *
 718 Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
 719 {
 720     U8 *d;
 721     U8 *start = s;
 722     U8 *send;
 723     I32 count = 0;
 724
 725     if (!*is_utf8)
 726         return start;
 727
 728     /* ensure valid UTF8 and chars < 256 before converting string */
 729     for (send = s + *len; s < send;) {
 730         U8 c = *s++;
 731         if (!UTF8_IS_INVARIANT(c)) {
 732             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 733                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 734                 count++;
 735             else
 736                 return start;
 737         }
 738     }
 739
 740     *is_utf8 = 0;
 741
 742     Newz(801, d, (*len) - count + 1, U8);
 743     s = start; start = d;
 744     while (s < send) {
 745         U8 c = *s++;
 746         if (!UTF8_IS_INVARIANT(c)) {
 747             /* Then it is two-byte encoded */
 748             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 749             c = ASCII_TO_NATIVE(c);
 750         }
 751         *d++ = c;
 752     }
 753     *d = '\0';
 754     *len = d - start;
 755     return start;
 756 }
 757
 758 /*
 759 =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
 760
 761 Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
 762 Returns a pointer to the newly-created string, and sets C<len> to
 763 reflect the new length.
 764
 765 =cut
 766 */
 767
 768 U8*
 769 Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
 770 {
 771     U8 *send;
 772     U8 *d;
 773     U8 *dst;
 774     send = s + (*len);
 775
 776     Newz(801, d, (*len) * 2 + 1, U8);
 777     dst = d;
 778
 779     while (s < send) {
 780         UV uv = NATIVE_TO_ASCII(*s++);
 781         if (UNI_IS_INVARIANT(uv))
 782             *d++ = (U8)UTF_TO_NATIVE(uv);
 783         else {
 784             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
 785             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
 786         }
 787     }
 788     *d = '\0';
 789     *len = d-dst;
 790     return dst;
 791 }
 792
 793 /*
 794  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 795  *
 796  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 797  * We optimize for native, for obvious reasons. */
 798
 799 U8*
 800 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 801 {
 802     U8* pend;
 803     U8* dstart = d;
 804
 805     if (bytelen & 1)
 806         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
 807
 808     pend = p + bytelen;
 809
 810     while (p < pend) {
 811         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 812         p += 2;
 813         if (uv < 0x80) {
 814             *d++ = (U8)uv;
 815             continue;
 816         }
 817         if (uv < 0x800) {
 818             *d++ = (U8)(( uv >>  6)         | 0xc0);
 819             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 820             continue;
 821         }
 822         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 823             UV low = *p++;
 824             if (low < 0xdc00 || low >= 0xdfff)
 825                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 826             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 827         }
 828         if (uv < 0x10000) {
 829             *d++ = (U8)(( uv >> 12)         | 0xe0);
 830             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 831             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 832             continue;
 833         }
 834         else {
 835             *d++ = (U8)(( uv >> 18)         | 0xf0);
 836             *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 837             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 838             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 839             continue;
 840         }
 841     }
 842     *newlen = d - dstart;
 843     return d;
 844 }
 845
 846 /* Note: this one is slightly destructive of the source. */
 847
 848 U8*
 849 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 850 {
 851     U8* s = (U8*)p;
 852     U8* send = s + bytelen;
 853     while (s < send) {
 854         U8 tmp = s[0];
 855         s[0] = s[1];
 856         s[1] = tmp;
 857         s += 2;
 858     }
 859     return utf16_to_utf8(p, d, bytelen, newlen);
 860 }
 861
 862 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 863
 864 bool
 865 Perl_is_uni_alnum(pTHX_ UV c)
 866 {
 867     U8 tmpbuf[UTF8_MAXLEN+1];
 868     uvchr_to_utf8(tmpbuf, c);
 869     return is_utf8_alnum(tmpbuf);
 870 }
 871
 872 bool
 873 Perl_is_uni_alnumc(pTHX_ UV c)
 874 {
 875     U8 tmpbuf[UTF8_MAXLEN+1];
 876     uvchr_to_utf8(tmpbuf, c);
 877     return is_utf8_alnumc(tmpbuf);
 878 }
 879
 880 bool
 881 Perl_is_uni_idfirst(pTHX_ UV c)
 882 {
 883     U8 tmpbuf[UTF8_MAXLEN+1];
 884     uvchr_to_utf8(tmpbuf, c);
 885     return is_utf8_idfirst(tmpbuf);
 886 }
 887
 888 bool
 889 Perl_is_uni_alpha(pTHX_ UV c)
 890 {
 891     U8 tmpbuf[UTF8_MAXLEN+1];
 892     uvchr_to_utf8(tmpbuf, c);
 893     return is_utf8_alpha(tmpbuf);
 894 }
 895
 896 bool
 897 Perl_is_uni_ascii(pTHX_ UV c)
 898 {
 899     U8 tmpbuf[UTF8_MAXLEN+1];
 900     uvchr_to_utf8(tmpbuf, c);
 901     return is_utf8_ascii(tmpbuf);
 902 }
 903
 904 bool
 905 Perl_is_uni_space(pTHX_ UV c)
 906 {
 907     U8 tmpbuf[UTF8_MAXLEN+1];
 908     uvchr_to_utf8(tmpbuf, c);
 909     return is_utf8_space(tmpbuf);
 910 }
 911
 912 bool
 913 Perl_is_uni_digit(pTHX_ UV c)
 914 {
 915     U8 tmpbuf[UTF8_MAXLEN+1];
 916     uvchr_to_utf8(tmpbuf, c);
 917     return is_utf8_digit(tmpbuf);
 918 }
 919
 920 bool
 921 Perl_is_uni_upper(pTHX_ UV c)
 922 {
 923     U8 tmpbuf[UTF8_MAXLEN+1];
 924     uvchr_to_utf8(tmpbuf, c);
 925     return is_utf8_upper(tmpbuf);
 926 }
 927
 928 bool
 929 Perl_is_uni_lower(pTHX_ UV c)
 930 {
 931     U8 tmpbuf[UTF8_MAXLEN+1];
 932     uvchr_to_utf8(tmpbuf, c);
 933     return is_utf8_lower(tmpbuf);
 934 }
 935
 936 bool
 937 Perl_is_uni_cntrl(pTHX_ UV c)
 938 {
 939     U8 tmpbuf[UTF8_MAXLEN+1];
 940     uvchr_to_utf8(tmpbuf, c);
 941     return is_utf8_cntrl(tmpbuf);
 942 }
 943
 944 bool
 945 Perl_is_uni_graph(pTHX_ UV c)
 946 {
 947     U8 tmpbuf[UTF8_MAXLEN+1];
 948     uvchr_to_utf8(tmpbuf, c);
 949     return is_utf8_graph(tmpbuf);
 950 }
 951
 952 bool
 953 Perl_is_uni_print(pTHX_ UV c)
 954 {
 955     U8 tmpbuf[UTF8_MAXLEN+1];
 956     uvchr_to_utf8(tmpbuf, c);
 957     return is_utf8_print(tmpbuf);
 958 }
 959
 960 bool
 961 Perl_is_uni_punct(pTHX_ UV c)
 962 {
 963     U8 tmpbuf[UTF8_MAXLEN+1];
 964     uvchr_to_utf8(tmpbuf, c);
 965     return is_utf8_punct(tmpbuf);
 966 }
 967
 968 bool
 969 Perl_is_uni_xdigit(pTHX_ UV c)
 970 {
 971     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
 972     uvchr_to_utf8(tmpbuf, c);
 973     return is_utf8_xdigit(tmpbuf);
 974 }
 975
 976 UV
 977 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
 978 {
 979     uvchr_to_utf8(p, c);
 980     return to_utf8_upper(p, p, lenp);
 981 }
 982
 983 UV
 984 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
 985 {
 986     uvchr_to_utf8(p, c);
 987     return to_utf8_title(p, p, lenp);
 988 }
 989
 990 UV
 991 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
 992 {
 993     uvchr_to_utf8(p, c);
 994     return to_utf8_lower(p, p, lenp);
 995 }
 996
 997 UV
 998 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
 999 {
1000     uvchr_to_utf8(p, c);
1001     return to_utf8_fold(p, p, lenp);
1002 }
1003
1004 /* for now these all assume no locale info available for Unicode > 255 */
1005
1006 bool
1007 Perl_is_uni_alnum_lc(pTHX_ UV c)
1008 {
1009     return is_uni_alnum(c);     /* XXX no locale support yet */
1010 }
1011
1012 bool
1013 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1014 {
1015     return is_uni_alnumc(c);    /* XXX no locale support yet */
1016 }
1017
1018 bool
1019 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1020 {
1021     return is_uni_idfirst(c);   /* XXX no locale support yet */
1022 }
1023
1024 bool
1025 Perl_is_uni_alpha_lc(pTHX_ UV c)
1026 {
1027     return is_uni_alpha(c);     /* XXX no locale support yet */
1028 }
1029
1030 bool
1031 Perl_is_uni_ascii_lc(pTHX_ UV c)
1032 {
1033     return is_uni_ascii(c);     /* XXX no locale support yet */
1034 }
1035
1036 bool
1037 Perl_is_uni_space_lc(pTHX_ UV c)
1038 {
1039     return is_uni_space(c);     /* XXX no locale support yet */
1040 }
1041
1042 bool
1043 Perl_is_uni_digit_lc(pTHX_ UV c)
1044 {
1045     return is_uni_digit(c);     /* XXX no locale support yet */
1046 }
1047
1048 bool
1049 Perl_is_uni_upper_lc(pTHX_ UV c)
1050 {
1051     return is_uni_upper(c);     /* XXX no locale support yet */
1052 }
1053
1054 bool
1055 Perl_is_uni_lower_lc(pTHX_ UV c)
1056 {
1057     return is_uni_lower(c);     /* XXX no locale support yet */
1058 }
1059
1060 bool
1061 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1062 {
1063     return is_uni_cntrl(c);     /* XXX no locale support yet */
1064 }
1065
1066 bool
1067 Perl_is_uni_graph_lc(pTHX_ UV c)
1068 {
1069     return is_uni_graph(c);     /* XXX no locale support yet */
1070 }
1071
1072 bool
1073 Perl_is_uni_print_lc(pTHX_ UV c)
1074 {
1075     return is_uni_print(c);     /* XXX no locale support yet */
1076 }
1077
1078 bool
1079 Perl_is_uni_punct_lc(pTHX_ UV c)
1080 {
1081     return is_uni_punct(c);     /* XXX no locale support yet */
1082 }
1083
1084 bool
1085 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1086 {
1087     return is_uni_xdigit(c);    /* XXX no locale support yet */
1088 }
1089
1090 U32
1091 Perl_to_uni_upper_lc(pTHX_ U32 c)
1092 {
1093     /* XXX returns only the first character -- do not use XXX */
1094     /* XXX no locale support yet */
1095     STRLEN len;
1096     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1097     return (U32)to_uni_upper(c, tmpbuf, &len);
1098 }
1099
1100 U32
1101 Perl_to_uni_title_lc(pTHX_ U32 c)
1102 {
1103     /* XXX returns only the first character XXX -- do not use XXX */
1104     /* XXX no locale support yet */
1105     STRLEN len;
1106     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1107     return (U32)to_uni_title(c, tmpbuf, &len);
1108 }
1109
1110 U32
1111 Perl_to_uni_lower_lc(pTHX_ U32 c)
1112 {
1113     /* XXX returns only the first character -- do not use XXX */
1114     /* XXX no locale support yet */
1115     STRLEN len;
1116     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1117     return (U32)to_uni_lower(c, tmpbuf, &len);
1118 }
1119
1120 bool
1121 Perl_is_utf8_alnum(pTHX_ U8 *p)
1122 {
1123     if (!is_utf8_char(p))
1124         return FALSE;
1125     if (!PL_utf8_alnum)
1126         /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1127          * descendant of isalnum(3), in other words, it doesn't
1128          * contain the '_'. --jhi */
1129         PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1130     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1131 /*    return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1132 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1133     if (!PL_utf8_alnum)
1134         PL_utf8_alnum = swash_init("utf8", "",
1135             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1136     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1137 #endif
1138 }
1139
1140 bool
1141 Perl_is_utf8_alnumc(pTHX_ U8 *p)
1142 {
1143     if (!is_utf8_char(p))
1144         return FALSE;
1145     if (!PL_utf8_alnum)
1146         PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1147     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1148 /*    return is_utf8_alpha(p) || is_utf8_digit(p); */
1149 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1150     if (!PL_utf8_alnum)
1151         PL_utf8_alnum = swash_init("utf8", "",
1152             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1153     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1154 #endif
1155 }
1156
1157 bool
1158 Perl_is_utf8_idfirst(pTHX_ U8 *p) /* The naming is historical. */
1159 {
1160     if (*p == '_')
1161         return TRUE;
1162     if (!is_utf8_char(p))
1163         return FALSE;
1164     if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
1165         PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
1166     return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
1167 }
1168
1169 bool
1170 Perl_is_utf8_idcont(pTHX_ U8 *p)
1171 {
1172     if (*p == '_')
1173         return TRUE;
1174     if (!is_utf8_char(p))
1175         return FALSE;
1176     if (!PL_utf8_idcont)
1177         PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
1178     return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
1179 }
1180
1181 bool
1182 Perl_is_utf8_alpha(pTHX_ U8 *p)
1183 {
1184     if (!is_utf8_char(p))
1185         return FALSE;
1186     if (!PL_utf8_alpha)
1187         PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1188     return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
1189 }
1190
1191 bool
1192 Perl_is_utf8_ascii(pTHX_ U8 *p)
1193 {
1194     if (!is_utf8_char(p))
1195         return FALSE;
1196     if (!PL_utf8_ascii)
1197         PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1198     return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
1199 }
1200
1201 bool
1202 Perl_is_utf8_space(pTHX_ U8 *p)
1203 {
1204     if (!is_utf8_char(p))
1205         return FALSE;
1206     if (!PL_utf8_space)
1207         PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1208     return swash_fetch(PL_utf8_space, p, TRUE) != 0;
1209 }
1210
1211 bool
1212 Perl_is_utf8_digit(pTHX_ U8 *p)
1213 {
1214     if (!is_utf8_char(p))
1215         return FALSE;
1216     if (!PL_utf8_digit)
1217         PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1218     return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
1219 }
1220
1221 bool
1222 Perl_is_utf8_upper(pTHX_ U8 *p)
1223 {
1224     if (!is_utf8_char(p))
1225         return FALSE;
1226     if (!PL_utf8_upper)
1227         PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1228     return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
1229 }
1230
1231 bool
1232 Perl_is_utf8_lower(pTHX_ U8 *p)
1233 {
1234     if (!is_utf8_char(p))
1235         return FALSE;
1236     if (!PL_utf8_lower)
1237         PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1238     return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
1239 }
1240
1241 bool
1242 Perl_is_utf8_cntrl(pTHX_ U8 *p)
1243 {
1244     if (!is_utf8_char(p))
1245         return FALSE;
1246     if (!PL_utf8_cntrl)
1247         PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1248     return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
1249 }
1250
1251 bool
1252 Perl_is_utf8_graph(pTHX_ U8 *p)
1253 {
1254     if (!is_utf8_char(p))
1255         return FALSE;
1256     if (!PL_utf8_graph)
1257         PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1258     return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
1259 }
1260
1261 bool
1262 Perl_is_utf8_print(pTHX_ U8 *p)
1263 {
1264     if (!is_utf8_char(p))
1265         return FALSE;
1266     if (!PL_utf8_print)
1267         PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1268     return swash_fetch(PL_utf8_print, p, TRUE) != 0;
1269 }
1270
1271 bool
1272 Perl_is_utf8_punct(pTHX_ U8 *p)
1273 {
1274     if (!is_utf8_char(p))
1275         return FALSE;
1276     if (!PL_utf8_punct)
1277         PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1278     return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
1279 }
1280
1281 bool
1282 Perl_is_utf8_xdigit(pTHX_ U8 *p)
1283 {
1284     if (!is_utf8_char(p))
1285         return FALSE;
1286     if (!PL_utf8_xdigit)
1287         PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1288     return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
1289 }
1290
1291 bool
1292 Perl_is_utf8_mark(pTHX_ U8 *p)
1293 {
1294     if (!is_utf8_char(p))
1295         return FALSE;
1296     if (!PL_utf8_mark)
1297         PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1298     return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
1299 }
1300
1301 /*
1302 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1303
1304 The "p" contains the pointer to the UTF-8 string encoding
1305 the character that is being converted.
1306
1307 The "ustrp" is a pointer to the character buffer to put the
1308 conversion result to.  The "lenp" is a pointer to the length
1309 of the result.
1310
1311 The "swashp" is a pointer to the swash to use.
1312
1313 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1314 and loaded by SWASHGET, using lib/utf8_heavy.pl.  The special (usually,
1315 but not always, a multicharacter mapping), is tried first.
1316
1317 The "special" is a string like "utf8::ToSpecLower", which means the
1318 hash %utf8::ToSpecLower.  The access to the hash is through
1319 Perl_to_utf8_case().
1320
1321 The "normal" is a string like "ToLower" which means the swash
1322 %utf8::ToLower.
1323
1324 =cut */
1325
1326 UV
1327 Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
1328 {
1329     UV uv0, uv1;
1330     U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
1331     STRLEN len = 0;
1332
1333     uv0 = utf8_to_uvchr(p, 0);
1334     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1335      * are necessary in EBCDIC, they are redundant no-ops
1336      * in ASCII-ish platforms, and hopefully optimized away. */
1337     uv1 = NATIVE_TO_UNI(uv0);
1338     uvuni_to_utf8(tmpbuf, uv1);
1339
1340     if (!*swashp) /* load on-demand */
1341          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1342
1343     if (special) {
1344          /* It might be "special" (sometimes, but not always,
1345           * a multicharacter mapping) */
1346          HV *hv;
1347          SV *keysv;
1348          HE *he;
1349          SV *val;
1350
1351          if ((hv    = get_hv(special, FALSE)) &&
1352              (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv1))) &&
1353              (he    = hv_fetch_ent(hv, keysv, FALSE, 0)) &&
1354              (val   = HeVAL(he))) {
1355              char *s;
1356
1357               s = SvPV(val, len);
1358               if (len == 1)
1359                    len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1360               else {
1361 #ifdef EBCDIC
1362                    /* If we have EBCDIC we need to remap the characters
1363                     * since any characters in the low 256 are Unicode
1364                     * code points, not EBCDIC. */
1365                    U8 *t = (U8*)s, *tend = t + len, *d;
1366
1367                    d = tmpbuf;
1368                    if (SvUTF8(val)) {
1369                         STRLEN tlen = 0;
1370
1371                         while (t < tend) {
1372                              UV c = utf8_to_uvchr(t, &tlen);
1373                              if (tlen > 0) {
1374                                   d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1375                                   t += tlen;
1376                              }
1377                              else
1378                                   break;
1379                         }
1380                    }
1381                    else {
1382                         while (t < tend) {
1383                              d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1384                              t++;
1385                         }
1386                    }
1387                    len = d - tmpbuf;
1388                    Copy(tmpbuf, ustrp, len, U8);
1389 #else
1390                    Copy(s, ustrp, len, U8);
1391 #endif
1392               }
1393          }
1394     }
1395
1396     if (!len && *swashp) {
1397          UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1398
1399          if (uv2) {
1400               /* It was "normal" (a single character mapping). */
1401               UV uv3 = UNI_TO_NATIVE(uv2);
1402
1403               len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1404          }
1405     }
1406
1407     if (!len) /* Neither: just copy. */
1408          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1409
1410     if (lenp)
1411          *lenp = len;
1412
1413     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1414 }
1415
1416 /*
1417 =for apidoc A|UV|to_utf8_upper|U8 *p|U8 *ustrp|STRLEN *lenp
1418
1419 Convert the UTF-8 encoded character at p to its uppercase version and
1420 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1421 that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1422 uppercase version may be longer than the original character (up to two
1423 characters).
1424
1425 The first character of the uppercased version is returned
1426 (but note, as explained above, that there may be more.)
1427
1428 =cut */
1429
1430 UV
1431 Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1432 {
1433     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1434                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1435 }
1436
1437 /*
1438 =for apidoc A|UV|to_utf8_title|U8 *p|U8 *ustrp|STRLEN *lenp
1439
1440 Convert the UTF-8 encoded character at p to its titlecase version and
1441 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1442 that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1443 titlecase version may be longer than the original character (up to two
1444 characters).
1445
1446 The first character of the titlecased version is returned
1447 (but note, as explained above, that there may be more.)
1448
1449 =cut */
1450
1451 UV
1452 Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1453 {
1454     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1455                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1456 }
1457
1458 /*
1459 =for apidoc A|UV|to_utf8_lower|U8 *p|U8 *ustrp|STRLEN *lenp
1460
1461 Convert the UTF-8 encoded character at p to its lowercase version and
1462 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1463 that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1464 lowercase version may be longer than the original character (up to two
1465 characters).
1466
1467 The first character of the lowercased version is returned
1468 (but note, as explained above, that there may be more.)
1469
1470 =cut */
1471
1472 UV
1473 Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1474 {
1475     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1476                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1477 }
1478
1479 /*
1480 =for apidoc A|UV|to_utf8_fold|U8 *p|U8 *ustrp|STRLEN *lenp
1481
1482 Convert the UTF-8 encoded character at p to its foldcase version and
1483 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1484 that the ustrp needs to be at least UTF8_MAXLEN_FOLD+1 bytes since the
1485 foldcase version may be longer than the original character (up to
1486 three characters).
1487
1488 The first character of the foldcased version is returned
1489 (but note, as explained above, that there may be more.)
1490
1491 =cut */
1492
1493 UV
1494 Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1495 {
1496     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1497                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1498 }
1499
1500 /* a "swash" is a swatch hash */
1501
1502 SV*
1503 Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1504 {
1505     SV* retval;
1506     SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1507     dSP;
1508     HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
1509     SV* errsv_save;
1510
1511     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1512         ENTER;
1513         errsv_save = newSVsv(ERRSV);
1514         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1515         if (!SvTRUE(ERRSV))
1516             sv_setsv(ERRSV, errsv_save);
1517         SvREFCNT_dec(errsv_save);
1518         LEAVE;
1519     }
1520     SPAGAIN;
1521     PUSHSTACKi(PERLSI_MAGIC);
1522     PUSHMARK(SP);
1523     EXTEND(SP,5);
1524     PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1525     PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1526     PUSHs(listsv);
1527     PUSHs(sv_2mortal(newSViv(minbits)));
1528     PUSHs(sv_2mortal(newSViv(none)));
1529     PUTBACK;
1530     ENTER;
1531     SAVEI32(PL_hints);
1532     PL_hints = 0;
1533     save_re_context();
1534     if (PL_curcop == &PL_compiling) {
1535         /* XXX ought to be handled by lex_start */
1536         SAVEI32(PL_in_my);
1537         sv_setpv(tokenbufsv, PL_tokenbuf);
1538     }
1539     errsv_save = newSVsv(ERRSV);
1540     if (call_method("SWASHNEW", G_SCALAR))
1541         retval = newSVsv(*PL_stack_sp--);
1542     else
1543         retval = &PL_sv_undef;
1544     if (!SvTRUE(ERRSV))
1545         sv_setsv(ERRSV, errsv_save);
1546     SvREFCNT_dec(errsv_save);
1547     LEAVE;
1548     POPSTACK;
1549     if (PL_curcop == &PL_compiling) {
1550         STRLEN len;
1551         char* pv = SvPV(tokenbufsv, len);
1552
1553         Copy(pv, PL_tokenbuf, len+1, char);
1554         PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1555     }
1556     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1557         if (SvPOK(retval))
1558             Perl_croak(aTHX_ "Can't find Unicode property definition \"%s\"",
1559                        SvPV_nolen(retval));
1560         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1561     }
1562     return retval;
1563 }
1564
1565
1566 /* This API is wrong for special case conversions since we may need to
1567  * return several Unicode characters for a single Unicode character
1568  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1569  * the lower-level routine, and it is similarly broken for returning
1570  * multiple values.  --jhi */
1571 UV
1572 Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
1573 {
1574     HV* hv = (HV*)SvRV(sv);
1575     U32 klen;
1576     U32 off;
1577     STRLEN slen;
1578     STRLEN needents;
1579     U8 *tmps = NULL;
1580     U32 bit;
1581     SV *retval;
1582     U8 tmputf8[2];
1583     UV c = NATIVE_TO_ASCII(*ptr);
1584
1585     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1586         tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1587         tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1588         ptr = tmputf8;
1589     }
1590     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1591      * then the "swatch" is a vec() for al the chars which start
1592      * with 0xAA..0xYY
1593      * So the key in the hash (klen) is length of encoded char -1
1594      */
1595     klen = UTF8SKIP(ptr) - 1;
1596     off  = ptr[klen];
1597
1598     if (klen == 0)
1599      {
1600       /* If char in invariant then swatch is for all the invariant chars
1601        * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1602        */
1603       needents = UTF_CONTINUATION_MARK;
1604       off      = NATIVE_TO_UTF(ptr[klen]);
1605      }
1606     else
1607      {
1608       /* If char is encoded then swatch is for the prefix */
1609       needents = (1 << UTF_ACCUMULATION_SHIFT);
1610       off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1611      }
1612
1613     /*
1614      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1615      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1616      * it's nothing to sniff at.)  Pity we usually come through at least
1617      * two function calls to get here...
1618      *
1619      * NB: this code assumes that swatches are never modified, once generated!
1620      */
1621
1622     if (hv   == PL_last_swash_hv &&
1623         klen == PL_last_swash_klen &&
1624         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1625     {
1626         tmps = PL_last_swash_tmps;
1627         slen = PL_last_swash_slen;
1628     }
1629     else {
1630         /* Try our second-level swatch cache, kept in a hash. */
1631         SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1632
1633         /* If not cached, generate it via utf8::SWASHGET */
1634         if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1635             dSP;
1636             /* We use utf8n_to_uvuni() as we want an index into
1637                Unicode tables, not a native character number.
1638              */
1639             UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, 0,
1640                                            ckWARN(WARN_UTF8) ?
1641                                            0 : UTF8_ALLOW_ANY);
1642             SV *errsv_save;
1643             ENTER;
1644             SAVETMPS;
1645             save_re_context();
1646             PUSHSTACKi(PERLSI_MAGIC);
1647             PUSHMARK(SP);
1648             EXTEND(SP,3);
1649             PUSHs((SV*)sv);
1650             /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1651             PUSHs(sv_2mortal(newSViv((klen) ?
1652                                      (code_point & ~(needents - 1)) : 0)));
1653             PUSHs(sv_2mortal(newSViv(needents)));
1654             PUTBACK;
1655             errsv_save = newSVsv(ERRSV);
1656             if (call_method("SWASHGET", G_SCALAR))
1657                 retval = newSVsv(*PL_stack_sp--);
1658             else
1659                 retval = &PL_sv_undef;
1660             if (!SvTRUE(ERRSV))
1661                 sv_setsv(ERRSV, errsv_save);
1662             SvREFCNT_dec(errsv_save);
1663             POPSTACK;
1664             FREETMPS;
1665             LEAVE;
1666             if (PL_curcop == &PL_compiling)
1667                 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1668
1669             svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1670
1671             if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1672                 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1673         }
1674
1675         PL_last_swash_hv = hv;
1676         PL_last_swash_klen = klen;
1677         PL_last_swash_tmps = tmps;
1678         PL_last_swash_slen = slen;
1679         if (klen)
1680             Copy(ptr, PL_last_swash_key, klen, U8);
1681     }
1682
1683     switch ((int)((slen << 3) / needents)) {
1684     case 1:
1685         bit = 1 << (off & 7);
1686         off >>= 3;
1687         return (tmps[off] & bit) != 0;
1688     case 8:
1689         return tmps[off];
1690     case 16:
1691         off <<= 1;
1692         return (tmps[off] << 8) + tmps[off + 1] ;
1693     case 32:
1694         off <<= 2;
1695         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1696     }
1697     Perl_croak(aTHX_ "panic: swash_fetch");
1698     return 0;
1699 }
1700
1701
1702 /*
1703 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1704
1705 Adds the UTF8 representation of the Native codepoint C<uv> to the end
1706 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1707 bytes available. The return value is the pointer to the byte after the
1708 end of the new character. In other words,
1709
1710     d = uvchr_to_utf8(d, uv);
1711
1712 is the recommended wide native character-aware way of saying
1713
1714     *(d++) = uv;
1715
1716 =cut
1717 */
1718
1719 /* On ASCII machines this is normally a macro but we want a
1720    real function in case XS code wants it
1721 */
1722 #undef Perl_uvchr_to_utf8
1723 U8 *
1724 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1725 {
1726     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
1727 }
1728
1729 U8 *
1730 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1731 {
1732     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1733 }
1734
1735 /*
1736 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1737
1738 Returns the native character value of the first character in the string C<s>
1739 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1740 length, in bytes, of that character.
1741
1742 Allows length and flags to be passed to low level routine.
1743
1744 =cut
1745 */
1746 /* On ASCII machines this is normally a macro but we want
1747    a real function in case XS code wants it
1748 */
1749 #undef Perl_utf8n_to_uvchr
1750 UV
1751 Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1752 {
1753     UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1754     return UNI_TO_NATIVE(uv);
1755 }
1756
1757 /*
1758 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1759
1760 Build to the scalar dsv a displayable version of the string spv,
1761 length len, the displayable version being at most pvlim bytes long
1762 (if longer, the rest is truncated and "..." will be appended).
1763
1764 The flags argument can have UNI_DISPLAY_ISPRINT set to display
1765 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
1766 to display the \\[nrfta\\] as the backslashed versions (like '\n')
1767 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1768 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1769 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1770
1771 The pointer to the PV of the dsv is returned.
1772
1773 =cut */
1774 char *
1775 Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1776 {
1777     int truncated = 0;
1778     char *s, *e;
1779
1780     sv_setpvn(dsv, "", 0);
1781     for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1782          UV u;
1783          bool ok = FALSE;
1784
1785          if (pvlim && SvCUR(dsv) >= pvlim) {
1786               truncated++;
1787               break;
1788          }
1789          u = utf8_to_uvchr((U8*)s, 0);
1790          if (u < 256) {
1791              if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1792                  switch (u & 0xFF) {
1793                  case '\n':
1794                      Perl_sv_catpvf(aTHX_ dsv, "\\n"); ok = TRUE; break;
1795                  case '\r':
1796                      Perl_sv_catpvf(aTHX_ dsv, "\\r"); ok = TRUE; break;
1797                  case '\t':
1798                      Perl_sv_catpvf(aTHX_ dsv, "\\t"); ok = TRUE; break;
1799                  case '\f':
1800                      Perl_sv_catpvf(aTHX_ dsv, "\\f"); ok = TRUE; break;
1801                  case '\a':
1802                      Perl_sv_catpvf(aTHX_ dsv, "\\a"); ok = TRUE; break;
1803                  case '\\':
1804                      Perl_sv_catpvf(aTHX_ dsv, "\\\\" ); ok = TRUE; break;
1805                  default: break;
1806                  }
1807              }
1808              /* isPRINT() is the locale-blind version. */
1809              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(u & 0xFF)) {
1810                  Perl_sv_catpvf(aTHX_ dsv, "%c", (char)(u & 0xFF));
1811                  ok = TRUE;
1812              }
1813          }
1814          if (!ok)
1815              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1816     }
1817     if (truncated)
1818          sv_catpvn(dsv, "...", 3);
1819
1820     return SvPVX(dsv);
1821 }
1822
1823 /*
1824 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1825
1826 Build to the scalar dsv a displayable version of the scalar sv,
1827 the displayable version being at most pvlim bytes long
1828 (if longer, the rest is truncated and "..." will be appended).
1829
1830 The flags argument is as in pv_uni_display().
1831
1832 The pointer to the PV of the dsv is returned.
1833
1834 =cut */
1835 char *
1836 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1837 {
1838      return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1839                                 pvlim, flags);
1840 }
1841
1842 /*
1843 =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
1844
1845 Return true if the strings s1 and s2 differ case-insensitively, false
1846 if not (if they are equal case-insensitively).  If u1 is true, the
1847 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
1848 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
1849 are false, the respective string is assumed to be in native 8-bit
1850 encoding.
1851
1852 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1853 in there (they will point at the beginning of the I<next> character).
1854 If the pointers behind pe1 or pe2 are non-NULL, they are the end
1855 pointers beyond which scanning will not continue under any
1856 circustances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
1857 s2+l2 will be used as goal end pointers that will also stop the scan,
1858 and which qualify towards defining a successful match: all the scans
1859 that define an explicit length must reach their goal pointers for
1860 a match to succeed).
1861
1862 For case-insensitiveness, the "casefolding" of Unicode is used
1863 instead of upper/lowercasing both the characters, see
1864 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1865
1866 =cut */
1867 I32
1868 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
1869 {
1870      register U8 *p1  = (U8*)s1;
1871      register U8 *p2  = (U8*)s2;
1872      register U8 *e1 = 0, *f1 = 0, *q1 = 0;
1873      register U8 *e2 = 0, *f2 = 0, *q2 = 0;
1874      STRLEN n1 = 0, n2 = 0;
1875      U8 foldbuf1[UTF8_MAXLEN_FOLD+1];
1876      U8 foldbuf2[UTF8_MAXLEN_FOLD+1];
1877      U8 natbuf[1+1];
1878      STRLEN foldlen1, foldlen2;
1879      bool match;
1880
1881      if (pe1)
1882           e1 = *(U8**)pe1;
1883      if (e1 == 0 || (l1 && l1 < (UV)(e1 - (U8*)s1)))
1884           f1 = (U8*)s1 + l1;
1885      if (pe2)
1886           e2 = *(U8**)pe2;
1887      if (e2 == 0 || (l2 && l2 < (UV)(e2 - (U8*)s2)))
1888           f2 = (U8*)s2 + l2;
1889
1890      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1891           return 1; /* mismatch; possible infinite loop or false positive */
1892
1893      if (!u1 || !u2)
1894           natbuf[1] = 0; /* Need to terminate the buffer. */
1895
1896      while ((e1 == 0 || p1 < e1) &&
1897             (f1 == 0 || p1 < f1) &&
1898             (e2 == 0 || p2 < e2) &&
1899             (f2 == 0 || p2 < f2)) {
1900           if (n1 == 0) {
1901                if (u1)
1902                     to_utf8_fold(p1, foldbuf1, &foldlen1);
1903                else {
1904                     natbuf[0] = *p1;
1905                     to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1906                }
1907                q1 = foldbuf1;
1908                n1 = foldlen1;
1909           }
1910           if (n2 == 0) {
1911                if (u2)
1912                     to_utf8_fold(p2, foldbuf2, &foldlen2);
1913                else {
1914                     natbuf[0] = *p2;
1915                     to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1916                }
1917                q2 = foldbuf2;
1918                n2 = foldlen2;
1919           }
1920           while (n1 && n2) {
1921                if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
1922                    (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
1923                     memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
1924                    return 1; /* mismatch */
1925                n1 -= UTF8SKIP(q1);
1926                q1 += UTF8SKIP(q1);
1927                n2 -= UTF8SKIP(q2);
1928                q2 += UTF8SKIP(q2);
1929           }
1930           if (n1 == 0)
1931                p1 += u1 ? UTF8SKIP(p1) : 1;
1932           if (n2 == 0)
1933                p2 += u2 ? UTF8SKIP(p2) : 1;
1934
1935      }
1936
1937      /* A match is defined by all the scans that specified
1938       * an explicit length reaching their final goals. */
1939      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
1940
1941      if (match) {
1942           if (pe1)
1943                *pe1 = (char*)p1;
1944           if (pe2)
1945                *pe2 = (char*)p2;
1946      }
1947
1948      return match ? 0 : 1; /* 0 match, 1 mismatch */
1949 }
1950