utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (c) 1998-2002, Larry Wall
   4  *
   5  *    You may distribute under the terms of either the GNU General Public
   6  *    License or the Artistic License, as specified in the README file.
   7  *
   8  */
   9
  10 /*
  11  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  12  * heard of that we don't want to see any closer; and that's the one place
  13  * we're trying to get to!  And that's just where we can't get, nohow.'
  14  *
  15  * 'Well do I understand your speech,' he answered in the same language;
  16  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  17  * as is the custom in the West, if you wish to be answered?'
  18  *
  19  * ...the travellers perceived that the floor was paved with stones of many
  20  * hues; branching runes and strange devices intertwined beneath their feet.
  21  */
  22
  23 #include "EXTERN.h"
  24 #define PERL_IN_UTF8_C
  25 #include "perl.h"
  26
  27 static char unees[] = "Malformed UTF-8 character (unexpected end of string)";
  28
  29 /*
  30 =head1 Unicode Support
  31
  32 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
  33
  34 Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
  35 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
  36 bytes available. The return value is the pointer to the byte after the
  37 end of the new character. In other words,
  38
  39     d = uvuni_to_utf8_flags(d, uv, flags);
  40
  41 or, in most cases,
  42
  43     d = uvuni_to_utf8(d, uv);
  44
  45 (which is equivalent to)
  46
  47     d = uvuni_to_utf8_flags(d, uv, 0);
  48
  49 is the recommended Unicode-aware way of saying
  50
  51     *(d++) = uv;
  52
  53 =cut
  54 */
  55
  56 U8 *
  57 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  58 {
  59     if (ckWARN(WARN_UTF8)) {
  60          if (UNICODE_IS_SURROGATE(uv) &&
  61              !(flags & UNICODE_ALLOW_SURROGATE))
  62               Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
  63          else if (
  64                   ((uv >= 0xFDD0 && uv <= 0xFDEF &&
  65                     !(flags & UNICODE_ALLOW_FDD0))
  66                    ||
  67                    ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
  68                     !(flags & UNICODE_ALLOW_FFFF))) &&
  69                   /* UNICODE_ALLOW_SUPER includes
  70                    * FFFEs and FFFFs beyond 0x10FFFF. */
  71                   ((uv <= PERL_UNICODE_MAX) ||
  72                    !(flags & UNICODE_ALLOW_SUPER))
  73                   )
  74               Perl_warner(aTHX_ packWARN(WARN_UTF8),
  75                          "Unicode character 0x%04"UVxf" is illegal", uv);
  76     }
  77     if (UNI_IS_INVARIANT(uv)) {
  78         *d++ = (U8)UTF_TO_NATIVE(uv);
  79         return d;
  80     }
  81 #if defined(EBCDIC)
  82     else {
  83         STRLEN len  = UNISKIP(uv);
  84         U8 *p = d+len-1;
  85         while (p > d) {
  86             *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
  87             uv >>= UTF_ACCUMULATION_SHIFT;
  88         }
  89         *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
  90         return d+len;
  91     }
  92 #else /* Non loop style */
  93     if (uv < 0x800) {
  94         *d++ = (U8)(( uv >>  6)         | 0xc0);
  95         *d++ = (U8)(( uv        & 0x3f) | 0x80);
  96         return d;
  97     }
  98     if (uv < 0x10000) {
  99         *d++ = (U8)(( uv >> 12)         | 0xe0);
 100         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 101         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 102         return d;
 103     }
 104     if (uv < 0x200000) {
 105         *d++ = (U8)(( uv >> 18)         | 0xf0);
 106         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 107         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 108         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 109         return d;
 110     }
 111     if (uv < 0x4000000) {
 112         *d++ = (U8)(( uv >> 24)         | 0xf8);
 113         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 114         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 115         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 116         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 117         return d;
 118     }
 119     if (uv < 0x80000000) {
 120         *d++ = (U8)(( uv >> 30)         | 0xfc);
 121         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 122         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 123         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 124         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 125         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 126         return d;
 127     }
 128 #ifdef HAS_QUAD
 129     if (uv < UTF8_QUAD_MAX)
 130 #endif
 131     {
 132         *d++ =                            0xfe; /* Can't match U+FEFF! */
 133         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 134         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 135         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 136         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 137         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 138         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 139         return d;
 140     }
 141 #ifdef HAS_QUAD
 142     {
 143         *d++ =                            0xff;         /* Can't match U+FFFE! */
 144         *d++ =                            0x80;         /* 6 Reserved bits */
 145         *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);        /* 2 Reserved bits */
 146         *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
 147         *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
 148         *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
 149         *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
 150         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 151         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 152         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 153         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 154         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 155         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 156         return d;
 157     }
 158 #endif
 159 #endif /* Loop style */
 160 }
 161
 162 U8 *
 163 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
 164 {
 165     return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
 166 }
 167
 168
 169 /*
 170 =for apidoc A|STRLEN|is_utf8_char|U8 *s
 171
 172 Tests if some arbitrary number of bytes begins in a valid UTF-8
 173 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 174 UTF-8 character.  The actual number of bytes in the UTF-8 character
 175 will be returned if it is valid, otherwise 0.
 176
 177 =cut */
 178 STRLEN
 179 Perl_is_utf8_char(pTHX_ U8 *s)
 180 {
 181     U8 u = *s;
 182     STRLEN slen, len;
 183     UV uv, ouv;
 184
 185     if (UTF8_IS_INVARIANT(u))
 186         return 1;
 187
 188     if (!UTF8_IS_START(u))
 189         return 0;
 190
 191     len = UTF8SKIP(s);
 192
 193     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 194         return 0;
 195
 196     slen = len - 1;
 197     s++;
 198     u &= UTF_START_MASK(len);
 199     uv  = u;
 200     ouv = uv;
 201     while (slen--) {
 202         if (!UTF8_IS_CONTINUATION(*s))
 203             return 0;
 204         uv = UTF8_ACCUMULATE(uv, *s);
 205         if (uv < ouv)
 206             return 0;
 207         ouv = uv;
 208         s++;
 209     }
 210
 211     if ((STRLEN)UNISKIP(uv) < len)
 212         return 0;
 213
 214     return len;
 215 }
 216
 217 /*
 218 =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
 219
 220 Returns true if first C<len> bytes of the given string form a valid UTF8
 221 string, false otherwise.  Note that 'a valid UTF8 string' does not mean
 222 'a string that contains UTF8' because a valid ASCII string is a valid
 223 UTF8 string.
 224
 225 =cut
 226 */
 227
 228 bool
 229 Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
 230 {
 231     U8* x = s;
 232     U8* send;
 233     STRLEN c;
 234
 235     if (!len)
 236         len = strlen((char *)s);
 237     send = s + len;
 238
 239     while (x < send) {
 240         c = is_utf8_char(x);
 241         if (!c)
 242             return FALSE;
 243         x += c;
 244     }
 245     if (x != send)
 246         return FALSE;
 247
 248     return TRUE;
 249 }
 250
 251 /*
 252 =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 253
 254 Bottom level UTF-8 decode routine.
 255 Returns the unicode code point value of the first character in the string C<s>
 256 which is assumed to be in UTF8 encoding and no longer than C<curlen>;
 257 C<retlen> will be set to the length, in bytes, of that character.
 258
 259 If C<s> does not point to a well-formed UTF8 character, the behaviour
 260 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 261 it is assumed that the caller will raise a warning, and this function
 262 will silently just set C<retlen> to C<-1> and return zero.  If the
 263 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 264 malformations will be given, C<retlen> will be set to the expected
 265 length of the UTF-8 character in bytes, and zero will be returned.
 266
 267 The C<flags> can also contain various flags to allow deviations from
 268 the strict UTF-8 encoding (see F<utf8.h>).
 269
 270 Most code should use utf8_to_uvchr() rather than call this directly.
 271
 272 =cut
 273 */
 274
 275 UV
 276 Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 277 {
 278     U8 *s0 = s;
 279     UV uv = *s, ouv = 0;
 280     STRLEN len = 1;
 281     bool dowarn = ckWARN_d(WARN_UTF8);
 282     UV startbyte = *s;
 283     STRLEN expectlen = 0;
 284     U32 warning = 0;
 285
 286 /* This list is a superset of the UTF8_ALLOW_XXX. */
 287
 288 #define UTF8_WARN_EMPTY                          1
 289 #define UTF8_WARN_CONTINUATION                   2
 290 #define UTF8_WARN_NON_CONTINUATION               3
 291 #define UTF8_WARN_FE_FF                          4
 292 #define UTF8_WARN_SHORT                          5
 293 #define UTF8_WARN_OVERFLOW                       6
 294 #define UTF8_WARN_SURROGATE                      7
 295 #define UTF8_WARN_LONG                           8
 296 #define UTF8_WARN_FFFF                           9 /* Also FFFE. */
 297
 298     if (curlen == 0 &&
 299         !(flags & UTF8_ALLOW_EMPTY)) {
 300         warning = UTF8_WARN_EMPTY;
 301         goto malformed;
 302     }
 303
 304     if (UTF8_IS_INVARIANT(uv)) {
 305         if (retlen)
 306             *retlen = 1;
 307         return (UV) (NATIVE_TO_UTF(*s));
 308     }
 309
 310     if (UTF8_IS_CONTINUATION(uv) &&
 311         !(flags & UTF8_ALLOW_CONTINUATION)) {
 312         warning = UTF8_WARN_CONTINUATION;
 313         goto malformed;
 314     }
 315
 316     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 317         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 318         warning = UTF8_WARN_NON_CONTINUATION;
 319         goto malformed;
 320     }
 321
 322 #ifdef EBCDIC
 323     uv = NATIVE_TO_UTF(uv);
 324 #else
 325     if ((uv == 0xfe || uv == 0xff) &&
 326         !(flags & UTF8_ALLOW_FE_FF)) {
 327         warning = UTF8_WARN_FE_FF;
 328         goto malformed;
 329     }
 330 #endif
 331
 332     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 333     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 334     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 335     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 336 #ifdef EBCDIC
 337     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 338     else                        { len =  7; uv &= 0x01; }
 339 #else
 340     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 341     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 342     else                        { len = 13; uv = 0; } /* whoa! */
 343 #endif
 344
 345     if (retlen)
 346         *retlen = len;
 347
 348     expectlen = len;
 349
 350     if ((curlen < expectlen) &&
 351         !(flags & UTF8_ALLOW_SHORT)) {
 352         warning = UTF8_WARN_SHORT;
 353         goto malformed;
 354     }
 355
 356     len--;
 357     s++;
 358     ouv = uv;
 359
 360     while (len--) {
 361         if (!UTF8_IS_CONTINUATION(*s) &&
 362             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 363             s--;
 364             warning = UTF8_WARN_NON_CONTINUATION;
 365             goto malformed;
 366         }
 367         else
 368             uv = UTF8_ACCUMULATE(uv, *s);
 369         if (!(uv > ouv)) {
 370             /* These cannot be allowed. */
 371             if (uv == ouv) {
 372                 if (!(flags & UTF8_ALLOW_LONG)) {
 373                     warning = UTF8_WARN_LONG;
 374                     goto malformed;
 375                 }
 376             }
 377             else { /* uv < ouv */
 378                 /* This cannot be allowed. */
 379                 warning = UTF8_WARN_OVERFLOW;
 380                 goto malformed;
 381             }
 382         }
 383         s++;
 384         ouv = uv;
 385     }
 386
 387     if (UNICODE_IS_SURROGATE(uv) &&
 388         !(flags & UTF8_ALLOW_SURROGATE)) {
 389         warning = UTF8_WARN_SURROGATE;
 390         goto malformed;
 391     } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
 392                !(flags & UTF8_ALLOW_LONG)) {
 393         warning = UTF8_WARN_LONG;
 394         goto malformed;
 395     } else if (UNICODE_IS_ILLEGAL(uv) &&
 396                !(flags & UTF8_ALLOW_FFFF)) {
 397         warning = UTF8_WARN_FFFF;
 398         goto malformed;
 399     }
 400
 401     return uv;
 402
 403 malformed:
 404
 405     if (flags & UTF8_CHECK_ONLY) {
 406         if (retlen)
 407             *retlen = -1;
 408         return 0;
 409     }
 410
 411     if (dowarn) {
 412         SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
 413
 414         switch (warning) {
 415         case 0: /* Intentionally empty. */ break;
 416         case UTF8_WARN_EMPTY:
 417             Perl_sv_catpvf(aTHX_ sv, "(empty string)");
 418             break;
 419         case UTF8_WARN_CONTINUATION:
 420             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 421             break;
 422         case UTF8_WARN_NON_CONTINUATION:
 423             if (s == s0)
 424                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 425                            (UV)s[1], startbyte);
 426             else
 427                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 428                            (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
 429
 430             break;
 431         case UTF8_WARN_FE_FF:
 432             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 433             break;
 434         case UTF8_WARN_SHORT:
 435             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 436                            curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
 437             expectlen = curlen;         /* distance for caller to skip */
 438             break;
 439         case UTF8_WARN_OVERFLOW:
 440             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 441                            ouv, *s, startbyte);
 442             break;
 443         case UTF8_WARN_SURROGATE:
 444             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 445             break;
 446         case UTF8_WARN_LONG:
 447             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 448                            expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 449             break;
 450         case UTF8_WARN_FFFF:
 451             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 452             break;
 453         default:
 454             Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
 455             break;
 456         }
 457
 458         if (warning) {
 459             char *s = SvPVX(sv);
 460
 461             if (PL_op)
 462                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 463                             "%s in %s", s,  OP_DESC(PL_op));
 464             else
 465                 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
 466         }
 467     }
 468
 469     if (retlen)
 470         *retlen = expectlen ? expectlen : len;
 471
 472     return 0;
 473 }
 474
 475 /*
 476 =for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
 477
 478 Returns the native character value of the first character in the string C<s>
 479 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 480 length, in bytes, of that character.
 481
 482 If C<s> does not point to a well-formed UTF8 character, zero is
 483 returned and retlen is set, if possible, to -1.
 484
 485 =cut
 486 */
 487
 488 UV
 489 Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
 490 {
 491     return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen,
 492                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 493 }
 494
 495 /*
 496 =for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
 497
 498 Returns the Unicode code point of the first character in the string C<s>
 499 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 500 length, in bytes, of that character.
 501
 502 This function should only be used when returned UV is considered
 503 an index into the Unicode semantic tables (e.g. swashes).
 504
 505 If C<s> does not point to a well-formed UTF8 character, zero is
 506 returned and retlen is set, if possible, to -1.
 507
 508 =cut
 509 */
 510
 511 UV
 512 Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
 513 {
 514     /* Call the low level routine asking for checks */
 515     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen,
 516                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 517 }
 518
 519 /*
 520 =for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
 521
 522 Return the length of the UTF-8 char encoded string C<s> in characters.
 523 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 524 up past C<e>, croaks.
 525
 526 =cut
 527 */
 528
 529 STRLEN
 530 Perl_utf8_length(pTHX_ U8 *s, U8 *e)
 531 {
 532     STRLEN len = 0;
 533
 534     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 535      * the bitops (especially ~) can create illegal UTF-8.
 536      * In other words: in Perl UTF-8 is not just for Unicode. */
 537
 538     if (e < s) {
 539         if (ckWARN_d(WARN_UTF8)) {
 540             if (PL_op)
 541                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 542                             "%s in %s", unees, OP_DESC(PL_op));
 543             else
 544                 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 545         }
 546         return 0;
 547     }
 548     while (s < e) {
 549         U8 t = UTF8SKIP(s);
 550
 551         if (e - s < t) {
 552             if (ckWARN_d(WARN_UTF8)) {
 553                 if (PL_op)
 554                     Perl_warner(aTHX_ packWARN(WARN_UTF8),
 555                                 unees, OP_DESC(PL_op));
 556                 else
 557                     Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 558             }
 559             return len;
 560         }
 561         s += t;
 562         len++;
 563     }
 564
 565     return len;
 566 }
 567
 568 /*
 569 =for apidoc A|IV|utf8_distance|U8 *a|U8 *b
 570
 571 Returns the number of UTF8 characters between the UTF-8 pointers C<a>
 572 and C<b>.
 573
 574 WARNING: use only if you *know* that the pointers point inside the
 575 same UTF-8 buffer.
 576
 577 =cut
 578 */
 579
 580 IV
 581 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
 582 {
 583     IV off = 0;
 584
 585     /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
 586      * the bitops (especially ~) can create illegal UTF-8.
 587      * In other words: in Perl UTF-8 is not just for Unicode. */
 588
 589     if (a < b) {
 590         while (a < b) {
 591             U8 c = UTF8SKIP(a);
 592
 593             if (b - a < c) {
 594                 if (ckWARN_d(WARN_UTF8)) {
 595                     if (PL_op)
 596                         Perl_warner(aTHX_ packWARN(WARN_UTF8),
 597                                     "%s in %s", unees, OP_DESC(PL_op));
 598                     else
 599                         Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 600                 }
 601                 return off;
 602             }
 603             a += c;
 604             off--;
 605         }
 606     }
 607     else {
 608         while (b < a) {
 609             U8 c = UTF8SKIP(b);
 610
 611             if (a - b < c) {
 612                 if (ckWARN_d(WARN_UTF8)) {
 613                     if (PL_op)
 614                         Perl_warner(aTHX_ packWARN(WARN_UTF8),
 615                                     "%s in %s", unees, OP_DESC(PL_op));
 616                     else
 617                         Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 618                 }
 619                 return off;
 620             }
 621             b += c;
 622             off++;
 623         }
 624     }
 625
 626     return off;
 627 }
 628
 629 /*
 630 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
 631
 632 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 633 forward or backward.
 634
 635 WARNING: do not use the following unless you *know* C<off> is within
 636 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 637 on the first byte of character or just after the last byte of a character.
 638
 639 =cut
 640 */
 641
 642 U8 *
 643 Perl_utf8_hop(pTHX_ U8 *s, I32 off)
 644 {
 645     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 646      * the bitops (especially ~) can create illegal UTF-8.
 647      * In other words: in Perl UTF-8 is not just for Unicode. */
 648
 649     if (off >= 0) {
 650         while (off--)
 651             s += UTF8SKIP(s);
 652     }
 653     else {
 654         while (off++) {
 655             s--;
 656             while (UTF8_IS_CONTINUATION(*s))
 657                 s--;
 658         }
 659     }
 660     return s;
 661 }
 662
 663 /*
 664 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 665
 666 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 667 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 668 updates len to contain the new length.
 669 Returns zero on failure, setting C<len> to -1.
 670
 671 =cut
 672 */
 673
 674 U8 *
 675 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 676 {
 677     U8 *send;
 678     U8 *d;
 679     U8 *save = s;
 680
 681     /* ensure valid UTF8 and chars < 256 before updating string */
 682     for (send = s + *len; s < send; ) {
 683         U8 c = *s++;
 684
 685         if (!UTF8_IS_INVARIANT(c) &&
 686             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 687              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 688             *len = -1;
 689             return 0;
 690         }
 691     }
 692
 693     d = s = save;
 694     while (s < send) {
 695         STRLEN ulen;
 696         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 697         s += ulen;
 698     }
 699     *d = '\0';
 700     *len = d - save;
 701     return save;
 702 }
 703
 704 /*
 705 =for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
 706
 707 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 708 Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 709 the newly-created string, and updates C<len> to contain the new
 710 length.  Returns the original string if no conversion occurs, C<len>
 711 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 712 0 if C<s> is converted or contains all 7bit characters.
 713
 714 =cut
 715 */
 716
 717 U8 *
 718 Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
 719 {
 720     U8 *d;
 721     U8 *start = s;
 722     U8 *send;
 723     I32 count = 0;
 724
 725     if (!*is_utf8)
 726         return start;
 727
 728     /* ensure valid UTF8 and chars < 256 before converting string */
 729     for (send = s + *len; s < send;) {
 730         U8 c = *s++;
 731         if (!UTF8_IS_INVARIANT(c)) {
 732             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 733                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 734                 count++;
 735             else
 736                 return start;
 737         }
 738     }
 739
 740     *is_utf8 = 0;
 741
 742     Newz(801, d, (*len) - count + 1, U8);
 743     s = start; start = d;
 744     while (s < send) {
 745         U8 c = *s++;
 746         if (!UTF8_IS_INVARIANT(c)) {
 747             /* Then it is two-byte encoded */
 748             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 749             c = ASCII_TO_NATIVE(c);
 750         }
 751         *d++ = c;
 752     }
 753     *d = '\0';
 754     *len = d - start;
 755     return start;
 756 }
 757
 758 /*
 759 =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
 760
 761 Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
 762 Returns a pointer to the newly-created string, and sets C<len> to
 763 reflect the new length.
 764
 765 =cut
 766 */
 767
 768 U8*
 769 Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
 770 {
 771     U8 *send;
 772     U8 *d;
 773     U8 *dst;
 774     send = s + (*len);
 775
 776     Newz(801, d, (*len) * 2 + 1, U8);
 777     dst = d;
 778
 779     while (s < send) {
 780         UV uv = NATIVE_TO_ASCII(*s++);
 781         if (UNI_IS_INVARIANT(uv))
 782             *d++ = (U8)UTF_TO_NATIVE(uv);
 783         else {
 784             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
 785             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
 786         }
 787     }
 788     *d = '\0';
 789     *len = d-dst;
 790     return dst;
 791 }
 792
 793 /*
 794  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 795  *
 796  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 797  * We optimize for native, for obvious reasons. */
 798
 799 U8*
 800 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 801 {
 802     U8* pend;
 803     U8* dstart = d;
 804
 805     if (bytelen & 1)
 806         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
 807
 808     pend = p + bytelen;
 809
 810     while (p < pend) {
 811         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 812         p += 2;
 813         if (uv < 0x80) {
 814             *d++ = (U8)uv;
 815             continue;
 816         }
 817         if (uv < 0x800) {
 818             *d++ = (U8)(( uv >>  6)         | 0xc0);
 819             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 820             continue;
 821         }
 822         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 823             UV low = *p++;
 824             if (low < 0xdc00 || low >= 0xdfff)
 825                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 826             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 827         }
 828         if (uv < 0x10000) {
 829             *d++ = (U8)(( uv >> 12)         | 0xe0);
 830             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 831             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 832             continue;
 833         }
 834         else {
 835             *d++ = (U8)(( uv >> 18)         | 0xf0);
 836             *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 837             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 838             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 839             continue;
 840         }
 841     }
 842     *newlen = d - dstart;
 843     return d;
 844 }
 845
 846 /* Note: this one is slightly destructive of the source. */
 847
 848 U8*
 849 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 850 {
 851     U8* s = (U8*)p;
 852     U8* send = s + bytelen;
 853     while (s < send) {
 854         U8 tmp = s[0];
 855         s[0] = s[1];
 856         s[1] = tmp;
 857         s += 2;
 858     }
 859     return utf16_to_utf8(p, d, bytelen, newlen);
 860 }
 861
 862 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 863
 864 bool
 865 Perl_is_uni_alnum(pTHX_ UV c)
 866 {
 867     U8 tmpbuf[UTF8_MAXLEN+1];
 868     uvchr_to_utf8(tmpbuf, c);
 869     return is_utf8_alnum(tmpbuf);
 870 }
 871
 872 bool
 873 Perl_is_uni_alnumc(pTHX_ UV c)
 874 {
 875     U8 tmpbuf[UTF8_MAXLEN+1];
 876     uvchr_to_utf8(tmpbuf, c);
 877     return is_utf8_alnumc(tmpbuf);
 878 }
 879
 880 bool
 881 Perl_is_uni_idfirst(pTHX_ UV c)
 882 {
 883     U8 tmpbuf[UTF8_MAXLEN+1];
 884     uvchr_to_utf8(tmpbuf, c);
 885     return is_utf8_idfirst(tmpbuf);
 886 }
 887
 888 bool
 889 Perl_is_uni_alpha(pTHX_ UV c)
 890 {
 891     U8 tmpbuf[UTF8_MAXLEN+1];
 892     uvchr_to_utf8(tmpbuf, c);
 893     return is_utf8_alpha(tmpbuf);
 894 }
 895
 896 bool
 897 Perl_is_uni_ascii(pTHX_ UV c)
 898 {
 899     U8 tmpbuf[UTF8_MAXLEN+1];
 900     uvchr_to_utf8(tmpbuf, c);
 901     return is_utf8_ascii(tmpbuf);
 902 }
 903
 904 bool
 905 Perl_is_uni_space(pTHX_ UV c)
 906 {
 907     U8 tmpbuf[UTF8_MAXLEN+1];
 908     uvchr_to_utf8(tmpbuf, c);
 909     return is_utf8_space(tmpbuf);
 910 }
 911
 912 bool
 913 Perl_is_uni_digit(pTHX_ UV c)
 914 {
 915     U8 tmpbuf[UTF8_MAXLEN+1];
 916     uvchr_to_utf8(tmpbuf, c);
 917     return is_utf8_digit(tmpbuf);
 918 }
 919
 920 bool
 921 Perl_is_uni_upper(pTHX_ UV c)
 922 {
 923     U8 tmpbuf[UTF8_MAXLEN+1];
 924     uvchr_to_utf8(tmpbuf, c);
 925     return is_utf8_upper(tmpbuf);
 926 }
 927
 928 bool
 929 Perl_is_uni_lower(pTHX_ UV c)
 930 {
 931     U8 tmpbuf[UTF8_MAXLEN+1];
 932     uvchr_to_utf8(tmpbuf, c);
 933     return is_utf8_lower(tmpbuf);
 934 }
 935
 936 bool
 937 Perl_is_uni_cntrl(pTHX_ UV c)
 938 {
 939     U8 tmpbuf[UTF8_MAXLEN+1];
 940     uvchr_to_utf8(tmpbuf, c);
 941     return is_utf8_cntrl(tmpbuf);
 942 }
 943
 944 bool
 945 Perl_is_uni_graph(pTHX_ UV c)
 946 {
 947     U8 tmpbuf[UTF8_MAXLEN+1];
 948     uvchr_to_utf8(tmpbuf, c);
 949     return is_utf8_graph(tmpbuf);
 950 }
 951
 952 bool
 953 Perl_is_uni_print(pTHX_ UV c)
 954 {
 955     U8 tmpbuf[UTF8_MAXLEN+1];
 956     uvchr_to_utf8(tmpbuf, c);
 957     return is_utf8_print(tmpbuf);
 958 }
 959
 960 bool
 961 Perl_is_uni_punct(pTHX_ UV c)
 962 {
 963     U8 tmpbuf[UTF8_MAXLEN+1];
 964     uvchr_to_utf8(tmpbuf, c);
 965     return is_utf8_punct(tmpbuf);
 966 }
 967
 968 bool
 969 Perl_is_uni_xdigit(pTHX_ UV c)
 970 {
 971     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
 972     uvchr_to_utf8(tmpbuf, c);
 973     return is_utf8_xdigit(tmpbuf);
 974 }
 975
 976 UV
 977 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
 978 {
 979     uvchr_to_utf8(p, c);
 980     return to_utf8_upper(p, p, lenp);
 981 }
 982
 983 UV
 984 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
 985 {
 986     uvchr_to_utf8(p, c);
 987     return to_utf8_title(p, p, lenp);
 988 }
 989
 990 UV
 991 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
 992 {
 993     uvchr_to_utf8(p, c);
 994     return to_utf8_lower(p, p, lenp);
 995 }
 996
 997 UV
 998 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
 999 {
1000     uvchr_to_utf8(p, c);
1001     return to_utf8_fold(p, p, lenp);
1002 }
1003
1004 /* for now these all assume no locale info available for Unicode > 255 */
1005
1006 bool
1007 Perl_is_uni_alnum_lc(pTHX_ UV c)
1008 {
1009     return is_uni_alnum(c);     /* XXX no locale support yet */
1010 }
1011
1012 bool
1013 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1014 {
1015     return is_uni_alnumc(c);    /* XXX no locale support yet */
1016 }
1017
1018 bool
1019 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1020 {
1021     return is_uni_idfirst(c);   /* XXX no locale support yet */
1022 }
1023
1024 bool
1025 Perl_is_uni_alpha_lc(pTHX_ UV c)
1026 {
1027     return is_uni_alpha(c);     /* XXX no locale support yet */
1028 }
1029
1030 bool
1031 Perl_is_uni_ascii_lc(pTHX_ UV c)
1032 {
1033     return is_uni_ascii(c);     /* XXX no locale support yet */
1034 }
1035
1036 bool
1037 Perl_is_uni_space_lc(pTHX_ UV c)
1038 {
1039     return is_uni_space(c);     /* XXX no locale support yet */
1040 }
1041
1042 bool
1043 Perl_is_uni_digit_lc(pTHX_ UV c)
1044 {
1045     return is_uni_digit(c);     /* XXX no locale support yet */
1046 }
1047
1048 bool
1049 Perl_is_uni_upper_lc(pTHX_ UV c)
1050 {
1051     return is_uni_upper(c);     /* XXX no locale support yet */
1052 }
1053
1054 bool
1055 Perl_is_uni_lower_lc(pTHX_ UV c)
1056 {
1057     return is_uni_lower(c);     /* XXX no locale support yet */
1058 }
1059
1060 bool
1061 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1062 {
1063     return is_uni_cntrl(c);     /* XXX no locale support yet */
1064 }
1065
1066 bool
1067 Perl_is_uni_graph_lc(pTHX_ UV c)
1068 {
1069     return is_uni_graph(c);     /* XXX no locale support yet */
1070 }
1071
1072 bool
1073 Perl_is_uni_print_lc(pTHX_ UV c)
1074 {
1075     return is_uni_print(c);     /* XXX no locale support yet */
1076 }
1077
1078 bool
1079 Perl_is_uni_punct_lc(pTHX_ UV c)
1080 {
1081     return is_uni_punct(c);     /* XXX no locale support yet */
1082 }
1083
1084 bool
1085 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1086 {
1087     return is_uni_xdigit(c);    /* XXX no locale support yet */
1088 }
1089
1090 U32
1091 Perl_to_uni_upper_lc(pTHX_ U32 c)
1092 {
1093     /* XXX returns only the first character -- do not use XXX */
1094     /* XXX no locale support yet */
1095     STRLEN len;
1096     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1097     return (U32)to_uni_upper(c, tmpbuf, &len);
1098 }
1099
1100 U32
1101 Perl_to_uni_title_lc(pTHX_ U32 c)
1102 {
1103     /* XXX returns only the first character XXX -- do not use XXX */
1104     /* XXX no locale support yet */
1105     STRLEN len;
1106     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1107     return (U32)to_uni_title(c, tmpbuf, &len);
1108 }
1109
1110 U32
1111 Perl_to_uni_lower_lc(pTHX_ U32 c)
1112 {
1113     /* XXX returns only the first character -- do not use XXX */
1114     /* XXX no locale support yet */
1115     STRLEN len;
1116     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1117     return (U32)to_uni_lower(c, tmpbuf, &len);
1118 }
1119
1120 bool
1121 Perl_is_utf8_alnum(pTHX_ U8 *p)
1122 {
1123     if (!is_utf8_char(p))
1124         return FALSE;
1125     if (!PL_utf8_alnum)
1126         /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1127          * descendant of isalnum(3), in other words, it doesn't
1128          * contain the '_'. --jhi */
1129         PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1130     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1131 /*    return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1132 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1133     if (!PL_utf8_alnum)
1134         PL_utf8_alnum = swash_init("utf8", "",
1135             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1136     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1137 #endif
1138 }
1139
1140 bool
1141 Perl_is_utf8_alnumc(pTHX_ U8 *p)
1142 {
1143     if (!is_utf8_char(p))
1144         return FALSE;
1145     if (!PL_utf8_alnum)
1146         PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1147     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1148 /*    return is_utf8_alpha(p) || is_utf8_digit(p); */
1149 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1150     if (!PL_utf8_alnum)
1151         PL_utf8_alnum = swash_init("utf8", "",
1152             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1153     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1154 #endif
1155 }
1156
1157 bool
1158 Perl_is_utf8_idfirst(pTHX_ U8 *p) /* The naming is historical. */
1159 {
1160     if (*p == '_')
1161         return TRUE;
1162     if (!is_utf8_char(p))
1163         return FALSE;
1164     if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
1165         PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
1166     return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
1167 }
1168
1169 bool
1170 Perl_is_utf8_idcont(pTHX_ U8 *p)
1171 {
1172     if (*p == '_')
1173         return TRUE;
1174     if (!is_utf8_char(p))
1175         return FALSE;
1176     if (!PL_utf8_idcont)
1177         PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
1178     return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
1179 }
1180
1181 bool
1182 Perl_is_utf8_alpha(pTHX_ U8 *p)
1183 {
1184     if (!is_utf8_char(p))
1185         return FALSE;
1186     if (!PL_utf8_alpha)
1187         PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1188     return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
1189 }
1190
1191 bool
1192 Perl_is_utf8_ascii(pTHX_ U8 *p)
1193 {
1194     if (!is_utf8_char(p))
1195         return FALSE;
1196     if (!PL_utf8_ascii)
1197         PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1198     return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
1199 }
1200
1201 bool
1202 Perl_is_utf8_space(pTHX_ U8 *p)
1203 {
1204     if (!is_utf8_char(p))
1205         return FALSE;
1206     if (!PL_utf8_space)
1207         PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1208     return swash_fetch(PL_utf8_space, p, TRUE) != 0;
1209 }
1210
1211 bool
1212 Perl_is_utf8_digit(pTHX_ U8 *p)
1213 {
1214     if (!is_utf8_char(p))
1215         return FALSE;
1216     if (!PL_utf8_digit)
1217         PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1218     return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
1219 }
1220
1221 bool
1222 Perl_is_utf8_upper(pTHX_ U8 *p)
1223 {
1224     if (!is_utf8_char(p))
1225         return FALSE;
1226     if (!PL_utf8_upper)
1227         PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1228     return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
1229 }
1230
1231 bool
1232 Perl_is_utf8_lower(pTHX_ U8 *p)
1233 {
1234     if (!is_utf8_char(p))
1235         return FALSE;
1236     if (!PL_utf8_lower)
1237         PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1238     return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
1239 }
1240
1241 bool
1242 Perl_is_utf8_cntrl(pTHX_ U8 *p)
1243 {
1244     if (!is_utf8_char(p))
1245         return FALSE;
1246     if (!PL_utf8_cntrl)
1247         PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1248     return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
1249 }
1250
1251 bool
1252 Perl_is_utf8_graph(pTHX_ U8 *p)
1253 {
1254     if (!is_utf8_char(p))
1255         return FALSE;
1256     if (!PL_utf8_graph)
1257         PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1258     return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
1259 }
1260
1261 bool
1262 Perl_is_utf8_print(pTHX_ U8 *p)
1263 {
1264     if (!is_utf8_char(p))
1265         return FALSE;
1266     if (!PL_utf8_print)
1267         PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1268     return swash_fetch(PL_utf8_print, p, TRUE) != 0;
1269 }
1270
1271 bool
1272 Perl_is_utf8_punct(pTHX_ U8 *p)
1273 {
1274     if (!is_utf8_char(p))
1275         return FALSE;
1276     if (!PL_utf8_punct)
1277         PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1278     return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
1279 }
1280
1281 bool
1282 Perl_is_utf8_xdigit(pTHX_ U8 *p)
1283 {
1284     if (!is_utf8_char(p))
1285         return FALSE;
1286     if (!PL_utf8_xdigit)
1287         PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1288     return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
1289 }
1290
1291 bool
1292 Perl_is_utf8_mark(pTHX_ U8 *p)
1293 {
1294     if (!is_utf8_char(p))
1295         return FALSE;
1296     if (!PL_utf8_mark)
1297         PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1298     return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
1299 }
1300
1301 /*
1302 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1303
1304 The "p" contains the pointer to the UTF-8 string encoding
1305 the character that is being converted.
1306
1307 The "ustrp" is a pointer to the character buffer to put the
1308 conversion result to.  The "lenp" is a pointer to the length
1309 of the result.
1310
1311 The "swashp" is a pointer to the swash to use.
1312
1313 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1314 and loaded by SWASHGET, using lib/utf8_heavy.pl.  The special (usually,
1315 but not always, a multicharacter mapping), is tried first.
1316
1317 The "special" is a string like "utf8::ToSpecLower", which means the
1318 hash %utf8::ToSpecLower.  The access to the hash is through
1319 Perl_to_utf8_case().
1320
1321 The "normal" is a string like "ToLower" which means the swash
1322 %utf8::ToLower.
1323
1324 =cut */
1325
1326 UV
1327 Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
1328 {
1329     UV uv0, uv1;
1330     U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
1331     STRLEN len = 0;
1332
1333     uv0 = utf8_to_uvchr(p, 0);
1334     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1335      * are necessary in EBCDIC, they are redundant no-ops
1336      * in ASCII-ish platforms, and hopefully optimized away. */
1337     uv1 = NATIVE_TO_UNI(uv0);
1338     uvuni_to_utf8(tmpbuf, uv1);
1339
1340     if (!*swashp) /* load on-demand */
1341          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1342
1343     if (special) {
1344          /* It might be "special" (sometimes, but not always,
1345           * a multicharacter mapping) */
1346          HV *hv;
1347          SV *keysv;
1348          HE *he;
1349          SV *val;
1350
1351 #if defined(UNDER_CE) && defined(MIPS)
1352 /*strange: compiler complaints that I redefine macro UVXf and points where
1353            it was first defined. I copied line from there without any changes.
1354            Nothing should change.
1355            But when I do not do this, there is an error on a line with
1356               Perl_newSVpvf(aTHX_ "%04"UVXf, uv1)
1357 */
1358 #define UVXf            "lX"            /**/
1359 #endif
1360          if ((hv    = get_hv(special, FALSE)) &&
1361              (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv1))) &&
1362              (he    = hv_fetch_ent(hv, keysv, FALSE, 0)) &&
1363              (val   = HeVAL(he))) {
1364              char *s;
1365
1366               s = SvPV(val, len);
1367               if (len == 1)
1368                    len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1369               else {
1370 #ifdef EBCDIC
1371                    /* If we have EBCDIC we need to remap the characters
1372                     * since any characters in the low 256 are Unicode
1373                     * code points, not EBCDIC. */
1374                    U8 *t = (U8*)s, *tend = t + len, *d;
1375
1376                    d = tmpbuf;
1377                    if (SvUTF8(val)) {
1378                         STRLEN tlen = 0;
1379
1380                         while (t < tend) {
1381                              UV c = utf8_to_uvchr(t, &tlen);
1382                              if (tlen > 0) {
1383                                   d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1384                                   t += tlen;
1385                              }
1386                              else
1387                                   break;
1388                         }
1389                    }
1390                    else {
1391                         while (t < tend) {
1392                              d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1393                              t++;
1394                         }
1395                    }
1396                    len = d - tmpbuf;
1397                    Copy(tmpbuf, ustrp, len, U8);
1398 #else
1399                    Copy(s, ustrp, len, U8);
1400 #endif
1401               }
1402          }
1403     }
1404
1405     if (!len && *swashp) {
1406          UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1407
1408          if (uv2) {
1409               /* It was "normal" (a single character mapping). */
1410               UV uv3 = UNI_TO_NATIVE(uv2);
1411
1412               len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1413          }
1414     }
1415
1416     if (!len) /* Neither: just copy. */
1417          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1418
1419     if (lenp)
1420          *lenp = len;
1421
1422     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1423 }
1424
1425 /*
1426 =for apidoc A|UV|to_utf8_upper|U8 *p|U8 *ustrp|STRLEN *lenp
1427
1428 Convert the UTF-8 encoded character at p to its uppercase version and
1429 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1430 that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1431 uppercase version may be longer than the original character (up to two
1432 characters).
1433
1434 The first character of the uppercased version is returned
1435 (but note, as explained above, that there may be more.)
1436
1437 =cut */
1438
1439 UV
1440 Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1441 {
1442     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1443                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1444 }
1445
1446 /*
1447 =for apidoc A|UV|to_utf8_title|U8 *p|U8 *ustrp|STRLEN *lenp
1448
1449 Convert the UTF-8 encoded character at p to its titlecase version and
1450 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1451 that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1452 titlecase version may be longer than the original character (up to two
1453 characters).
1454
1455 The first character of the titlecased version is returned
1456 (but note, as explained above, that there may be more.)
1457
1458 =cut */
1459
1460 UV
1461 Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1462 {
1463     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1464                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1465 }
1466
1467 /*
1468 =for apidoc A|UV|to_utf8_lower|U8 *p|U8 *ustrp|STRLEN *lenp
1469
1470 Convert the UTF-8 encoded character at p to its lowercase version and
1471 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1472 that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1473 lowercase version may be longer than the original character (up to two
1474 characters).
1475
1476 The first character of the lowercased version is returned
1477 (but note, as explained above, that there may be more.)
1478
1479 =cut */
1480
1481 UV
1482 Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1483 {
1484     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1485                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1486 }
1487
1488 /*
1489 =for apidoc A|UV|to_utf8_fold|U8 *p|U8 *ustrp|STRLEN *lenp
1490
1491 Convert the UTF-8 encoded character at p to its foldcase version and
1492 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1493 that the ustrp needs to be at least UTF8_MAXLEN_FOLD+1 bytes since the
1494 foldcase version may be longer than the original character (up to
1495 three characters).
1496
1497 The first character of the foldcased version is returned
1498 (but note, as explained above, that there may be more.)
1499
1500 =cut */
1501
1502 UV
1503 Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1504 {
1505     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1506                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1507 }
1508
1509 /* a "swash" is a swatch hash */
1510
1511 SV*
1512 Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1513 {
1514     SV* retval;
1515     SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1516     dSP;
1517     HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
1518     SV* errsv_save;
1519
1520     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1521         ENTER;
1522         errsv_save = newSVsv(ERRSV);
1523         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1524         if (!SvTRUE(ERRSV))
1525             sv_setsv(ERRSV, errsv_save);
1526         SvREFCNT_dec(errsv_save);
1527         LEAVE;
1528     }
1529     SPAGAIN;
1530     PUSHSTACKi(PERLSI_MAGIC);
1531     PUSHMARK(SP);
1532     EXTEND(SP,5);
1533     PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1534     PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1535     PUSHs(listsv);
1536     PUSHs(sv_2mortal(newSViv(minbits)));
1537     PUSHs(sv_2mortal(newSViv(none)));
1538     PUTBACK;
1539     ENTER;
1540     SAVEI32(PL_hints);
1541     PL_hints = 0;
1542     save_re_context();
1543     if (PL_curcop == &PL_compiling) {
1544         /* XXX ought to be handled by lex_start */
1545         SAVEI32(PL_in_my);
1546         sv_setpv(tokenbufsv, PL_tokenbuf);
1547     }
1548     errsv_save = newSVsv(ERRSV);
1549     if (call_method("SWASHNEW", G_SCALAR))
1550         retval = newSVsv(*PL_stack_sp--);
1551     else
1552         retval = &PL_sv_undef;
1553     if (!SvTRUE(ERRSV))
1554         sv_setsv(ERRSV, errsv_save);
1555     SvREFCNT_dec(errsv_save);
1556     LEAVE;
1557     POPSTACK;
1558     if (PL_curcop == &PL_compiling) {
1559         STRLEN len;
1560         char* pv = SvPV(tokenbufsv, len);
1561
1562         Copy(pv, PL_tokenbuf, len+1, char);
1563         PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1564     }
1565     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1566         if (SvPOK(retval))
1567             Perl_croak(aTHX_ "Can't find Unicode property definition \"%s\"",
1568                        SvPV_nolen(retval));
1569         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1570     }
1571     return retval;
1572 }
1573
1574
1575 /* This API is wrong for special case conversions since we may need to
1576  * return several Unicode characters for a single Unicode character
1577  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1578  * the lower-level routine, and it is similarly broken for returning
1579  * multiple values.  --jhi */
1580 UV
1581 Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
1582 {
1583     HV* hv = (HV*)SvRV(sv);
1584     U32 klen;
1585     U32 off;
1586     STRLEN slen;
1587     STRLEN needents;
1588     U8 *tmps = NULL;
1589     U32 bit;
1590     SV *retval;
1591     U8 tmputf8[2];
1592     UV c = NATIVE_TO_ASCII(*ptr);
1593
1594     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1595         tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1596         tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1597         ptr = tmputf8;
1598     }
1599     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1600      * then the "swatch" is a vec() for al the chars which start
1601      * with 0xAA..0xYY
1602      * So the key in the hash (klen) is length of encoded char -1
1603      */
1604     klen = UTF8SKIP(ptr) - 1;
1605     off  = ptr[klen];
1606
1607     if (klen == 0)
1608      {
1609       /* If char in invariant then swatch is for all the invariant chars
1610        * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1611        */
1612       needents = UTF_CONTINUATION_MARK;
1613       off      = NATIVE_TO_UTF(ptr[klen]);
1614      }
1615     else
1616      {
1617       /* If char is encoded then swatch is for the prefix */
1618       needents = (1 << UTF_ACCUMULATION_SHIFT);
1619       off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1620      }
1621
1622     /*
1623      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1624      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1625      * it's nothing to sniff at.)  Pity we usually come through at least
1626      * two function calls to get here...
1627      *
1628      * NB: this code assumes that swatches are never modified, once generated!
1629      */
1630
1631     if (hv   == PL_last_swash_hv &&
1632         klen == PL_last_swash_klen &&
1633         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1634     {
1635         tmps = PL_last_swash_tmps;
1636         slen = PL_last_swash_slen;
1637     }
1638     else {
1639         /* Try our second-level swatch cache, kept in a hash. */
1640         SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1641
1642         /* If not cached, generate it via utf8::SWASHGET */
1643         if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1644             dSP;
1645             /* We use utf8n_to_uvuni() as we want an index into
1646                Unicode tables, not a native character number.
1647              */
1648             UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, 0,
1649                                            ckWARN(WARN_UTF8) ?
1650                                            0 : UTF8_ALLOW_ANY);
1651             SV *errsv_save;
1652             ENTER;
1653             SAVETMPS;
1654             save_re_context();
1655             PUSHSTACKi(PERLSI_MAGIC);
1656             PUSHMARK(SP);
1657             EXTEND(SP,3);
1658             PUSHs((SV*)sv);
1659             /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1660             PUSHs(sv_2mortal(newSViv((klen) ?
1661                                      (code_point & ~(needents - 1)) : 0)));
1662             PUSHs(sv_2mortal(newSViv(needents)));
1663             PUTBACK;
1664             errsv_save = newSVsv(ERRSV);
1665             if (call_method("SWASHGET", G_SCALAR))
1666                 retval = newSVsv(*PL_stack_sp--);
1667             else
1668                 retval = &PL_sv_undef;
1669             if (!SvTRUE(ERRSV))
1670                 sv_setsv(ERRSV, errsv_save);
1671             SvREFCNT_dec(errsv_save);
1672             POPSTACK;
1673             FREETMPS;
1674             LEAVE;
1675             if (PL_curcop == &PL_compiling)
1676                 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1677
1678             svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1679
1680             if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1681                 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1682         }
1683
1684         PL_last_swash_hv = hv;
1685         PL_last_swash_klen = klen;
1686         PL_last_swash_tmps = tmps;
1687         PL_last_swash_slen = slen;
1688         if (klen)
1689             Copy(ptr, PL_last_swash_key, klen, U8);
1690     }
1691
1692     switch ((int)((slen << 3) / needents)) {
1693     case 1:
1694         bit = 1 << (off & 7);
1695         off >>= 3;
1696         return (tmps[off] & bit) != 0;
1697     case 8:
1698         return tmps[off];
1699     case 16:
1700         off <<= 1;
1701         return (tmps[off] << 8) + tmps[off + 1] ;
1702     case 32:
1703         off <<= 2;
1704         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1705     }
1706     Perl_croak(aTHX_ "panic: swash_fetch");
1707     return 0;
1708 }
1709
1710
1711 /*
1712 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1713
1714 Adds the UTF8 representation of the Native codepoint C<uv> to the end
1715 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1716 bytes available. The return value is the pointer to the byte after the
1717 end of the new character. In other words,
1718
1719     d = uvchr_to_utf8(d, uv);
1720
1721 is the recommended wide native character-aware way of saying
1722
1723     *(d++) = uv;
1724
1725 =cut
1726 */
1727
1728 /* On ASCII machines this is normally a macro but we want a
1729    real function in case XS code wants it
1730 */
1731 #undef Perl_uvchr_to_utf8
1732 U8 *
1733 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1734 {
1735     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
1736 }
1737
1738 U8 *
1739 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1740 {
1741     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1742 }
1743
1744 /*
1745 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1746
1747 Returns the native character value of the first character in the string C<s>
1748 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1749 length, in bytes, of that character.
1750
1751 Allows length and flags to be passed to low level routine.
1752
1753 =cut
1754 */
1755 /* On ASCII machines this is normally a macro but we want
1756    a real function in case XS code wants it
1757 */
1758 #undef Perl_utf8n_to_uvchr
1759 UV
1760 Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1761 {
1762     UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1763     return UNI_TO_NATIVE(uv);
1764 }
1765
1766 /*
1767 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1768
1769 Build to the scalar dsv a displayable version of the string spv,
1770 length len, the displayable version being at most pvlim bytes long
1771 (if longer, the rest is truncated and "..." will be appended).
1772
1773 The flags argument can have UNI_DISPLAY_ISPRINT set to display
1774 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
1775 to display the \\[nrfta\\] as the backslashed versions (like '\n')
1776 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1777 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1778 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1779
1780 The pointer to the PV of the dsv is returned.
1781
1782 =cut */
1783 char *
1784 Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1785 {
1786     int truncated = 0;
1787     char *s, *e;
1788
1789     sv_setpvn(dsv, "", 0);
1790     for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1791          UV u;
1792          bool ok = FALSE;
1793
1794          if (pvlim && SvCUR(dsv) >= pvlim) {
1795               truncated++;
1796               break;
1797          }
1798          u = utf8_to_uvchr((U8*)s, 0);
1799          if (u < 256) {
1800              if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1801                  switch (u & 0xFF) {
1802                  case '\n':
1803                      Perl_sv_catpvf(aTHX_ dsv, "\\n"); ok = TRUE; break;
1804                  case '\r':
1805                      Perl_sv_catpvf(aTHX_ dsv, "\\r"); ok = TRUE; break;
1806                  case '\t':
1807                      Perl_sv_catpvf(aTHX_ dsv, "\\t"); ok = TRUE; break;
1808                  case '\f':
1809                      Perl_sv_catpvf(aTHX_ dsv, "\\f"); ok = TRUE; break;
1810                  case '\a':
1811                      Perl_sv_catpvf(aTHX_ dsv, "\\a"); ok = TRUE; break;
1812                  case '\\':
1813                      Perl_sv_catpvf(aTHX_ dsv, "\\\\" ); ok = TRUE; break;
1814                  default: break;
1815                  }
1816              }
1817              /* isPRINT() is the locale-blind version. */
1818              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(u & 0xFF)) {
1819                  Perl_sv_catpvf(aTHX_ dsv, "%c", (char)(u & 0xFF));
1820                  ok = TRUE;
1821              }
1822          }
1823          if (!ok)
1824              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1825     }
1826     if (truncated)
1827          sv_catpvn(dsv, "...", 3);
1828
1829     return SvPVX(dsv);
1830 }
1831
1832 /*
1833 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1834
1835 Build to the scalar dsv a displayable version of the scalar sv,
1836 the displayable version being at most pvlim bytes long
1837 (if longer, the rest is truncated and "..." will be appended).
1838
1839 The flags argument is as in pv_uni_display().
1840
1841 The pointer to the PV of the dsv is returned.
1842
1843 =cut */
1844 char *
1845 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1846 {
1847      return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1848                                 pvlim, flags);
1849 }
1850
1851 /*
1852 =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
1853
1854 Return true if the strings s1 and s2 differ case-insensitively, false
1855 if not (if they are equal case-insensitively).  If u1 is true, the
1856 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
1857 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
1858 are false, the respective string is assumed to be in native 8-bit
1859 encoding.
1860
1861 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1862 in there (they will point at the beginning of the I<next> character).
1863 If the pointers behind pe1 or pe2 are non-NULL, they are the end
1864 pointers beyond which scanning will not continue under any
1865 circustances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
1866 s2+l2 will be used as goal end pointers that will also stop the scan,
1867 and which qualify towards defining a successful match: all the scans
1868 that define an explicit length must reach their goal pointers for
1869 a match to succeed).
1870
1871 For case-insensitiveness, the "casefolding" of Unicode is used
1872 instead of upper/lowercasing both the characters, see
1873 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1874
1875 =cut */
1876 I32
1877 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
1878 {
1879      register U8 *p1  = (U8*)s1;
1880      register U8 *p2  = (U8*)s2;
1881      register U8 *e1 = 0, *f1 = 0, *q1 = 0;
1882      register U8 *e2 = 0, *f2 = 0, *q2 = 0;
1883      STRLEN n1 = 0, n2 = 0;
1884      U8 foldbuf1[UTF8_MAXLEN_FOLD+1];
1885      U8 foldbuf2[UTF8_MAXLEN_FOLD+1];
1886      U8 natbuf[1+1];
1887      STRLEN foldlen1, foldlen2;
1888      bool match;
1889
1890      if (pe1)
1891           e1 = *(U8**)pe1;
1892      if (e1 == 0 || (l1 && l1 < (UV)(e1 - (U8*)s1)))
1893           f1 = (U8*)s1 + l1;
1894      if (pe2)
1895           e2 = *(U8**)pe2;
1896      if (e2 == 0 || (l2 && l2 < (UV)(e2 - (U8*)s2)))
1897           f2 = (U8*)s2 + l2;
1898
1899      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1900           return 1; /* mismatch; possible infinite loop or false positive */
1901
1902      if (!u1 || !u2)
1903           natbuf[1] = 0; /* Need to terminate the buffer. */
1904
1905      while ((e1 == 0 || p1 < e1) &&
1906             (f1 == 0 || p1 < f1) &&
1907             (e2 == 0 || p2 < e2) &&
1908             (f2 == 0 || p2 < f2)) {
1909           if (n1 == 0) {
1910                if (u1)
1911                     to_utf8_fold(p1, foldbuf1, &foldlen1);
1912                else {
1913                     natbuf[0] = *p1;
1914                     to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1915                }
1916                q1 = foldbuf1;
1917                n1 = foldlen1;
1918           }
1919           if (n2 == 0) {
1920                if (u2)
1921                     to_utf8_fold(p2, foldbuf2, &foldlen2);
1922                else {
1923                     natbuf[0] = *p2;
1924                     to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1925                }
1926                q2 = foldbuf2;
1927                n2 = foldlen2;
1928           }
1929           while (n1 && n2) {
1930                if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
1931                    (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
1932                     memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
1933                    return 1; /* mismatch */
1934                n1 -= UTF8SKIP(q1);
1935                q1 += UTF8SKIP(q1);
1936                n2 -= UTF8SKIP(q2);
1937                q2 += UTF8SKIP(q2);
1938           }
1939           if (n1 == 0)
1940                p1 += u1 ? UTF8SKIP(p1) : 1;
1941           if (n2 == 0)
1942                p2 += u2 ? UTF8SKIP(p2) : 1;
1943
1944      }
1945
1946      /* A match is defined by all the scans that specified
1947       * an explicit length reaching their final goals. */
1948      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
1949
1950      if (match) {
1951           if (pe1)
1952                *pe1 = (char*)p1;
1953           if (pe2)
1954                *pe2 = (char*)p2;
1955      }
1956
1957      return match ? 0 : 1; /* 0 match, 1 mismatch */
1958 }
1959