utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, by Larry Wall and others
   4  *
   5  *    You may distribute under the terms of either the GNU General Public
   6  *    License or the Artistic License, as specified in the README file.
   7  *
   8  */
   9
  10 /*
  11  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  12  * heard of that we don't want to see any closer; and that's the one place
  13  * we're trying to get to!  And that's just where we can't get, nohow.'
  14  *
  15  * 'Well do I understand your speech,' he answered in the same language;
  16  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  17  * as is the custom in the West, if you wish to be answered?'
  18  *
  19  * ...the travellers perceived that the floor was paved with stones of many
  20  * hues; branching runes and strange devices intertwined beneath their feet.
  21  */
  22
  23 #include "EXTERN.h"
  24 #define PERL_IN_UTF8_C
  25 #include "perl.h"
  26
  27 static char unees[] = "Malformed UTF-8 character (unexpected end of string)";
  28
  29 /*
  30 =head1 Unicode Support
  31
  32 This file contains various utility functions for manipulating UTF8-encoded
  33 strings. For the uninitiated, this is a method of representing arbitrary
  34 Unicode characters as a variable number of bytes, in such a way that
  35 characters in the ASCII range are unmodified, and a zero byte never appears
  36 within non-zero characters.
  37
  38 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
  39
  40 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
  41 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
  42 bytes available. The return value is the pointer to the byte after the
  43 end of the new character. In other words,
  44
  45     d = uvuni_to_utf8_flags(d, uv, flags);
  46
  47 or, in most cases,
  48
  49     d = uvuni_to_utf8(d, uv);
  50
  51 (which is equivalent to)
  52
  53     d = uvuni_to_utf8_flags(d, uv, 0);
  54
  55 is the recommended Unicode-aware way of saying
  56
  57     *(d++) = uv;
  58
  59 =cut
  60 */
  61
  62 U8 *
  63 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  64 {
  65     if (ckWARN(WARN_UTF8)) {
  66          if (UNICODE_IS_SURROGATE(uv) &&
  67              !(flags & UNICODE_ALLOW_SURROGATE))
  68               Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
  69          else if (
  70                   ((uv >= 0xFDD0 && uv <= 0xFDEF &&
  71                     !(flags & UNICODE_ALLOW_FDD0))
  72                    ||
  73                    ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
  74                     !(flags & UNICODE_ALLOW_FFFF))) &&
  75                   /* UNICODE_ALLOW_SUPER includes
  76                    * FFFEs and FFFFs beyond 0x10FFFF. */
  77                   ((uv <= PERL_UNICODE_MAX) ||
  78                    !(flags & UNICODE_ALLOW_SUPER))
  79                   )
  80               Perl_warner(aTHX_ packWARN(WARN_UTF8),
  81                          "Unicode character 0x%04"UVxf" is illegal", uv);
  82     }
  83     if (UNI_IS_INVARIANT(uv)) {
  84         *d++ = (U8)UTF_TO_NATIVE(uv);
  85         return d;
  86     }
  87 #if defined(EBCDIC)
  88     else {
  89         STRLEN len  = UNISKIP(uv);
  90         U8 *p = d+len-1;
  91         while (p > d) {
  92             *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
  93             uv >>= UTF_ACCUMULATION_SHIFT;
  94         }
  95         *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
  96         return d+len;
  97     }
  98 #else /* Non loop style */
  99     if (uv < 0x800) {
 100         *d++ = (U8)(( uv >>  6)         | 0xc0);
 101         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 102         return d;
 103     }
 104     if (uv < 0x10000) {
 105         *d++ = (U8)(( uv >> 12)         | 0xe0);
 106         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 107         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 108         return d;
 109     }
 110     if (uv < 0x200000) {
 111         *d++ = (U8)(( uv >> 18)         | 0xf0);
 112         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 113         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 114         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 115         return d;
 116     }
 117     if (uv < 0x4000000) {
 118         *d++ = (U8)(( uv >> 24)         | 0xf8);
 119         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 120         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 121         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 122         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 123         return d;
 124     }
 125     if (uv < 0x80000000) {
 126         *d++ = (U8)(( uv >> 30)         | 0xfc);
 127         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 128         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 129         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 130         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 131         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 132         return d;
 133     }
 134 #ifdef HAS_QUAD
 135     if (uv < UTF8_QUAD_MAX)
 136 #endif
 137     {
 138         *d++ =                            0xfe; /* Can't match U+FEFF! */
 139         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 140         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 141         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 142         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 143         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 144         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 145         return d;
 146     }
 147 #ifdef HAS_QUAD
 148     {
 149         *d++ =                            0xff;         /* Can't match U+FFFE! */
 150         *d++ =                            0x80;         /* 6 Reserved bits */
 151         *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);        /* 2 Reserved bits */
 152         *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
 153         *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
 154         *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
 155         *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
 156         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 157         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 158         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 159         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 160         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 161         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 162         return d;
 163     }
 164 #endif
 165 #endif /* Loop style */
 166 }
 167
 168 U8 *
 169 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
 170 {
 171     return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
 172 }
 173
 174
 175 /*
 176 =for apidoc A|STRLEN|is_utf8_char|U8 *s
 177
 178 Tests if some arbitrary number of bytes begins in a valid UTF-8
 179 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 180 UTF-8 character.  The actual number of bytes in the UTF-8 character
 181 will be returned if it is valid, otherwise 0.
 182
 183 =cut */
 184 STRLEN
 185 Perl_is_utf8_char(pTHX_ U8 *s)
 186 {
 187     U8 u = *s;
 188     STRLEN slen, len;
 189     UV uv, ouv;
 190
 191     if (UTF8_IS_INVARIANT(u))
 192         return 1;
 193
 194     if (!UTF8_IS_START(u))
 195         return 0;
 196
 197     len = UTF8SKIP(s);
 198
 199     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 200         return 0;
 201
 202     slen = len - 1;
 203     s++;
 204     u &= UTF_START_MASK(len);
 205     uv  = u;
 206     ouv = uv;
 207     while (slen--) {
 208         if (!UTF8_IS_CONTINUATION(*s))
 209             return 0;
 210         uv = UTF8_ACCUMULATE(uv, *s);
 211         if (uv < ouv)
 212             return 0;
 213         ouv = uv;
 214         s++;
 215     }
 216
 217     if ((STRLEN)UNISKIP(uv) < len)
 218         return 0;
 219
 220     return len;
 221 }
 222
 223 /*
 224 =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
 225
 226 Returns true if first C<len> bytes of the given string form a valid
 227 UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
 228 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
 229 because a valid ASCII string is a valid UTF-8 string.
 230
 231 =cut
 232 */
 233
 234 bool
 235 Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
 236 {
 237     U8* x = s;
 238     U8* send;
 239     STRLEN c;
 240
 241     if (!len && s)
 242         len = strlen((char *)s);
 243     send = s + len;
 244
 245     while (x < send) {
 246          /* Inline the easy bits of is_utf8_char() here for speed... */
 247          if (UTF8_IS_INVARIANT(*x))
 248               c = 1;
 249          else if (!UTF8_IS_START(*x))
 250               return FALSE;
 251          else {
 252               /* ... and call is_utf8_char() only if really needed. */
 253               c = is_utf8_char(x);
 254               if (!c)
 255                    return FALSE;
 256          }
 257         x += c;
 258     }
 259     if (x != send)
 260         return FALSE;
 261
 262     return TRUE;
 263 }
 264
 265 /*
 266 =for apidoc A|bool|is_utf8_string_loc|U8 *s|STRLEN len|U8 **p
 267
 268 Like is_ut8_string but store the location of the failure in
 269 the last argument.
 270
 271 =cut
 272 */
 273
 274 bool
 275 Perl_is_utf8_string_loc(pTHX_ U8 *s, STRLEN len, U8 **p)
 276 {
 277     U8* x = s;
 278     U8* send;
 279     STRLEN c;
 280
 281     if (!len && s)
 282         len = strlen((char *)s);
 283     send = s + len;
 284
 285     while (x < send) {
 286          /* Inline the easy bits of is_utf8_char() here for speed... */
 287          if (UTF8_IS_INVARIANT(*x))
 288               c = 1;
 289          else if (!UTF8_IS_START(*x)) {
 290               if (p)
 291                   *p = x;
 292               return FALSE;
 293          }
 294          else {
 295               /* ... and call is_utf8_char() only if really needed. */
 296               c = is_utf8_char(x);
 297               if (!c) {
 298                    if (p)
 299                       *p = x;
 300                    return FALSE;
 301               }
 302          }
 303         x += c;
 304     }
 305     if (x != send) {
 306        if (p)
 307            *p = x;
 308         return FALSE;
 309     }
 310
 311     return TRUE;
 312 }
 313
 314 /*
 315 =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 316
 317 Bottom level UTF-8 decode routine.
 318 Returns the unicode code point value of the first character in the string C<s>
 319 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
 320 C<retlen> will be set to the length, in bytes, of that character.
 321
 322 If C<s> does not point to a well-formed UTF-8 character, the behaviour
 323 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 324 it is assumed that the caller will raise a warning, and this function
 325 will silently just set C<retlen> to C<-1> and return zero.  If the
 326 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 327 malformations will be given, C<retlen> will be set to the expected
 328 length of the UTF-8 character in bytes, and zero will be returned.
 329
 330 The C<flags> can also contain various flags to allow deviations from
 331 the strict UTF-8 encoding (see F<utf8.h>).
 332
 333 Most code should use utf8_to_uvchr() rather than call this directly.
 334
 335 =cut
 336 */
 337
 338 UV
 339 Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 340 {
 341     U8 *s0 = s;
 342     UV uv = *s, ouv = 0;
 343     STRLEN len = 1;
 344     bool dowarn = ckWARN_d(WARN_UTF8);
 345     UV startbyte = *s;
 346     STRLEN expectlen = 0;
 347     U32 warning = 0;
 348
 349 /* This list is a superset of the UTF8_ALLOW_XXX. */
 350
 351 #define UTF8_WARN_EMPTY                          1
 352 #define UTF8_WARN_CONTINUATION                   2
 353 #define UTF8_WARN_NON_CONTINUATION               3
 354 #define UTF8_WARN_FE_FF                          4
 355 #define UTF8_WARN_SHORT                          5
 356 #define UTF8_WARN_OVERFLOW                       6
 357 #define UTF8_WARN_SURROGATE                      7
 358 #define UTF8_WARN_LONG                           8
 359 #define UTF8_WARN_FFFF                           9 /* Also FFFE. */
 360
 361     if (curlen == 0 &&
 362         !(flags & UTF8_ALLOW_EMPTY)) {
 363         warning = UTF8_WARN_EMPTY;
 364         goto malformed;
 365     }
 366
 367     if (UTF8_IS_INVARIANT(uv)) {
 368         if (retlen)
 369             *retlen = 1;
 370         return (UV) (NATIVE_TO_UTF(*s));
 371     }
 372
 373     if (UTF8_IS_CONTINUATION(uv) &&
 374         !(flags & UTF8_ALLOW_CONTINUATION)) {
 375         warning = UTF8_WARN_CONTINUATION;
 376         goto malformed;
 377     }
 378
 379     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 380         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 381         warning = UTF8_WARN_NON_CONTINUATION;
 382         goto malformed;
 383     }
 384
 385 #ifdef EBCDIC
 386     uv = NATIVE_TO_UTF(uv);
 387 #else
 388     if ((uv == 0xfe || uv == 0xff) &&
 389         !(flags & UTF8_ALLOW_FE_FF)) {
 390         warning = UTF8_WARN_FE_FF;
 391         goto malformed;
 392     }
 393 #endif
 394
 395     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 396     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 397     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 398     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 399 #ifdef EBCDIC
 400     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 401     else                        { len =  7; uv &= 0x01; }
 402 #else
 403     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 404     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 405     else                        { len = 13; uv = 0; } /* whoa! */
 406 #endif
 407
 408     if (retlen)
 409         *retlen = len;
 410
 411     expectlen = len;
 412
 413     if ((curlen < expectlen) &&
 414         !(flags & UTF8_ALLOW_SHORT)) {
 415         warning = UTF8_WARN_SHORT;
 416         goto malformed;
 417     }
 418
 419     len--;
 420     s++;
 421     ouv = uv;
 422
 423     while (len--) {
 424         if (!UTF8_IS_CONTINUATION(*s) &&
 425             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 426             s--;
 427             warning = UTF8_WARN_NON_CONTINUATION;
 428             goto malformed;
 429         }
 430         else
 431             uv = UTF8_ACCUMULATE(uv, *s);
 432         if (!(uv > ouv)) {
 433             /* These cannot be allowed. */
 434             if (uv == ouv) {
 435                 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
 436                     warning = UTF8_WARN_LONG;
 437                     goto malformed;
 438                 }
 439             }
 440             else { /* uv < ouv */
 441                 /* This cannot be allowed. */
 442                 warning = UTF8_WARN_OVERFLOW;
 443                 goto malformed;
 444             }
 445         }
 446         s++;
 447         ouv = uv;
 448     }
 449
 450     if (UNICODE_IS_SURROGATE(uv) &&
 451         !(flags & UTF8_ALLOW_SURROGATE)) {
 452         warning = UTF8_WARN_SURROGATE;
 453         goto malformed;
 454     } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
 455                !(flags & UTF8_ALLOW_LONG)) {
 456         warning = UTF8_WARN_LONG;
 457         goto malformed;
 458     } else if (UNICODE_IS_ILLEGAL(uv) &&
 459                !(flags & UTF8_ALLOW_FFFF)) {
 460         warning = UTF8_WARN_FFFF;
 461         goto malformed;
 462     }
 463
 464     return uv;
 465
 466 malformed:
 467
 468     if (flags & UTF8_CHECK_ONLY) {
 469         if (retlen)
 470             *retlen = -1;
 471         return 0;
 472     }
 473
 474     if (dowarn) {
 475         SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
 476
 477         switch (warning) {
 478         case 0: /* Intentionally empty. */ break;
 479         case UTF8_WARN_EMPTY:
 480             Perl_sv_catpvf(aTHX_ sv, "(empty string)");
 481             break;
 482         case UTF8_WARN_CONTINUATION:
 483             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 484             break;
 485         case UTF8_WARN_NON_CONTINUATION:
 486             if (s == s0)
 487                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 488                            (UV)s[1], startbyte);
 489             else
 490                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 491                            (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
 492
 493             break;
 494         case UTF8_WARN_FE_FF:
 495             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 496             break;
 497         case UTF8_WARN_SHORT:
 498             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 499                            curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
 500             expectlen = curlen;         /* distance for caller to skip */
 501             break;
 502         case UTF8_WARN_OVERFLOW:
 503             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 504                            ouv, *s, startbyte);
 505             break;
 506         case UTF8_WARN_SURROGATE:
 507             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 508             break;
 509         case UTF8_WARN_LONG:
 510             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 511                            expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 512             break;
 513         case UTF8_WARN_FFFF:
 514             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 515             break;
 516         default:
 517             Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
 518             break;
 519         }
 520
 521         if (warning) {
 522             char *s = SvPVX(sv);
 523
 524             if (PL_op)
 525                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 526                             "%s in %s", s,  OP_DESC(PL_op));
 527             else
 528                 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
 529         }
 530     }
 531
 532     if (retlen)
 533         *retlen = expectlen ? expectlen : len;
 534
 535     return 0;
 536 }
 537
 538 /*
 539 =for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
 540
 541 Returns the native character value of the first character in the string C<s>
 542 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 543 length, in bytes, of that character.
 544
 545 If C<s> does not point to a well-formed UTF-8 character, zero is
 546 returned and retlen is set, if possible, to -1.
 547
 548 =cut
 549 */
 550
 551 UV
 552 Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
 553 {
 554     return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen,
 555                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 556 }
 557
 558 /*
 559 =for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
 560
 561 Returns the Unicode code point of the first character in the string C<s>
 562 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 563 length, in bytes, of that character.
 564
 565 This function should only be used when returned UV is considered
 566 an index into the Unicode semantic tables (e.g. swashes).
 567
 568 If C<s> does not point to a well-formed UTF-8 character, zero is
 569 returned and retlen is set, if possible, to -1.
 570
 571 =cut
 572 */
 573
 574 UV
 575 Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
 576 {
 577     /* Call the low level routine asking for checks */
 578     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen,
 579                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 580 }
 581
 582 /*
 583 =for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
 584
 585 Return the length of the UTF-8 char encoded string C<s> in characters.
 586 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 587 up past C<e>, croaks.
 588
 589 =cut
 590 */
 591
 592 STRLEN
 593 Perl_utf8_length(pTHX_ U8 *s, U8 *e)
 594 {
 595     STRLEN len = 0;
 596
 597     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 598      * the bitops (especially ~) can create illegal UTF-8.
 599      * In other words: in Perl UTF-8 is not just for Unicode. */
 600
 601     if (e < s) {
 602         if (ckWARN_d(WARN_UTF8)) {
 603             if (PL_op)
 604                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 605                             "%s in %s", unees, OP_DESC(PL_op));
 606             else
 607                 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 608         }
 609         return 0;
 610     }
 611     while (s < e) {
 612         U8 t = UTF8SKIP(s);
 613
 614         if (e - s < t) {
 615             if (ckWARN_d(WARN_UTF8)) {
 616                 if (PL_op)
 617                     Perl_warner(aTHX_ packWARN(WARN_UTF8),
 618                                 unees, OP_DESC(PL_op));
 619                 else
 620                     Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 621             }
 622             return len;
 623         }
 624         s += t;
 625         len++;
 626     }
 627
 628     return len;
 629 }
 630
 631 /*
 632 =for apidoc A|IV|utf8_distance|U8 *a|U8 *b
 633
 634 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
 635 and C<b>.
 636
 637 WARNING: use only if you *know* that the pointers point inside the
 638 same UTF-8 buffer.
 639
 640 =cut
 641 */
 642
 643 IV
 644 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
 645 {
 646     IV off = 0;
 647
 648     /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
 649      * the bitops (especially ~) can create illegal UTF-8.
 650      * In other words: in Perl UTF-8 is not just for Unicode. */
 651
 652     if (a < b) {
 653         while (a < b) {
 654             U8 c = UTF8SKIP(a);
 655
 656             if (b - a < c) {
 657                 if (ckWARN_d(WARN_UTF8)) {
 658                     if (PL_op)
 659                         Perl_warner(aTHX_ packWARN(WARN_UTF8),
 660                                     "%s in %s", unees, OP_DESC(PL_op));
 661                     else
 662                         Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 663                 }
 664                 return off;
 665             }
 666             a += c;
 667             off--;
 668         }
 669     }
 670     else {
 671         while (b < a) {
 672             U8 c = UTF8SKIP(b);
 673
 674             if (a - b < c) {
 675                 if (ckWARN_d(WARN_UTF8)) {
 676                     if (PL_op)
 677                         Perl_warner(aTHX_ packWARN(WARN_UTF8),
 678                                     "%s in %s", unees, OP_DESC(PL_op));
 679                     else
 680                         Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 681                 }
 682                 return off;
 683             }
 684             b += c;
 685             off++;
 686         }
 687     }
 688
 689     return off;
 690 }
 691
 692 /*
 693 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
 694
 695 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 696 forward or backward.
 697
 698 WARNING: do not use the following unless you *know* C<off> is within
 699 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 700 on the first byte of character or just after the last byte of a character.
 701
 702 =cut
 703 */
 704
 705 U8 *
 706 Perl_utf8_hop(pTHX_ U8 *s, I32 off)
 707 {
 708     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 709      * the bitops (especially ~) can create illegal UTF-8.
 710      * In other words: in Perl UTF-8 is not just for Unicode. */
 711
 712     if (off >= 0) {
 713         while (off--)
 714             s += UTF8SKIP(s);
 715     }
 716     else {
 717         while (off++) {
 718             s--;
 719             while (UTF8_IS_CONTINUATION(*s))
 720                 s--;
 721         }
 722     }
 723     return s;
 724 }
 725
 726 /*
 727 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 728
 729 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 730 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 731 updates len to contain the new length.
 732 Returns zero on failure, setting C<len> to -1.
 733
 734 =cut
 735 */
 736
 737 U8 *
 738 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 739 {
 740     U8 *send;
 741     U8 *d;
 742     U8 *save = s;
 743
 744     /* ensure valid UTF-8 and chars < 256 before updating string */
 745     for (send = s + *len; s < send; ) {
 746         U8 c = *s++;
 747
 748         if (!UTF8_IS_INVARIANT(c) &&
 749             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 750              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 751             *len = -1;
 752             return 0;
 753         }
 754     }
 755
 756     d = s = save;
 757     while (s < send) {
 758         STRLEN ulen;
 759         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 760         s += ulen;
 761     }
 762     *d = '\0';
 763     *len = d - save;
 764     return save;
 765 }
 766
 767 /*
 768 =for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
 769
 770 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 771 Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 772 the newly-created string, and updates C<len> to contain the new
 773 length.  Returns the original string if no conversion occurs, C<len>
 774 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 775 0 if C<s> is converted or contains all 7bit characters.
 776
 777 =cut
 778 */
 779
 780 U8 *
 781 Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
 782 {
 783     U8 *d;
 784     U8 *start = s;
 785     U8 *send;
 786     I32 count = 0;
 787
 788     if (!*is_utf8)
 789         return start;
 790
 791     /* ensure valid UTF-8 and chars < 256 before converting string */
 792     for (send = s + *len; s < send;) {
 793         U8 c = *s++;
 794         if (!UTF8_IS_INVARIANT(c)) {
 795             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 796                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 797                 count++;
 798             else
 799                 return start;
 800         }
 801     }
 802
 803     *is_utf8 = 0;
 804
 805     Newz(801, d, (*len) - count + 1, U8);
 806     s = start; start = d;
 807     while (s < send) {
 808         U8 c = *s++;
 809         if (!UTF8_IS_INVARIANT(c)) {
 810             /* Then it is two-byte encoded */
 811             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 812             c = ASCII_TO_NATIVE(c);
 813         }
 814         *d++ = c;
 815     }
 816     *d = '\0';
 817     *len = d - start;
 818     return start;
 819 }
 820
 821 /*
 822 =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
 823
 824 Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
 825 Returns a pointer to the newly-created string, and sets C<len> to
 826 reflect the new length.
 827
 828 If you want to convert to UTF-8 from other encodings than ASCII,
 829 see sv_recode_to_utf8().
 830
 831 =cut
 832 */
 833
 834 U8*
 835 Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
 836 {
 837     U8 *send;
 838     U8 *d;
 839     U8 *dst;
 840     send = s + (*len);
 841
 842     Newz(801, d, (*len) * 2 + 1, U8);
 843     dst = d;
 844
 845     while (s < send) {
 846         UV uv = NATIVE_TO_ASCII(*s++);
 847         if (UNI_IS_INVARIANT(uv))
 848             *d++ = (U8)UTF_TO_NATIVE(uv);
 849         else {
 850             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
 851             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
 852         }
 853     }
 854     *d = '\0';
 855     *len = d-dst;
 856     return dst;
 857 }
 858
 859 /*
 860  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 861  *
 862  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 863  * We optimize for native, for obvious reasons. */
 864
 865 U8*
 866 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 867 {
 868     U8* pend;
 869     U8* dstart = d;
 870
 871     if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
 872          d[0] = 0;
 873          *newlen = 1;
 874          return d;
 875     }
 876
 877     if (bytelen & 1)
 878         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %d", bytelen);
 879
 880     pend = p + bytelen;
 881
 882     while (p < pend) {
 883         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 884         p += 2;
 885         if (uv < 0x80) {
 886             *d++ = (U8)uv;
 887             continue;
 888         }
 889         if (uv < 0x800) {
 890             *d++ = (U8)(( uv >>  6)         | 0xc0);
 891             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 892             continue;
 893         }
 894         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 895             UV low = (p[0] << 8) + p[1];
 896             p += 2;
 897             if (low < 0xdc00 || low >= 0xdfff)
 898                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 899             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 900         }
 901         if (uv < 0x10000) {
 902             *d++ = (U8)(( uv >> 12)         | 0xe0);
 903             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 904             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 905             continue;
 906         }
 907         else {
 908             *d++ = (U8)(( uv >> 18)         | 0xf0);
 909             *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 910             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 911             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 912             continue;
 913         }
 914     }
 915     *newlen = d - dstart;
 916     return d;
 917 }
 918
 919 /* Note: this one is slightly destructive of the source. */
 920
 921 U8*
 922 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 923 {
 924     U8* s = (U8*)p;
 925     U8* send = s + bytelen;
 926     while (s < send) {
 927         U8 tmp = s[0];
 928         s[0] = s[1];
 929         s[1] = tmp;
 930         s += 2;
 931     }
 932     return utf16_to_utf8(p, d, bytelen, newlen);
 933 }
 934
 935 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 936
 937 bool
 938 Perl_is_uni_alnum(pTHX_ UV c)
 939 {
 940     U8 tmpbuf[UTF8_MAXLEN+1];
 941     uvchr_to_utf8(tmpbuf, c);
 942     return is_utf8_alnum(tmpbuf);
 943 }
 944
 945 bool
 946 Perl_is_uni_alnumc(pTHX_ UV c)
 947 {
 948     U8 tmpbuf[UTF8_MAXLEN+1];
 949     uvchr_to_utf8(tmpbuf, c);
 950     return is_utf8_alnumc(tmpbuf);
 951 }
 952
 953 bool
 954 Perl_is_uni_idfirst(pTHX_ UV c)
 955 {
 956     U8 tmpbuf[UTF8_MAXLEN+1];
 957     uvchr_to_utf8(tmpbuf, c);
 958     return is_utf8_idfirst(tmpbuf);
 959 }
 960
 961 bool
 962 Perl_is_uni_alpha(pTHX_ UV c)
 963 {
 964     U8 tmpbuf[UTF8_MAXLEN+1];
 965     uvchr_to_utf8(tmpbuf, c);
 966     return is_utf8_alpha(tmpbuf);
 967 }
 968
 969 bool
 970 Perl_is_uni_ascii(pTHX_ UV c)
 971 {
 972     U8 tmpbuf[UTF8_MAXLEN+1];
 973     uvchr_to_utf8(tmpbuf, c);
 974     return is_utf8_ascii(tmpbuf);
 975 }
 976
 977 bool
 978 Perl_is_uni_space(pTHX_ UV c)
 979 {
 980     U8 tmpbuf[UTF8_MAXLEN+1];
 981     uvchr_to_utf8(tmpbuf, c);
 982     return is_utf8_space(tmpbuf);
 983 }
 984
 985 bool
 986 Perl_is_uni_digit(pTHX_ UV c)
 987 {
 988     U8 tmpbuf[UTF8_MAXLEN+1];
 989     uvchr_to_utf8(tmpbuf, c);
 990     return is_utf8_digit(tmpbuf);
 991 }
 992
 993 bool
 994 Perl_is_uni_upper(pTHX_ UV c)
 995 {
 996     U8 tmpbuf[UTF8_MAXLEN+1];
 997     uvchr_to_utf8(tmpbuf, c);
 998     return is_utf8_upper(tmpbuf);
 999 }
1000
1001 bool
1002 Perl_is_uni_lower(pTHX_ UV c)
1003 {
1004     U8 tmpbuf[UTF8_MAXLEN+1];
1005     uvchr_to_utf8(tmpbuf, c);
1006     return is_utf8_lower(tmpbuf);
1007 }
1008
1009 bool
1010 Perl_is_uni_cntrl(pTHX_ UV c)
1011 {
1012     U8 tmpbuf[UTF8_MAXLEN+1];
1013     uvchr_to_utf8(tmpbuf, c);
1014     return is_utf8_cntrl(tmpbuf);
1015 }
1016
1017 bool
1018 Perl_is_uni_graph(pTHX_ UV c)
1019 {
1020     U8 tmpbuf[UTF8_MAXLEN+1];
1021     uvchr_to_utf8(tmpbuf, c);
1022     return is_utf8_graph(tmpbuf);
1023 }
1024
1025 bool
1026 Perl_is_uni_print(pTHX_ UV c)
1027 {
1028     U8 tmpbuf[UTF8_MAXLEN+1];
1029     uvchr_to_utf8(tmpbuf, c);
1030     return is_utf8_print(tmpbuf);
1031 }
1032
1033 bool
1034 Perl_is_uni_punct(pTHX_ UV c)
1035 {
1036     U8 tmpbuf[UTF8_MAXLEN+1];
1037     uvchr_to_utf8(tmpbuf, c);
1038     return is_utf8_punct(tmpbuf);
1039 }
1040
1041 bool
1042 Perl_is_uni_xdigit(pTHX_ UV c)
1043 {
1044     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1045     uvchr_to_utf8(tmpbuf, c);
1046     return is_utf8_xdigit(tmpbuf);
1047 }
1048
1049 UV
1050 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1051 {
1052     uvchr_to_utf8(p, c);
1053     return to_utf8_upper(p, p, lenp);
1054 }
1055
1056 UV
1057 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1058 {
1059     uvchr_to_utf8(p, c);
1060     return to_utf8_title(p, p, lenp);
1061 }
1062
1063 UV
1064 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1065 {
1066     uvchr_to_utf8(p, c);
1067     return to_utf8_lower(p, p, lenp);
1068 }
1069
1070 UV
1071 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1072 {
1073     uvchr_to_utf8(p, c);
1074     return to_utf8_fold(p, p, lenp);
1075 }
1076
1077 /* for now these all assume no locale info available for Unicode > 255 */
1078
1079 bool
1080 Perl_is_uni_alnum_lc(pTHX_ UV c)
1081 {
1082     return is_uni_alnum(c);     /* XXX no locale support yet */
1083 }
1084
1085 bool
1086 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1087 {
1088     return is_uni_alnumc(c);    /* XXX no locale support yet */
1089 }
1090
1091 bool
1092 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1093 {
1094     return is_uni_idfirst(c);   /* XXX no locale support yet */
1095 }
1096
1097 bool
1098 Perl_is_uni_alpha_lc(pTHX_ UV c)
1099 {
1100     return is_uni_alpha(c);     /* XXX no locale support yet */
1101 }
1102
1103 bool
1104 Perl_is_uni_ascii_lc(pTHX_ UV c)
1105 {
1106     return is_uni_ascii(c);     /* XXX no locale support yet */
1107 }
1108
1109 bool
1110 Perl_is_uni_space_lc(pTHX_ UV c)
1111 {
1112     return is_uni_space(c);     /* XXX no locale support yet */
1113 }
1114
1115 bool
1116 Perl_is_uni_digit_lc(pTHX_ UV c)
1117 {
1118     return is_uni_digit(c);     /* XXX no locale support yet */
1119 }
1120
1121 bool
1122 Perl_is_uni_upper_lc(pTHX_ UV c)
1123 {
1124     return is_uni_upper(c);     /* XXX no locale support yet */
1125 }
1126
1127 bool
1128 Perl_is_uni_lower_lc(pTHX_ UV c)
1129 {
1130     return is_uni_lower(c);     /* XXX no locale support yet */
1131 }
1132
1133 bool
1134 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1135 {
1136     return is_uni_cntrl(c);     /* XXX no locale support yet */
1137 }
1138
1139 bool
1140 Perl_is_uni_graph_lc(pTHX_ UV c)
1141 {
1142     return is_uni_graph(c);     /* XXX no locale support yet */
1143 }
1144
1145 bool
1146 Perl_is_uni_print_lc(pTHX_ UV c)
1147 {
1148     return is_uni_print(c);     /* XXX no locale support yet */
1149 }
1150
1151 bool
1152 Perl_is_uni_punct_lc(pTHX_ UV c)
1153 {
1154     return is_uni_punct(c);     /* XXX no locale support yet */
1155 }
1156
1157 bool
1158 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1159 {
1160     return is_uni_xdigit(c);    /* XXX no locale support yet */
1161 }
1162
1163 U32
1164 Perl_to_uni_upper_lc(pTHX_ U32 c)
1165 {
1166     /* XXX returns only the first character -- do not use XXX */
1167     /* XXX no locale support yet */
1168     STRLEN len;
1169     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1170     return (U32)to_uni_upper(c, tmpbuf, &len);
1171 }
1172
1173 U32
1174 Perl_to_uni_title_lc(pTHX_ U32 c)
1175 {
1176     /* XXX returns only the first character XXX -- do not use XXX */
1177     /* XXX no locale support yet */
1178     STRLEN len;
1179     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1180     return (U32)to_uni_title(c, tmpbuf, &len);
1181 }
1182
1183 U32
1184 Perl_to_uni_lower_lc(pTHX_ U32 c)
1185 {
1186     /* XXX returns only the first character -- do not use XXX */
1187     /* XXX no locale support yet */
1188     STRLEN len;
1189     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1190     return (U32)to_uni_lower(c, tmpbuf, &len);
1191 }
1192
1193 bool
1194 Perl_is_utf8_alnum(pTHX_ U8 *p)
1195 {
1196     if (!is_utf8_char(p))
1197         return FALSE;
1198     if (!PL_utf8_alnum)
1199         /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1200          * descendant of isalnum(3), in other words, it doesn't
1201          * contain the '_'. --jhi */
1202         PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1203     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1204 /*    return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1205 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1206     if (!PL_utf8_alnum)
1207         PL_utf8_alnum = swash_init("utf8", "",
1208             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1209     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1210 #endif
1211 }
1212
1213 bool
1214 Perl_is_utf8_alnumc(pTHX_ U8 *p)
1215 {
1216     if (!is_utf8_char(p))
1217         return FALSE;
1218     if (!PL_utf8_alnum)
1219         PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1220     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1221 /*    return is_utf8_alpha(p) || is_utf8_digit(p); */
1222 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1223     if (!PL_utf8_alnum)
1224         PL_utf8_alnum = swash_init("utf8", "",
1225             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1226     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1227 #endif
1228 }
1229
1230 bool
1231 Perl_is_utf8_idfirst(pTHX_ U8 *p) /* The naming is historical. */
1232 {
1233     if (*p == '_')
1234         return TRUE;
1235     if (!is_utf8_char(p))
1236         return FALSE;
1237     if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
1238         PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
1239     return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
1240 }
1241
1242 bool
1243 Perl_is_utf8_idcont(pTHX_ U8 *p)
1244 {
1245     if (*p == '_')
1246         return TRUE;
1247     if (!is_utf8_char(p))
1248         return FALSE;
1249     if (!PL_utf8_idcont)
1250         PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
1251     return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
1252 }
1253
1254 bool
1255 Perl_is_utf8_alpha(pTHX_ U8 *p)
1256 {
1257     if (!is_utf8_char(p))
1258         return FALSE;
1259     if (!PL_utf8_alpha)
1260         PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1261     return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
1262 }
1263
1264 bool
1265 Perl_is_utf8_ascii(pTHX_ U8 *p)
1266 {
1267     if (!is_utf8_char(p))
1268         return FALSE;
1269     if (!PL_utf8_ascii)
1270         PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1271     return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
1272 }
1273
1274 bool
1275 Perl_is_utf8_space(pTHX_ U8 *p)
1276 {
1277     if (!is_utf8_char(p))
1278         return FALSE;
1279     if (!PL_utf8_space)
1280         PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1281     return swash_fetch(PL_utf8_space, p, TRUE) != 0;
1282 }
1283
1284 bool
1285 Perl_is_utf8_digit(pTHX_ U8 *p)
1286 {
1287     if (!is_utf8_char(p))
1288         return FALSE;
1289     if (!PL_utf8_digit)
1290         PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1291     return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
1292 }
1293
1294 bool
1295 Perl_is_utf8_upper(pTHX_ U8 *p)
1296 {
1297     if (!is_utf8_char(p))
1298         return FALSE;
1299     if (!PL_utf8_upper)
1300         PL_utf8_upper = swash_init("utf8", "IsUppercase", &PL_sv_undef, 0, 0);
1301     return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
1302 }
1303
1304 bool
1305 Perl_is_utf8_lower(pTHX_ U8 *p)
1306 {
1307     if (!is_utf8_char(p))
1308         return FALSE;
1309     if (!PL_utf8_lower)
1310         PL_utf8_lower = swash_init("utf8", "IsLowercase", &PL_sv_undef, 0, 0);
1311     return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
1312 }
1313
1314 bool
1315 Perl_is_utf8_cntrl(pTHX_ U8 *p)
1316 {
1317     if (!is_utf8_char(p))
1318         return FALSE;
1319     if (!PL_utf8_cntrl)
1320         PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1321     return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
1322 }
1323
1324 bool
1325 Perl_is_utf8_graph(pTHX_ U8 *p)
1326 {
1327     if (!is_utf8_char(p))
1328         return FALSE;
1329     if (!PL_utf8_graph)
1330         PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1331     return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
1332 }
1333
1334 bool
1335 Perl_is_utf8_print(pTHX_ U8 *p)
1336 {
1337     if (!is_utf8_char(p))
1338         return FALSE;
1339     if (!PL_utf8_print)
1340         PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1341     return swash_fetch(PL_utf8_print, p, TRUE) != 0;
1342 }
1343
1344 bool
1345 Perl_is_utf8_punct(pTHX_ U8 *p)
1346 {
1347     if (!is_utf8_char(p))
1348         return FALSE;
1349     if (!PL_utf8_punct)
1350         PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1351     return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
1352 }
1353
1354 bool
1355 Perl_is_utf8_xdigit(pTHX_ U8 *p)
1356 {
1357     if (!is_utf8_char(p))
1358         return FALSE;
1359     if (!PL_utf8_xdigit)
1360         PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1361     return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
1362 }
1363
1364 bool
1365 Perl_is_utf8_mark(pTHX_ U8 *p)
1366 {
1367     if (!is_utf8_char(p))
1368         return FALSE;
1369     if (!PL_utf8_mark)
1370         PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1371     return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
1372 }
1373
1374 /*
1375 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1376
1377 The "p" contains the pointer to the UTF-8 string encoding
1378 the character that is being converted.
1379
1380 The "ustrp" is a pointer to the character buffer to put the
1381 conversion result to.  The "lenp" is a pointer to the length
1382 of the result.
1383
1384 The "swashp" is a pointer to the swash to use.
1385
1386 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1387 and loaded by SWASHGET, using lib/utf8_heavy.pl.  The special (usually,
1388 but not always, a multicharacter mapping), is tried first.
1389
1390 The "special" is a string like "utf8::ToSpecLower", which means the
1391 hash %utf8::ToSpecLower.  The access to the hash is through
1392 Perl_to_utf8_case().
1393
1394 The "normal" is a string like "ToLower" which means the swash
1395 %utf8::ToLower.
1396
1397 =cut */
1398
1399 UV
1400 Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
1401 {
1402     UV uv0, uv1;
1403     U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
1404     STRLEN len = 0;
1405
1406     uv0 = utf8_to_uvchr(p, 0);
1407     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1408      * are necessary in EBCDIC, they are redundant no-ops
1409      * in ASCII-ish platforms, and hopefully optimized away. */
1410     uv1 = NATIVE_TO_UNI(uv0);
1411     uvuni_to_utf8(tmpbuf, uv1);
1412
1413     if (!*swashp) /* load on-demand */
1414          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1415
1416     /* The 0xDF is the only special casing Unicode code point below 0x100. */
1417     if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1418          /* It might be "special" (sometimes, but not always,
1419           * a multicharacter mapping) */
1420          HV *hv;
1421          SV **svp;
1422
1423          if ((hv  = get_hv(special, FALSE)) &&
1424              (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1425              (*svp)) {
1426               char *s;
1427
1428               s = SvPV(*svp, len);
1429               if (len == 1)
1430                    len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1431               else {
1432 #ifdef EBCDIC
1433                    /* If we have EBCDIC we need to remap the characters
1434                     * since any characters in the low 256 are Unicode
1435                     * code points, not EBCDIC. */
1436                    U8 *t = (U8*)s, *tend = t + len, *d;
1437
1438                    d = tmpbuf;
1439                    if (SvUTF8(*svp)) {
1440                         STRLEN tlen = 0;
1441
1442                         while (t < tend) {
1443                              UV c = utf8_to_uvchr(t, &tlen);
1444                              if (tlen > 0) {
1445                                   d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1446                                   t += tlen;
1447                              }
1448                              else
1449                                   break;
1450                         }
1451                    }
1452                    else {
1453                         while (t < tend) {
1454                              d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1455                              t++;
1456                         }
1457                    }
1458                    len = d - tmpbuf;
1459                    Copy(tmpbuf, ustrp, len, U8);
1460 #else
1461                    Copy(s, ustrp, len, U8);
1462 #endif
1463               }
1464          }
1465     }
1466
1467     if (!len && *swashp) {
1468          UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1469
1470          if (uv2) {
1471               /* It was "normal" (a single character mapping). */
1472               UV uv3 = UNI_TO_NATIVE(uv2);
1473
1474               len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1475          }
1476     }
1477
1478     if (!len) /* Neither: just copy. */
1479          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1480
1481     if (lenp)
1482          *lenp = len;
1483
1484     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1485 }
1486
1487 /*
1488 =for apidoc A|UV|to_utf8_upper|U8 *p|U8 *ustrp|STRLEN *lenp
1489
1490 Convert the UTF-8 encoded character at p to its uppercase version and
1491 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1492 that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1493 uppercase version may be longer than the original character (up to two
1494 characters).
1495
1496 The first character of the uppercased version is returned
1497 (but note, as explained above, that there may be more.)
1498
1499 =cut */
1500
1501 UV
1502 Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1503 {
1504     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1505                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1506 }
1507
1508 /*
1509 =for apidoc A|UV|to_utf8_title|U8 *p|U8 *ustrp|STRLEN *lenp
1510
1511 Convert the UTF-8 encoded character at p to its titlecase version and
1512 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1513 that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1514 titlecase version may be longer than the original character (up to two
1515 characters).
1516
1517 The first character of the titlecased version is returned
1518 (but note, as explained above, that there may be more.)
1519
1520 =cut */
1521
1522 UV
1523 Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1524 {
1525     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1526                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1527 }
1528
1529 /*
1530 =for apidoc A|UV|to_utf8_lower|U8 *p|U8 *ustrp|STRLEN *lenp
1531
1532 Convert the UTF-8 encoded character at p to its lowercase version and
1533 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1534 that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1535 lowercase version may be longer than the original character (up to two
1536 characters).
1537
1538 The first character of the lowercased version is returned
1539 (but note, as explained above, that there may be more.)
1540
1541 =cut */
1542
1543 UV
1544 Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1545 {
1546     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1547                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1548 }
1549
1550 /*
1551 =for apidoc A|UV|to_utf8_fold|U8 *p|U8 *ustrp|STRLEN *lenp
1552
1553 Convert the UTF-8 encoded character at p to its foldcase version and
1554 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1555 that the ustrp needs to be at least UTF8_MAXLEN_FOLD+1 bytes since the
1556 foldcase version may be longer than the original character (up to
1557 three characters).
1558
1559 The first character of the foldcased version is returned
1560 (but note, as explained above, that there may be more.)
1561
1562 =cut */
1563
1564 UV
1565 Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1566 {
1567     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1568                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1569 }
1570
1571 /* a "swash" is a swatch hash */
1572
1573 SV*
1574 Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1575 {
1576     SV* retval;
1577     SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1578     dSP;
1579     size_t pkg_len = strlen(pkg);
1580     size_t name_len = strlen(name);
1581     HV *stash = gv_stashpvn(pkg, pkg_len, FALSE);
1582     SV* errsv_save;
1583
1584     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1585         ENTER;
1586         errsv_save = newSVsv(ERRSV);
1587         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1588                          Nullsv);
1589         if (!SvTRUE(ERRSV))
1590             sv_setsv(ERRSV, errsv_save);
1591         SvREFCNT_dec(errsv_save);
1592         LEAVE;
1593     }
1594     SPAGAIN;
1595     PUSHSTACKi(PERLSI_MAGIC);
1596     PUSHMARK(SP);
1597     EXTEND(SP,5);
1598     PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1599     PUSHs(sv_2mortal(newSVpvn(name, name_len)));
1600     PUSHs(listsv);
1601     PUSHs(sv_2mortal(newSViv(minbits)));
1602     PUSHs(sv_2mortal(newSViv(none)));
1603     PUTBACK;
1604     ENTER;
1605     SAVEI32(PL_hints);
1606     PL_hints = 0;
1607     save_re_context();
1608     if (IN_PERL_COMPILETIME) {
1609         /* XXX ought to be handled by lex_start */
1610         SAVEI32(PL_in_my);
1611         PL_in_my = 0;
1612         sv_setpv(tokenbufsv, PL_tokenbuf);
1613     }
1614     errsv_save = newSVsv(ERRSV);
1615     if (call_method("SWASHNEW", G_SCALAR))
1616         retval = newSVsv(*PL_stack_sp--);
1617     else
1618         retval = &PL_sv_undef;
1619     if (!SvTRUE(ERRSV))
1620         sv_setsv(ERRSV, errsv_save);
1621     SvREFCNT_dec(errsv_save);
1622     LEAVE;
1623     POPSTACK;
1624     if (IN_PERL_COMPILETIME) {
1625         STRLEN len;
1626         char* pv = SvPV(tokenbufsv, len);
1627
1628         Copy(pv, PL_tokenbuf, len+1, char);
1629         PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1630     }
1631     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1632         if (SvPOK(retval))
1633             Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1634                        retval);
1635         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1636     }
1637     return retval;
1638 }
1639
1640
1641 /* This API is wrong for special case conversions since we may need to
1642  * return several Unicode characters for a single Unicode character
1643  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1644  * the lower-level routine, and it is similarly broken for returning
1645  * multiple values.  --jhi */
1646 UV
1647 Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
1648 {
1649     HV* hv = (HV*)SvRV(sv);
1650     U32 klen;
1651     U32 off;
1652     STRLEN slen;
1653     STRLEN needents;
1654     U8 *tmps = NULL;
1655     U32 bit;
1656     SV *retval;
1657     U8 tmputf8[2];
1658     UV c = NATIVE_TO_ASCII(*ptr);
1659
1660     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1661         tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1662         tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1663         ptr = tmputf8;
1664     }
1665     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1666      * then the "swatch" is a vec() for al the chars which start
1667      * with 0xAA..0xYY
1668      * So the key in the hash (klen) is length of encoded char -1
1669      */
1670     klen = UTF8SKIP(ptr) - 1;
1671     off  = ptr[klen];
1672
1673     if (klen == 0)
1674      {
1675       /* If char in invariant then swatch is for all the invariant chars
1676        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1677        */
1678       needents = UTF_CONTINUATION_MARK;
1679       off      = NATIVE_TO_UTF(ptr[klen]);
1680      }
1681     else
1682      {
1683       /* If char is encoded then swatch is for the prefix */
1684       needents = (1 << UTF_ACCUMULATION_SHIFT);
1685       off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1686      }
1687
1688     /*
1689      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1690      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1691      * it's nothing to sniff at.)  Pity we usually come through at least
1692      * two function calls to get here...
1693      *
1694      * NB: this code assumes that swatches are never modified, once generated!
1695      */
1696
1697     if (hv   == PL_last_swash_hv &&
1698         klen == PL_last_swash_klen &&
1699         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1700     {
1701         tmps = PL_last_swash_tmps;
1702         slen = PL_last_swash_slen;
1703     }
1704     else {
1705         /* Try our second-level swatch cache, kept in a hash. */
1706         SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1707
1708         /* If not cached, generate it via utf8::SWASHGET */
1709         if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1710             dSP;
1711             /* We use utf8n_to_uvuni() as we want an index into
1712                Unicode tables, not a native character number.
1713              */
1714             UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, 0,
1715                                            ckWARN(WARN_UTF8) ?
1716                                            0 : UTF8_ALLOW_ANY);
1717             SV *errsv_save;
1718             ENTER;
1719             SAVETMPS;
1720             save_re_context();
1721             PUSHSTACKi(PERLSI_MAGIC);
1722             PUSHMARK(SP);
1723             EXTEND(SP,3);
1724             PUSHs((SV*)sv);
1725             /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1726             PUSHs(sv_2mortal(newSViv((klen) ?
1727                                      (code_point & ~(needents - 1)) : 0)));
1728             PUSHs(sv_2mortal(newSViv(needents)));
1729             PUTBACK;
1730             errsv_save = newSVsv(ERRSV);
1731             if (call_method("SWASHGET", G_SCALAR))
1732                 retval = newSVsv(*PL_stack_sp--);
1733             else
1734                 retval = &PL_sv_undef;
1735             if (!SvTRUE(ERRSV))
1736                 sv_setsv(ERRSV, errsv_save);
1737             SvREFCNT_dec(errsv_save);
1738             POPSTACK;
1739             FREETMPS;
1740             LEAVE;
1741             if (IN_PERL_COMPILETIME)
1742                 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1743
1744             svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1745
1746             if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1747                 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1748         }
1749
1750         PL_last_swash_hv = hv;
1751         PL_last_swash_klen = klen;
1752         PL_last_swash_tmps = tmps;
1753         PL_last_swash_slen = slen;
1754         if (klen)
1755             Copy(ptr, PL_last_swash_key, klen, U8);
1756     }
1757
1758     switch ((int)((slen << 3) / needents)) {
1759     case 1:
1760         bit = 1 << (off & 7);
1761         off >>= 3;
1762         return (tmps[off] & bit) != 0;
1763     case 8:
1764         return tmps[off];
1765     case 16:
1766         off <<= 1;
1767         return (tmps[off] << 8) + tmps[off + 1] ;
1768     case 32:
1769         off <<= 2;
1770         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1771     }
1772     Perl_croak(aTHX_ "panic: swash_fetch");
1773     return 0;
1774 }
1775
1776
1777 /*
1778 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1779
1780 Adds the UTF-8 representation of the Native codepoint C<uv> to the end
1781 of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1782 bytes available. The return value is the pointer to the byte after the
1783 end of the new character. In other words,
1784
1785     d = uvchr_to_utf8(d, uv);
1786
1787 is the recommended wide native character-aware way of saying
1788
1789     *(d++) = uv;
1790
1791 =cut
1792 */
1793
1794 /* On ASCII machines this is normally a macro but we want a
1795    real function in case XS code wants it
1796 */
1797 #undef Perl_uvchr_to_utf8
1798 U8 *
1799 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1800 {
1801     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
1802 }
1803
1804 U8 *
1805 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1806 {
1807     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1808 }
1809
1810 /*
1811 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1812
1813 Returns the native character value of the first character in the string C<s>
1814 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
1815 length, in bytes, of that character.
1816
1817 Allows length and flags to be passed to low level routine.
1818
1819 =cut
1820 */
1821 /* On ASCII machines this is normally a macro but we want
1822    a real function in case XS code wants it
1823 */
1824 #undef Perl_utf8n_to_uvchr
1825 UV
1826 Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1827 {
1828     UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1829     return UNI_TO_NATIVE(uv);
1830 }
1831
1832 /*
1833 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1834
1835 Build to the scalar dsv a displayable version of the string spv,
1836 length len, the displayable version being at most pvlim bytes long
1837 (if longer, the rest is truncated and "..." will be appended).
1838
1839 The flags argument can have UNI_DISPLAY_ISPRINT set to display
1840 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
1841 to display the \\[nrfta\\] as the backslashed versions (like '\n')
1842 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1843 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1844 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1845
1846 The pointer to the PV of the dsv is returned.
1847
1848 =cut */
1849 char *
1850 Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1851 {
1852     int truncated = 0;
1853     char *s, *e;
1854
1855     sv_setpvn(dsv, "", 0);
1856     for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1857          UV u;
1858          bool ok = FALSE;
1859
1860          if (pvlim && SvCUR(dsv) >= pvlim) {
1861               truncated++;
1862               break;
1863          }
1864          u = utf8_to_uvchr((U8*)s, 0);
1865          if (u < 256) {
1866              if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1867                  switch (u & 0xFF) {
1868                  case '\n':
1869                      Perl_sv_catpvf(aTHX_ dsv, "\\n"); ok = TRUE; break;
1870                  case '\r':
1871                      Perl_sv_catpvf(aTHX_ dsv, "\\r"); ok = TRUE; break;
1872                  case '\t':
1873                      Perl_sv_catpvf(aTHX_ dsv, "\\t"); ok = TRUE; break;
1874                  case '\f':
1875                      Perl_sv_catpvf(aTHX_ dsv, "\\f"); ok = TRUE; break;
1876                  case '\a':
1877                      Perl_sv_catpvf(aTHX_ dsv, "\\a"); ok = TRUE; break;
1878                  case '\\':
1879                      Perl_sv_catpvf(aTHX_ dsv, "\\\\" ); ok = TRUE; break;
1880                  default: break;
1881                  }
1882              }
1883              /* isPRINT() is the locale-blind version. */
1884              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(u & 0xFF)) {
1885                  Perl_sv_catpvf(aTHX_ dsv, "%c", (char)(u & 0xFF));
1886                  ok = TRUE;
1887              }
1888          }
1889          if (!ok)
1890              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1891     }
1892     if (truncated)
1893          sv_catpvn(dsv, "...", 3);
1894
1895     return SvPVX(dsv);
1896 }
1897
1898 /*
1899 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1900
1901 Build to the scalar dsv a displayable version of the scalar sv,
1902 the displayable version being at most pvlim bytes long
1903 (if longer, the rest is truncated and "..." will be appended).
1904
1905 The flags argument is as in pv_uni_display().
1906
1907 The pointer to the PV of the dsv is returned.
1908
1909 =cut */
1910 char *
1911 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1912 {
1913      return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1914                                 pvlim, flags);
1915 }
1916
1917 /*
1918 =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
1919
1920 Return true if the strings s1 and s2 differ case-insensitively, false
1921 if not (if they are equal case-insensitively).  If u1 is true, the
1922 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
1923 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
1924 are false, the respective string is assumed to be in native 8-bit
1925 encoding.
1926
1927 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1928 in there (they will point at the beginning of the I<next> character).
1929 If the pointers behind pe1 or pe2 are non-NULL, they are the end
1930 pointers beyond which scanning will not continue under any
1931 circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
1932 s2+l2 will be used as goal end pointers that will also stop the scan,
1933 and which qualify towards defining a successful match: all the scans
1934 that define an explicit length must reach their goal pointers for
1935 a match to succeed).
1936
1937 For case-insensitiveness, the "casefolding" of Unicode is used
1938 instead of upper/lowercasing both the characters, see
1939 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1940
1941 =cut */
1942 I32
1943 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
1944 {
1945      register U8 *p1  = (U8*)s1;
1946      register U8 *p2  = (U8*)s2;
1947      register U8 *e1 = 0, *f1 = 0, *q1 = 0;
1948      register U8 *e2 = 0, *f2 = 0, *q2 = 0;
1949      STRLEN n1 = 0, n2 = 0;
1950      U8 foldbuf1[UTF8_MAXLEN_FOLD+1];
1951      U8 foldbuf2[UTF8_MAXLEN_FOLD+1];
1952      U8 natbuf[1+1];
1953      STRLEN foldlen1, foldlen2;
1954      bool match;
1955
1956      if (pe1)
1957           e1 = *(U8**)pe1;
1958      if (e1 == 0 || (l1 && l1 < (UV)(e1 - (U8*)s1)))
1959           f1 = (U8*)s1 + l1;
1960      if (pe2)
1961           e2 = *(U8**)pe2;
1962      if (e2 == 0 || (l2 && l2 < (UV)(e2 - (U8*)s2)))
1963           f2 = (U8*)s2 + l2;
1964
1965      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1966           return 1; /* mismatch; possible infinite loop or false positive */
1967
1968      if (!u1 || !u2)
1969           natbuf[1] = 0; /* Need to terminate the buffer. */
1970
1971      while ((e1 == 0 || p1 < e1) &&
1972             (f1 == 0 || p1 < f1) &&
1973             (e2 == 0 || p2 < e2) &&
1974             (f2 == 0 || p2 < f2)) {
1975           if (n1 == 0) {
1976                if (u1)
1977                     to_utf8_fold(p1, foldbuf1, &foldlen1);
1978                else {
1979                     natbuf[0] = *p1;
1980                     to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1981                }
1982                q1 = foldbuf1;
1983                n1 = foldlen1;
1984           }
1985           if (n2 == 0) {
1986                if (u2)
1987                     to_utf8_fold(p2, foldbuf2, &foldlen2);
1988                else {
1989                     natbuf[0] = *p2;
1990                     to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1991                }
1992                q2 = foldbuf2;
1993                n2 = foldlen2;
1994           }
1995           while (n1 && n2) {
1996                if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
1997                    (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
1998                     memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
1999                    return 1; /* mismatch */
2000                n1 -= UTF8SKIP(q1);
2001                q1 += UTF8SKIP(q1);
2002                n2 -= UTF8SKIP(q2);
2003                q2 += UTF8SKIP(q2);
2004           }
2005           if (n1 == 0)
2006                p1 += u1 ? UTF8SKIP(p1) : 1;
2007           if (n2 == 0)
2008                p2 += u2 ? UTF8SKIP(p2) : 1;
2009
2010      }
2011
2012      /* A match is defined by all the scans that specified
2013       * an explicit length reaching their final goals. */
2014      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2015
2016      if (match) {
2017           if (pe1)
2018                *pe1 = (char*)p1;
2019           if (pe2)
2020                *pe2 = (char*)p2;
2021      }
2022
2023      return match ? 0 : 1; /* 0 match, 1 mismatch */
2024 }
2025