utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 by Larry Wall and
   4  *    others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  13  * heard of that we don't want to see any closer; and that's the one place
  14  * we're trying to get to!  And that's just where we can't get, nohow.'
  15  *
  16  * 'Well do I understand your speech,' he answered in the same language;
  17  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  18  * as is the custom in the West, if you wish to be answered?'
  19  *
  20  * ...the travellers perceived that the floor was paved with stones of many
  21  * hues; branching runes and strange devices intertwined beneath their feet.
  22  */
  23
  24 #include "EXTERN.h"
  25 #define PERL_IN_UTF8_C
  26 #include "perl.h"
  27
  28 static const char unees[] =
  29     "Malformed UTF-8 character (unexpected end of string)";
  30
  31 /*
  32 =head1 Unicode Support
  33
  34 This file contains various utility functions for manipulating UTF8-encoded
  35 strings. For the uninitiated, this is a method of representing arbitrary
  36 Unicode characters as a variable number of bytes, in such a way that
  37 characters in the ASCII range are unmodified, and a zero byte never appears
  38 within non-zero characters.
  39
  40 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
  41
  42 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
  43 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
  44 bytes available. The return value is the pointer to the byte after the
  45 end of the new character. In other words,
  46
  47     d = uvuni_to_utf8_flags(d, uv, flags);
  48
  49 or, in most cases,
  50
  51     d = uvuni_to_utf8(d, uv);
  52
  53 (which is equivalent to)
  54
  55     d = uvuni_to_utf8_flags(d, uv, 0);
  56
  57 is the recommended Unicode-aware way of saying
  58
  59     *(d++) = uv;
  60
  61 =cut
  62 */
  63
  64 U8 *
  65 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  66 {
  67     if (ckWARN(WARN_UTF8)) {
  68          if (UNICODE_IS_SURROGATE(uv) &&
  69              !(flags & UNICODE_ALLOW_SURROGATE))
  70               Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
  71          else if (
  72                   ((uv >= 0xFDD0 && uv <= 0xFDEF &&
  73                     !(flags & UNICODE_ALLOW_FDD0))
  74                    ||
  75                    ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
  76                     !(flags & UNICODE_ALLOW_FFFF))) &&
  77                   /* UNICODE_ALLOW_SUPER includes
  78                    * FFFEs and FFFFs beyond 0x10FFFF. */
  79                   ((uv <= PERL_UNICODE_MAX) ||
  80                    !(flags & UNICODE_ALLOW_SUPER))
  81                   )
  82               Perl_warner(aTHX_ packWARN(WARN_UTF8),
  83                          "Unicode character 0x%04"UVxf" is illegal", uv);
  84     }
  85     if (UNI_IS_INVARIANT(uv)) {
  86         *d++ = (U8)UTF_TO_NATIVE(uv);
  87         return d;
  88     }
  89 #if defined(EBCDIC)
  90     else {
  91         STRLEN len  = UNISKIP(uv);
  92         U8 *p = d+len-1;
  93         while (p > d) {
  94             *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
  95             uv >>= UTF_ACCUMULATION_SHIFT;
  96         }
  97         *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
  98         return d+len;
  99     }
 100 #else /* Non loop style */
 101     if (uv < 0x800) {
 102         *d++ = (U8)(( uv >>  6)         | 0xc0);
 103         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 104         return d;
 105     }
 106     if (uv < 0x10000) {
 107         *d++ = (U8)(( uv >> 12)         | 0xe0);
 108         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 109         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 110         return d;
 111     }
 112     if (uv < 0x200000) {
 113         *d++ = (U8)(( uv >> 18)         | 0xf0);
 114         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 115         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 116         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 117         return d;
 118     }
 119     if (uv < 0x4000000) {
 120         *d++ = (U8)(( uv >> 24)         | 0xf8);
 121         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 122         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 123         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 124         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 125         return d;
 126     }
 127     if (uv < 0x80000000) {
 128         *d++ = (U8)(( uv >> 30)         | 0xfc);
 129         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 130         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 131         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 132         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 133         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 134         return d;
 135     }
 136 #ifdef HAS_QUAD
 137     if (uv < UTF8_QUAD_MAX)
 138 #endif
 139     {
 140         *d++ =                            0xfe; /* Can't match U+FEFF! */
 141         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 142         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 143         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 144         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 145         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 146         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 147         return d;
 148     }
 149 #ifdef HAS_QUAD
 150     {
 151         *d++ =                            0xff;         /* Can't match U+FFFE! */
 152         *d++ =                            0x80;         /* 6 Reserved bits */
 153         *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);        /* 2 Reserved bits */
 154         *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
 155         *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
 156         *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
 157         *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
 158         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 159         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 160         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 161         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 162         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 163         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 164         return d;
 165     }
 166 #endif
 167 #endif /* Loop style */
 168 }
 169
 170 U8 *
 171 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
 172 {
 173     return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
 174 }
 175
 176
 177 /*
 178 =for apidoc A|STRLEN|is_utf8_char|const U8 *s
 179
 180 Tests if some arbitrary number of bytes begins in a valid UTF-8
 181 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 182 UTF-8 character.  The actual number of bytes in the UTF-8 character
 183 will be returned if it is valid, otherwise 0.
 184
 185 =cut */
 186 STRLEN
 187 Perl_is_utf8_char(pTHX_ const U8 *s)
 188 {
 189     U8 u = *s;
 190     STRLEN slen, len;
 191     UV uv, ouv;
 192
 193     if (UTF8_IS_INVARIANT(u))
 194         return 1;
 195
 196     if (!UTF8_IS_START(u))
 197         return 0;
 198
 199     len = UTF8SKIP(s);
 200
 201     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 202         return 0;
 203
 204     slen = len - 1;
 205     s++;
 206     u &= UTF_START_MASK(len);
 207     uv  = u;
 208     ouv = uv;
 209     while (slen--) {
 210         if (!UTF8_IS_CONTINUATION(*s))
 211             return 0;
 212         uv = UTF8_ACCUMULATE(uv, *s);
 213         if (uv < ouv)
 214             return 0;
 215         ouv = uv;
 216         s++;
 217     }
 218
 219     if ((STRLEN)UNISKIP(uv) < len)
 220         return 0;
 221
 222     return len;
 223 }
 224
 225 /*
 226 =for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
 227
 228 Returns true if first C<len> bytes of the given string form a valid
 229 UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
 230 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
 231 because a valid ASCII string is a valid UTF-8 string.
 232
 233 =cut
 234 */
 235
 236 bool
 237 Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
 238 {
 239     const U8* x = s;
 240     const U8* send;
 241     STRLEN c;
 242
 243     if (!len && s)
 244         len = strlen((const char *)s);
 245     send = s + len;
 246
 247     while (x < send) {
 248          /* Inline the easy bits of is_utf8_char() here for speed... */
 249          if (UTF8_IS_INVARIANT(*x))
 250               c = 1;
 251          else if (!UTF8_IS_START(*x))
 252               return FALSE;
 253          else {
 254               /* ... and call is_utf8_char() only if really needed. */
 255               c = is_utf8_char(x);
 256               if (!c)
 257                    return FALSE;
 258          }
 259         x += c;
 260     }
 261     if (x != send)
 262         return FALSE;
 263
 264     return TRUE;
 265 }
 266
 267 /*
 268 =for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **p
 269
 270 Like is_ut8_string but store the location of the failure in
 271 the last argument.
 272
 273 =cut
 274 */
 275
 276 bool
 277 Perl_is_utf8_string_loc(pTHX_ const U8 *s, STRLEN len, const U8 **p)
 278 {
 279     const U8* x = s;
 280     const U8* send;
 281     STRLEN c;
 282
 283     if (!len && s)
 284         len = strlen((const char *)s);
 285     send = s + len;
 286
 287     while (x < send) {
 288          /* Inline the easy bits of is_utf8_char() here for speed... */
 289          if (UTF8_IS_INVARIANT(*x))
 290               c = 1;
 291          else if (!UTF8_IS_START(*x)) {
 292               if (p)
 293                   *p = x;
 294               return FALSE;
 295          }
 296          else {
 297               /* ... and call is_utf8_char() only if really needed. */
 298               c = is_utf8_char(x);
 299               if (!c) {
 300                    if (p)
 301                       *p = x;
 302                    return FALSE;
 303               }
 304          }
 305         x += c;
 306     }
 307     if (x != send) {
 308        if (p)
 309            *p = x;
 310         return FALSE;
 311     }
 312
 313     return TRUE;
 314 }
 315
 316 /*
 317 =for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 318
 319 Bottom level UTF-8 decode routine.
 320 Returns the unicode code point value of the first character in the string C<s>
 321 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
 322 C<retlen> will be set to the length, in bytes, of that character.
 323
 324 If C<s> does not point to a well-formed UTF-8 character, the behaviour
 325 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 326 it is assumed that the caller will raise a warning, and this function
 327 will silently just set C<retlen> to C<-1> and return zero.  If the
 328 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 329 malformations will be given, C<retlen> will be set to the expected
 330 length of the UTF-8 character in bytes, and zero will be returned.
 331
 332 The C<flags> can also contain various flags to allow deviations from
 333 the strict UTF-8 encoding (see F<utf8.h>).
 334
 335 Most code should use utf8_to_uvchr() rather than call this directly.
 336
 337 =cut
 338 */
 339
 340 UV
 341 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 342 {
 343     const U8 *s0 = s;
 344     UV uv = *s, ouv = 0;
 345     STRLEN len = 1;
 346     const bool dowarn = ckWARN_d(WARN_UTF8);
 347     const UV startbyte = *s;
 348     STRLEN expectlen = 0;
 349     U32 warning = 0;
 350
 351 /* This list is a superset of the UTF8_ALLOW_XXX. */
 352
 353 #define UTF8_WARN_EMPTY                          1
 354 #define UTF8_WARN_CONTINUATION                   2
 355 #define UTF8_WARN_NON_CONTINUATION               3
 356 #define UTF8_WARN_FE_FF                          4
 357 #define UTF8_WARN_SHORT                          5
 358 #define UTF8_WARN_OVERFLOW                       6
 359 #define UTF8_WARN_SURROGATE                      7
 360 #define UTF8_WARN_LONG                           8
 361 #define UTF8_WARN_FFFF                           9 /* Also FFFE. */
 362
 363     if (curlen == 0 &&
 364         !(flags & UTF8_ALLOW_EMPTY)) {
 365         warning = UTF8_WARN_EMPTY;
 366         goto malformed;
 367     }
 368
 369     if (UTF8_IS_INVARIANT(uv)) {
 370         if (retlen)
 371             *retlen = 1;
 372         return (UV) (NATIVE_TO_UTF(*s));
 373     }
 374
 375     if (UTF8_IS_CONTINUATION(uv) &&
 376         !(flags & UTF8_ALLOW_CONTINUATION)) {
 377         warning = UTF8_WARN_CONTINUATION;
 378         goto malformed;
 379     }
 380
 381     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 382         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 383         warning = UTF8_WARN_NON_CONTINUATION;
 384         goto malformed;
 385     }
 386
 387 #ifdef EBCDIC
 388     uv = NATIVE_TO_UTF(uv);
 389 #else
 390     if ((uv == 0xfe || uv == 0xff) &&
 391         !(flags & UTF8_ALLOW_FE_FF)) {
 392         warning = UTF8_WARN_FE_FF;
 393         goto malformed;
 394     }
 395 #endif
 396
 397     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 398     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 399     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 400     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 401 #ifdef EBCDIC
 402     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 403     else                        { len =  7; uv &= 0x01; }
 404 #else
 405     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 406     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 407     else                        { len = 13; uv = 0; } /* whoa! */
 408 #endif
 409
 410     if (retlen)
 411         *retlen = len;
 412
 413     expectlen = len;
 414
 415     if ((curlen < expectlen) &&
 416         !(flags & UTF8_ALLOW_SHORT)) {
 417         warning = UTF8_WARN_SHORT;
 418         goto malformed;
 419     }
 420
 421     len--;
 422     s++;
 423     ouv = uv;
 424
 425     while (len--) {
 426         if (!UTF8_IS_CONTINUATION(*s) &&
 427             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 428             s--;
 429             warning = UTF8_WARN_NON_CONTINUATION;
 430             goto malformed;
 431         }
 432         else
 433             uv = UTF8_ACCUMULATE(uv, *s);
 434         if (!(uv > ouv)) {
 435             /* These cannot be allowed. */
 436             if (uv == ouv) {
 437                 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
 438                     warning = UTF8_WARN_LONG;
 439                     goto malformed;
 440                 }
 441             }
 442             else { /* uv < ouv */
 443                 /* This cannot be allowed. */
 444                 warning = UTF8_WARN_OVERFLOW;
 445                 goto malformed;
 446             }
 447         }
 448         s++;
 449         ouv = uv;
 450     }
 451
 452     if (UNICODE_IS_SURROGATE(uv) &&
 453         !(flags & UTF8_ALLOW_SURROGATE)) {
 454         warning = UTF8_WARN_SURROGATE;
 455         goto malformed;
 456     } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
 457                !(flags & UTF8_ALLOW_LONG)) {
 458         warning = UTF8_WARN_LONG;
 459         goto malformed;
 460     } else if (UNICODE_IS_ILLEGAL(uv) &&
 461                !(flags & UTF8_ALLOW_FFFF)) {
 462         warning = UTF8_WARN_FFFF;
 463         goto malformed;
 464     }
 465
 466     return uv;
 467
 468 malformed:
 469
 470     if (flags & UTF8_CHECK_ONLY) {
 471         if (retlen)
 472             *retlen = -1;
 473         return 0;
 474     }
 475
 476     if (dowarn) {
 477         SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
 478
 479         switch (warning) {
 480         case 0: /* Intentionally empty. */ break;
 481         case UTF8_WARN_EMPTY:
 482             Perl_sv_catpv(aTHX_ sv, "(empty string)");
 483             break;
 484         case UTF8_WARN_CONTINUATION:
 485             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 486             break;
 487         case UTF8_WARN_NON_CONTINUATION:
 488             if (s == s0)
 489                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 490                            (UV)s[1], startbyte);
 491             else
 492                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 493                            (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
 494
 495             break;
 496         case UTF8_WARN_FE_FF:
 497             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 498             break;
 499         case UTF8_WARN_SHORT:
 500             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 501                            curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
 502             expectlen = curlen;         /* distance for caller to skip */
 503             break;
 504         case UTF8_WARN_OVERFLOW:
 505             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 506                            ouv, *s, startbyte);
 507             break;
 508         case UTF8_WARN_SURROGATE:
 509             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 510             break;
 511         case UTF8_WARN_LONG:
 512             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 513                            expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 514             break;
 515         case UTF8_WARN_FFFF:
 516             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 517             break;
 518         default:
 519             Perl_sv_catpv(aTHX_ sv, "(unknown reason)");
 520             break;
 521         }
 522
 523         if (warning) {
 524             char *s = SvPVX(sv);
 525
 526             if (PL_op)
 527                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 528                             "%s in %s", s,  OP_DESC(PL_op));
 529             else
 530                 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
 531         }
 532     }
 533
 534     if (retlen)
 535         *retlen = expectlen ? expectlen : len;
 536
 537     return 0;
 538 }
 539
 540 /*
 541 =for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
 542
 543 Returns the native character value of the first character in the string C<s>
 544 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 545 length, in bytes, of that character.
 546
 547 If C<s> does not point to a well-formed UTF-8 character, zero is
 548 returned and retlen is set, if possible, to -1.
 549
 550 =cut
 551 */
 552
 553 UV
 554 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
 555 {
 556     return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXBYTES, retlen,
 557                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 558 }
 559
 560 /*
 561 =for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
 562
 563 Returns the Unicode code point of the first character in the string C<s>
 564 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 565 length, in bytes, of that character.
 566
 567 This function should only be used when returned UV is considered
 568 an index into the Unicode semantic tables (e.g. swashes).
 569
 570 If C<s> does not point to a well-formed UTF-8 character, zero is
 571 returned and retlen is set, if possible, to -1.
 572
 573 =cut
 574 */
 575
 576 UV
 577 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
 578 {
 579     /* Call the low level routine asking for checks */
 580     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
 581                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 582 }
 583
 584 /*
 585 =for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
 586
 587 Return the length of the UTF-8 char encoded string C<s> in characters.
 588 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 589 up past C<e>, croaks.
 590
 591 =cut
 592 */
 593
 594 STRLEN
 595 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
 596 {
 597     STRLEN len = 0;
 598
 599     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 600      * the bitops (especially ~) can create illegal UTF-8.
 601      * In other words: in Perl UTF-8 is not just for Unicode. */
 602
 603     if (e < s) {
 604         if (ckWARN_d(WARN_UTF8)) {
 605             if (PL_op)
 606                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 607                             "%s in %s", unees, OP_DESC(PL_op));
 608             else
 609                 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 610         }
 611         return 0;
 612     }
 613     while (s < e) {
 614         U8 t = UTF8SKIP(s);
 615
 616         if (e - s < t) {
 617             if (ckWARN_d(WARN_UTF8)) {
 618                 if (PL_op)
 619                     Perl_warner(aTHX_ packWARN(WARN_UTF8),
 620                                 unees, OP_DESC(PL_op));
 621                 else
 622                     Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 623             }
 624             return len;
 625         }
 626         s += t;
 627         len++;
 628     }
 629
 630     return len;
 631 }
 632
 633 /*
 634 =for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
 635
 636 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
 637 and C<b>.
 638
 639 WARNING: use only if you *know* that the pointers point inside the
 640 same UTF-8 buffer.
 641
 642 =cut
 643 */
 644
 645 IV
 646 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
 647 {
 648     IV off = 0;
 649
 650     /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
 651      * the bitops (especially ~) can create illegal UTF-8.
 652      * In other words: in Perl UTF-8 is not just for Unicode. */
 653
 654     if (a < b) {
 655         while (a < b) {
 656             const U8 c = UTF8SKIP(a);
 657
 658             if (b - a < c) {
 659                 if (ckWARN_d(WARN_UTF8)) {
 660                     if (PL_op)
 661                         Perl_warner(aTHX_ packWARN(WARN_UTF8),
 662                                     "%s in %s", unees, OP_DESC(PL_op));
 663                     else
 664                         Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 665                 }
 666                 return off;
 667             }
 668             a += c;
 669             off--;
 670         }
 671     }
 672     else {
 673         while (b < a) {
 674             U8 c = UTF8SKIP(b);
 675
 676             if (a - b < c) {
 677                 if (ckWARN_d(WARN_UTF8)) {
 678                     if (PL_op)
 679                         Perl_warner(aTHX_ packWARN(WARN_UTF8),
 680                                     "%s in %s", unees, OP_DESC(PL_op));
 681                     else
 682                         Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 683                 }
 684                 return off;
 685             }
 686             b += c;
 687             off++;
 688         }
 689     }
 690
 691     return off;
 692 }
 693
 694 /*
 695 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
 696
 697 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 698 forward or backward.
 699
 700 WARNING: do not use the following unless you *know* C<off> is within
 701 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 702 on the first byte of character or just after the last byte of a character.
 703
 704 =cut
 705 */
 706
 707 U8 *
 708 Perl_utf8_hop(pTHX_ U8 *s, I32 off)
 709 {
 710     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 711      * the bitops (especially ~) can create illegal UTF-8.
 712      * In other words: in Perl UTF-8 is not just for Unicode. */
 713
 714     if (off >= 0) {
 715         while (off--)
 716             s += UTF8SKIP(s);
 717     }
 718     else {
 719         while (off++) {
 720             s--;
 721             while (UTF8_IS_CONTINUATION(*s))
 722                 s--;
 723         }
 724     }
 725     return s;
 726 }
 727
 728 /*
 729 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 730
 731 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 732 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 733 updates len to contain the new length.
 734 Returns zero on failure, setting C<len> to -1.
 735
 736 =cut
 737 */
 738
 739 U8 *
 740 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 741 {
 742     U8 *send;
 743     U8 *d;
 744     U8 *save = s;
 745
 746     /* ensure valid UTF-8 and chars < 256 before updating string */
 747     for (send = s + *len; s < send; ) {
 748         U8 c = *s++;
 749
 750         if (!UTF8_IS_INVARIANT(c) &&
 751             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 752              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 753             *len = -1;
 754             return 0;
 755         }
 756     }
 757
 758     d = s = save;
 759     while (s < send) {
 760         STRLEN ulen;
 761         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 762         s += ulen;
 763     }
 764     *d = '\0';
 765     *len = d - save;
 766     return save;
 767 }
 768
 769 /*
 770 =for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
 771
 772 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 773 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 774 the newly-created string, and updates C<len> to contain the new
 775 length.  Returns the original string if no conversion occurs, C<len>
 776 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 777 0 if C<s> is converted or contains all 7bit characters.
 778
 779 =cut
 780 */
 781
 782 U8 *
 783 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
 784 {
 785     U8 *d;
 786     const U8 *start = s;
 787     const U8 *send;
 788     I32 count = 0;
 789
 790     if (!*is_utf8)
 791         return (U8 *)start;
 792
 793     /* ensure valid UTF-8 and chars < 256 before converting string */
 794     for (send = s + *len; s < send;) {
 795         U8 c = *s++;
 796         if (!UTF8_IS_INVARIANT(c)) {
 797             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 798                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 799                 count++;
 800             else
 801                 return (U8 *)start;
 802         }
 803     }
 804
 805     *is_utf8 = 0;
 806
 807     Newz(801, d, (*len) - count + 1, U8);
 808     s = start; start = d;
 809     while (s < send) {
 810         U8 c = *s++;
 811         if (!UTF8_IS_INVARIANT(c)) {
 812             /* Then it is two-byte encoded */
 813             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 814             c = ASCII_TO_NATIVE(c);
 815         }
 816         *d++ = c;
 817     }
 818     *d = '\0';
 819     *len = d - start;
 820     return (U8 *)start;
 821 }
 822
 823 /*
 824 =for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
 825
 826 Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
 827 Returns a pointer to the newly-created string, and sets C<len> to
 828 reflect the new length.
 829
 830 If you want to convert to UTF-8 from other encodings than ASCII,
 831 see sv_recode_to_utf8().
 832
 833 =cut
 834 */
 835
 836 U8*
 837 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
 838 {
 839     const U8 * const send = s + (*len);
 840     U8 *d;
 841     U8 *dst;
 842
 843     Newz(801, d, (*len) * 2 + 1, U8);
 844     dst = d;
 845
 846     while (s < send) {
 847         const UV uv = NATIVE_TO_ASCII(*s++);
 848         if (UNI_IS_INVARIANT(uv))
 849             *d++ = (U8)UTF_TO_NATIVE(uv);
 850         else {
 851             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
 852             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
 853         }
 854     }
 855     *d = '\0';
 856     *len = d-dst;
 857     return dst;
 858 }
 859
 860 /*
 861  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 862  *
 863  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 864  * We optimize for native, for obvious reasons. */
 865
 866 U8*
 867 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 868 {
 869     U8* pend;
 870     U8* dstart = d;
 871
 872     if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
 873          d[0] = 0;
 874          *newlen = 1;
 875          return d;
 876     }
 877
 878     if (bytelen & 1)
 879         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
 880
 881     pend = p + bytelen;
 882
 883     while (p < pend) {
 884         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 885         p += 2;
 886         if (uv < 0x80) {
 887             *d++ = (U8)uv;
 888             continue;
 889         }
 890         if (uv < 0x800) {
 891             *d++ = (U8)(( uv >>  6)         | 0xc0);
 892             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 893             continue;
 894         }
 895         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 896             UV low = (p[0] << 8) + p[1];
 897             p += 2;
 898             if (low < 0xdc00 || low >= 0xdfff)
 899                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 900             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 901         }
 902         if (uv < 0x10000) {
 903             *d++ = (U8)(( uv >> 12)         | 0xe0);
 904             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 905             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 906             continue;
 907         }
 908         else {
 909             *d++ = (U8)(( uv >> 18)         | 0xf0);
 910             *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 911             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 912             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 913             continue;
 914         }
 915     }
 916     *newlen = d - dstart;
 917     return d;
 918 }
 919
 920 /* Note: this one is slightly destructive of the source. */
 921
 922 U8*
 923 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 924 {
 925     U8* s = (U8*)p;
 926     U8* send = s + bytelen;
 927     while (s < send) {
 928         U8 tmp = s[0];
 929         s[0] = s[1];
 930         s[1] = tmp;
 931         s += 2;
 932     }
 933     return utf16_to_utf8(p, d, bytelen, newlen);
 934 }
 935
 936 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 937
 938 bool
 939 Perl_is_uni_alnum(pTHX_ UV c)
 940 {
 941     U8 tmpbuf[UTF8_MAXBYTES+1];
 942     uvchr_to_utf8(tmpbuf, c);
 943     return is_utf8_alnum(tmpbuf);
 944 }
 945
 946 bool
 947 Perl_is_uni_alnumc(pTHX_ UV c)
 948 {
 949     U8 tmpbuf[UTF8_MAXBYTES+1];
 950     uvchr_to_utf8(tmpbuf, c);
 951     return is_utf8_alnumc(tmpbuf);
 952 }
 953
 954 bool
 955 Perl_is_uni_idfirst(pTHX_ UV c)
 956 {
 957     U8 tmpbuf[UTF8_MAXBYTES+1];
 958     uvchr_to_utf8(tmpbuf, c);
 959     return is_utf8_idfirst(tmpbuf);
 960 }
 961
 962 bool
 963 Perl_is_uni_alpha(pTHX_ UV c)
 964 {
 965     U8 tmpbuf[UTF8_MAXBYTES+1];
 966     uvchr_to_utf8(tmpbuf, c);
 967     return is_utf8_alpha(tmpbuf);
 968 }
 969
 970 bool
 971 Perl_is_uni_ascii(pTHX_ UV c)
 972 {
 973     U8 tmpbuf[UTF8_MAXBYTES+1];
 974     uvchr_to_utf8(tmpbuf, c);
 975     return is_utf8_ascii(tmpbuf);
 976 }
 977
 978 bool
 979 Perl_is_uni_space(pTHX_ UV c)
 980 {
 981     U8 tmpbuf[UTF8_MAXBYTES+1];
 982     uvchr_to_utf8(tmpbuf, c);
 983     return is_utf8_space(tmpbuf);
 984 }
 985
 986 bool
 987 Perl_is_uni_digit(pTHX_ UV c)
 988 {
 989     U8 tmpbuf[UTF8_MAXBYTES+1];
 990     uvchr_to_utf8(tmpbuf, c);
 991     return is_utf8_digit(tmpbuf);
 992 }
 993
 994 bool
 995 Perl_is_uni_upper(pTHX_ UV c)
 996 {
 997     U8 tmpbuf[UTF8_MAXBYTES+1];
 998     uvchr_to_utf8(tmpbuf, c);
 999     return is_utf8_upper(tmpbuf);
1000 }
1001
1002 bool
1003 Perl_is_uni_lower(pTHX_ UV c)
1004 {
1005     U8 tmpbuf[UTF8_MAXBYTES+1];
1006     uvchr_to_utf8(tmpbuf, c);
1007     return is_utf8_lower(tmpbuf);
1008 }
1009
1010 bool
1011 Perl_is_uni_cntrl(pTHX_ UV c)
1012 {
1013     U8 tmpbuf[UTF8_MAXBYTES+1];
1014     uvchr_to_utf8(tmpbuf, c);
1015     return is_utf8_cntrl(tmpbuf);
1016 }
1017
1018 bool
1019 Perl_is_uni_graph(pTHX_ UV c)
1020 {
1021     U8 tmpbuf[UTF8_MAXBYTES+1];
1022     uvchr_to_utf8(tmpbuf, c);
1023     return is_utf8_graph(tmpbuf);
1024 }
1025
1026 bool
1027 Perl_is_uni_print(pTHX_ UV c)
1028 {
1029     U8 tmpbuf[UTF8_MAXBYTES+1];
1030     uvchr_to_utf8(tmpbuf, c);
1031     return is_utf8_print(tmpbuf);
1032 }
1033
1034 bool
1035 Perl_is_uni_punct(pTHX_ UV c)
1036 {
1037     U8 tmpbuf[UTF8_MAXBYTES+1];
1038     uvchr_to_utf8(tmpbuf, c);
1039     return is_utf8_punct(tmpbuf);
1040 }
1041
1042 bool
1043 Perl_is_uni_xdigit(pTHX_ UV c)
1044 {
1045     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1046     uvchr_to_utf8(tmpbuf, c);
1047     return is_utf8_xdigit(tmpbuf);
1048 }
1049
1050 UV
1051 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1052 {
1053     uvchr_to_utf8(p, c);
1054     return to_utf8_upper(p, p, lenp);
1055 }
1056
1057 UV
1058 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1059 {
1060     uvchr_to_utf8(p, c);
1061     return to_utf8_title(p, p, lenp);
1062 }
1063
1064 UV
1065 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1066 {
1067     uvchr_to_utf8(p, c);
1068     return to_utf8_lower(p, p, lenp);
1069 }
1070
1071 UV
1072 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1073 {
1074     uvchr_to_utf8(p, c);
1075     return to_utf8_fold(p, p, lenp);
1076 }
1077
1078 /* for now these all assume no locale info available for Unicode > 255 */
1079
1080 bool
1081 Perl_is_uni_alnum_lc(pTHX_ UV c)
1082 {
1083     return is_uni_alnum(c);     /* XXX no locale support yet */
1084 }
1085
1086 bool
1087 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1088 {
1089     return is_uni_alnumc(c);    /* XXX no locale support yet */
1090 }
1091
1092 bool
1093 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1094 {
1095     return is_uni_idfirst(c);   /* XXX no locale support yet */
1096 }
1097
1098 bool
1099 Perl_is_uni_alpha_lc(pTHX_ UV c)
1100 {
1101     return is_uni_alpha(c);     /* XXX no locale support yet */
1102 }
1103
1104 bool
1105 Perl_is_uni_ascii_lc(pTHX_ UV c)
1106 {
1107     return is_uni_ascii(c);     /* XXX no locale support yet */
1108 }
1109
1110 bool
1111 Perl_is_uni_space_lc(pTHX_ UV c)
1112 {
1113     return is_uni_space(c);     /* XXX no locale support yet */
1114 }
1115
1116 bool
1117 Perl_is_uni_digit_lc(pTHX_ UV c)
1118 {
1119     return is_uni_digit(c);     /* XXX no locale support yet */
1120 }
1121
1122 bool
1123 Perl_is_uni_upper_lc(pTHX_ UV c)
1124 {
1125     return is_uni_upper(c);     /* XXX no locale support yet */
1126 }
1127
1128 bool
1129 Perl_is_uni_lower_lc(pTHX_ UV c)
1130 {
1131     return is_uni_lower(c);     /* XXX no locale support yet */
1132 }
1133
1134 bool
1135 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1136 {
1137     return is_uni_cntrl(c);     /* XXX no locale support yet */
1138 }
1139
1140 bool
1141 Perl_is_uni_graph_lc(pTHX_ UV c)
1142 {
1143     return is_uni_graph(c);     /* XXX no locale support yet */
1144 }
1145
1146 bool
1147 Perl_is_uni_print_lc(pTHX_ UV c)
1148 {
1149     return is_uni_print(c);     /* XXX no locale support yet */
1150 }
1151
1152 bool
1153 Perl_is_uni_punct_lc(pTHX_ UV c)
1154 {
1155     return is_uni_punct(c);     /* XXX no locale support yet */
1156 }
1157
1158 bool
1159 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1160 {
1161     return is_uni_xdigit(c);    /* XXX no locale support yet */
1162 }
1163
1164 U32
1165 Perl_to_uni_upper_lc(pTHX_ U32 c)
1166 {
1167     /* XXX returns only the first character -- do not use XXX */
1168     /* XXX no locale support yet */
1169     STRLEN len;
1170     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1171     return (U32)to_uni_upper(c, tmpbuf, &len);
1172 }
1173
1174 U32
1175 Perl_to_uni_title_lc(pTHX_ U32 c)
1176 {
1177     /* XXX returns only the first character XXX -- do not use XXX */
1178     /* XXX no locale support yet */
1179     STRLEN len;
1180     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1181     return (U32)to_uni_title(c, tmpbuf, &len);
1182 }
1183
1184 U32
1185 Perl_to_uni_lower_lc(pTHX_ U32 c)
1186 {
1187     /* XXX returns only the first character -- do not use XXX */
1188     /* XXX no locale support yet */
1189     STRLEN len;
1190     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1191     return (U32)to_uni_lower(c, tmpbuf, &len);
1192 }
1193
1194 bool
1195 Perl_is_utf8_alnum(pTHX_ const U8 *p)
1196 {
1197     if (!is_utf8_char(p))
1198         return FALSE;
1199     if (!PL_utf8_alnum)
1200         /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1201          * descendant of isalnum(3), in other words, it doesn't
1202          * contain the '_'. --jhi */
1203         PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1204     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1205 /*    return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1206 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1207     if (!PL_utf8_alnum)
1208         PL_utf8_alnum = swash_init("utf8", "",
1209             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1210     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1211 #endif
1212 }
1213
1214 bool
1215 Perl_is_utf8_alnumc(pTHX_ const U8 *p)
1216 {
1217     if (!is_utf8_char(p))
1218         return FALSE;
1219     if (!PL_utf8_alnum)
1220         PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1221     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1222 /*    return is_utf8_alpha(p) || is_utf8_digit(p); */
1223 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1224     if (!PL_utf8_alnum)
1225         PL_utf8_alnum = swash_init("utf8", "",
1226             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1227     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1228 #endif
1229 }
1230
1231 bool
1232 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1233 {
1234     if (*p == '_')
1235         return TRUE;
1236     if (!is_utf8_char(p))
1237         return FALSE;
1238     if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
1239         PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
1240     return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
1241 }
1242
1243 bool
1244 Perl_is_utf8_idcont(pTHX_ const U8 *p)
1245 {
1246     if (*p == '_')
1247         return TRUE;
1248     if (!is_utf8_char(p))
1249         return FALSE;
1250     if (!PL_utf8_idcont)
1251         PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
1252     return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
1253 }
1254
1255 bool
1256 Perl_is_utf8_alpha(pTHX_ const U8 *p)
1257 {
1258     if (!is_utf8_char(p))
1259         return FALSE;
1260     if (!PL_utf8_alpha)
1261         PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1262     return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
1263 }
1264
1265 bool
1266 Perl_is_utf8_ascii(pTHX_ const U8 *p)
1267 {
1268     if (!is_utf8_char(p))
1269         return FALSE;
1270     if (!PL_utf8_ascii)
1271         PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1272     return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
1273 }
1274
1275 bool
1276 Perl_is_utf8_space(pTHX_ const U8 *p)
1277 {
1278     if (!is_utf8_char(p))
1279         return FALSE;
1280     if (!PL_utf8_space)
1281         PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1282     return swash_fetch(PL_utf8_space, p, TRUE) != 0;
1283 }
1284
1285 bool
1286 Perl_is_utf8_digit(pTHX_ const U8 *p)
1287 {
1288     if (!is_utf8_char(p))
1289         return FALSE;
1290     if (!PL_utf8_digit)
1291         PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1292     return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
1293 }
1294
1295 bool
1296 Perl_is_utf8_upper(pTHX_ const U8 *p)
1297 {
1298     if (!is_utf8_char(p))
1299         return FALSE;
1300     if (!PL_utf8_upper)
1301         PL_utf8_upper = swash_init("utf8", "IsUppercase", &PL_sv_undef, 0, 0);
1302     return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
1303 }
1304
1305 bool
1306 Perl_is_utf8_lower(pTHX_ const U8 *p)
1307 {
1308     if (!is_utf8_char(p))
1309         return FALSE;
1310     if (!PL_utf8_lower)
1311         PL_utf8_lower = swash_init("utf8", "IsLowercase", &PL_sv_undef, 0, 0);
1312     return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
1313 }
1314
1315 bool
1316 Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1317 {
1318     if (!is_utf8_char(p))
1319         return FALSE;
1320     if (!PL_utf8_cntrl)
1321         PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1322     return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
1323 }
1324
1325 bool
1326 Perl_is_utf8_graph(pTHX_ const U8 *p)
1327 {
1328     if (!is_utf8_char(p))
1329         return FALSE;
1330     if (!PL_utf8_graph)
1331         PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1332     return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
1333 }
1334
1335 bool
1336 Perl_is_utf8_print(pTHX_ const U8 *p)
1337 {
1338     if (!is_utf8_char(p))
1339         return FALSE;
1340     if (!PL_utf8_print)
1341         PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1342     return swash_fetch(PL_utf8_print, p, TRUE) != 0;
1343 }
1344
1345 bool
1346 Perl_is_utf8_punct(pTHX_ const U8 *p)
1347 {
1348     if (!is_utf8_char(p))
1349         return FALSE;
1350     if (!PL_utf8_punct)
1351         PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1352     return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
1353 }
1354
1355 bool
1356 Perl_is_utf8_xdigit(pTHX_ const U8 *p)
1357 {
1358     if (!is_utf8_char(p))
1359         return FALSE;
1360     if (!PL_utf8_xdigit)
1361         PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1362     return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
1363 }
1364
1365 bool
1366 Perl_is_utf8_mark(pTHX_ const U8 *p)
1367 {
1368     if (!is_utf8_char(p))
1369         return FALSE;
1370     if (!PL_utf8_mark)
1371         PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1372     return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
1373 }
1374
1375 /*
1376 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1377
1378 The "p" contains the pointer to the UTF-8 string encoding
1379 the character that is being converted.
1380
1381 The "ustrp" is a pointer to the character buffer to put the
1382 conversion result to.  The "lenp" is a pointer to the length
1383 of the result.
1384
1385 The "swashp" is a pointer to the swash to use.
1386
1387 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1388 and loaded by SWASHGET, using lib/utf8_heavy.pl.  The special (usually,
1389 but not always, a multicharacter mapping), is tried first.
1390
1391 The "special" is a string like "utf8::ToSpecLower", which means the
1392 hash %utf8::ToSpecLower.  The access to the hash is through
1393 Perl_to_utf8_case().
1394
1395 The "normal" is a string like "ToLower" which means the swash
1396 %utf8::ToLower.
1397
1398 =cut */
1399
1400 UV
1401 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, const char *normal, const char *special)
1402 {
1403     UV uv1;
1404     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1405     STRLEN len = 0;
1406
1407     const UV uv0 = utf8_to_uvchr(p, 0);
1408     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1409      * are necessary in EBCDIC, they are redundant no-ops
1410      * in ASCII-ish platforms, and hopefully optimized away. */
1411     uv1 = NATIVE_TO_UNI(uv0);
1412     uvuni_to_utf8(tmpbuf, uv1);
1413
1414     if (!*swashp) /* load on-demand */
1415          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1416
1417     /* The 0xDF is the only special casing Unicode code point below 0x100. */
1418     if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1419          /* It might be "special" (sometimes, but not always,
1420           * a multicharacter mapping) */
1421          HV *hv;
1422          SV **svp;
1423
1424          if ((hv  = get_hv(special, FALSE)) &&
1425              (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1426              (*svp)) {
1427               char *s;
1428
1429               s = SvPV(*svp, len);
1430               if (len == 1)
1431                    len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1432               else {
1433 #ifdef EBCDIC
1434                    /* If we have EBCDIC we need to remap the characters
1435                     * since any characters in the low 256 are Unicode
1436                     * code points, not EBCDIC. */
1437                    U8 *t = (U8*)s, *tend = t + len, *d;
1438
1439                    d = tmpbuf;
1440                    if (SvUTF8(*svp)) {
1441                         STRLEN tlen = 0;
1442
1443                         while (t < tend) {
1444                              UV c = utf8_to_uvchr(t, &tlen);
1445                              if (tlen > 0) {
1446                                   d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1447                                   t += tlen;
1448                              }
1449                              else
1450                                   break;
1451                         }
1452                    }
1453                    else {
1454                         while (t < tend) {
1455                              d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1456                              t++;
1457                         }
1458                    }
1459                    len = d - tmpbuf;
1460                    Copy(tmpbuf, ustrp, len, U8);
1461 #else
1462                    Copy(s, ustrp, len, U8);
1463 #endif
1464               }
1465          }
1466     }
1467
1468     if (!len && *swashp) {
1469          UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1470
1471          if (uv2) {
1472               /* It was "normal" (a single character mapping). */
1473               UV uv3 = UNI_TO_NATIVE(uv2);
1474
1475               len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1476          }
1477     }
1478
1479     if (!len) /* Neither: just copy. */
1480          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1481
1482     if (lenp)
1483          *lenp = len;
1484
1485     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1486 }
1487
1488 /*
1489 =for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
1490
1491 Convert the UTF-8 encoded character at p to its uppercase version and
1492 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1493 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1494 the uppercase version may be longer than the original character.
1495
1496 The first character of the uppercased version is returned
1497 (but note, as explained above, that there may be more.)
1498
1499 =cut */
1500
1501 UV
1502 Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1503 {
1504     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1505                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1506 }
1507
1508 /*
1509 =for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
1510
1511 Convert the UTF-8 encoded character at p to its titlecase version and
1512 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1513 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1514 titlecase version may be longer than the original character.
1515
1516 The first character of the titlecased version is returned
1517 (but note, as explained above, that there may be more.)
1518
1519 =cut */
1520
1521 UV
1522 Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1523 {
1524     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1525                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1526 }
1527
1528 /*
1529 =for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
1530
1531 Convert the UTF-8 encoded character at p to its lowercase version and
1532 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1533 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1534 lowercase version may be longer than the original character.
1535
1536 The first character of the lowercased version is returned
1537 (but note, as explained above, that there may be more.)
1538
1539 =cut */
1540
1541 UV
1542 Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1543 {
1544     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1545                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1546 }
1547
1548 /*
1549 =for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
1550
1551 Convert the UTF-8 encoded character at p to its foldcase version and
1552 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1553 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1554 foldcase version may be longer than the original character (up to
1555 three characters).
1556
1557 The first character of the foldcased version is returned
1558 (but note, as explained above, that there may be more.)
1559
1560 =cut */
1561
1562 UV
1563 Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1564 {
1565     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1566                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1567 }
1568
1569 /* a "swash" is a swatch hash */
1570
1571 SV*
1572 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
1573 {
1574     dVAR;
1575     SV* retval;
1576     SV* tokenbufsv = sv_newmortal();
1577     dSP;
1578     const size_t pkg_len = strlen(pkg);
1579     const size_t name_len = strlen(name);
1580     HV *stash = gv_stashpvn(pkg, pkg_len, FALSE);
1581     SV* errsv_save;
1582
1583     PUSHSTACKi(PERLSI_MAGIC);
1584     ENTER;
1585     SAVEI32(PL_hints);
1586     PL_hints = 0;
1587     save_re_context();
1588     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1589         ENTER;
1590         errsv_save = newSVsv(ERRSV);
1591         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1592                          Nullsv);
1593         if (!SvTRUE(ERRSV))
1594             sv_setsv(ERRSV, errsv_save);
1595         SvREFCNT_dec(errsv_save);
1596         LEAVE;
1597     }
1598     SPAGAIN;
1599     PUSHMARK(SP);
1600     EXTEND(SP,5);
1601     PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1602     PUSHs(sv_2mortal(newSVpvn(name, name_len)));
1603     PUSHs(listsv);
1604     PUSHs(sv_2mortal(newSViv(minbits)));
1605     PUSHs(sv_2mortal(newSViv(none)));
1606     PUTBACK;
1607     if (IN_PERL_COMPILETIME) {
1608         /* XXX ought to be handled by lex_start */
1609         SAVEI32(PL_in_my);
1610         PL_in_my = 0;
1611         sv_setpv(tokenbufsv, PL_tokenbuf);
1612     }
1613     errsv_save = newSVsv(ERRSV);
1614     if (call_method("SWASHNEW", G_SCALAR))
1615         retval = newSVsv(*PL_stack_sp--);
1616     else
1617         retval = &PL_sv_undef;
1618     if (!SvTRUE(ERRSV))
1619         sv_setsv(ERRSV, errsv_save);
1620     SvREFCNT_dec(errsv_save);
1621     LEAVE;
1622     POPSTACK;
1623     if (IN_PERL_COMPILETIME) {
1624         STRLEN len;
1625         const char* pv = SvPV(tokenbufsv, len);
1626
1627         Copy(pv, PL_tokenbuf, len+1, char);
1628         PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1629     }
1630     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1631         if (SvPOK(retval))
1632             Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1633                        retval);
1634         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1635     }
1636     return retval;
1637 }
1638
1639
1640 /* This API is wrong for special case conversions since we may need to
1641  * return several Unicode characters for a single Unicode character
1642  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1643  * the lower-level routine, and it is similarly broken for returning
1644  * multiple values.  --jhi */
1645 UV
1646 Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8)
1647 {
1648     dVAR;
1649     HV* hv = (HV*)SvRV(sv);
1650     U32 klen;
1651     U32 off;
1652     STRLEN slen;
1653     STRLEN needents;
1654     U8 *tmps = NULL;
1655     U32 bit;
1656     SV *retval;
1657     U8 tmputf8[2];
1658     UV c = NATIVE_TO_ASCII(*ptr);
1659
1660     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1661         tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1662         tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1663         ptr = tmputf8;
1664     }
1665     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1666      * then the "swatch" is a vec() for al the chars which start
1667      * with 0xAA..0xYY
1668      * So the key in the hash (klen) is length of encoded char -1
1669      */
1670     klen = UTF8SKIP(ptr) - 1;
1671     off  = ptr[klen];
1672
1673     if (klen == 0)
1674      {
1675       /* If char in invariant then swatch is for all the invariant chars
1676        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1677        */
1678       needents = UTF_CONTINUATION_MARK;
1679       off      = NATIVE_TO_UTF(ptr[klen]);
1680      }
1681     else
1682      {
1683       /* If char is encoded then swatch is for the prefix */
1684       needents = (1 << UTF_ACCUMULATION_SHIFT);
1685       off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1686      }
1687
1688     /*
1689      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1690      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1691      * it's nothing to sniff at.)  Pity we usually come through at least
1692      * two function calls to get here...
1693      *
1694      * NB: this code assumes that swatches are never modified, once generated!
1695      */
1696
1697     if (hv   == PL_last_swash_hv &&
1698         klen == PL_last_swash_klen &&
1699         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1700     {
1701         tmps = PL_last_swash_tmps;
1702         slen = PL_last_swash_slen;
1703     }
1704     else {
1705         /* Try our second-level swatch cache, kept in a hash. */
1706         SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1707
1708         /* If not cached, generate it via utf8::SWASHGET */
1709         if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1710             dSP;
1711             /* We use utf8n_to_uvuni() as we want an index into
1712                Unicode tables, not a native character number.
1713              */
1714             UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1715                                            ckWARN(WARN_UTF8) ?
1716                                            0 : UTF8_ALLOW_ANY);
1717             SV *errsv_save;
1718             ENTER;
1719             SAVETMPS;
1720             save_re_context();
1721             PUSHSTACKi(PERLSI_MAGIC);
1722             PUSHMARK(SP);
1723             EXTEND(SP,3);
1724             PUSHs((SV*)sv);
1725             /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1726             PUSHs(sv_2mortal(newSViv((klen) ?
1727                                      (code_point & ~(needents - 1)) : 0)));
1728             PUSHs(sv_2mortal(newSViv(needents)));
1729             PUTBACK;
1730             errsv_save = newSVsv(ERRSV);
1731             if (call_method("SWASHGET", G_SCALAR))
1732                 retval = newSVsv(*PL_stack_sp--);
1733             else
1734                 retval = &PL_sv_undef;
1735             if (!SvTRUE(ERRSV))
1736                 sv_setsv(ERRSV, errsv_save);
1737             SvREFCNT_dec(errsv_save);
1738             POPSTACK;
1739             FREETMPS;
1740             LEAVE;
1741             if (IN_PERL_COMPILETIME)
1742                 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1743
1744             svp = hv_store(hv, (const char *)ptr, klen, retval, 0);
1745
1746             if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1747                 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1748         }
1749
1750         PL_last_swash_hv = hv;
1751         PL_last_swash_klen = klen;
1752         PL_last_swash_tmps = tmps;
1753         PL_last_swash_slen = slen;
1754         if (klen)
1755             Copy(ptr, PL_last_swash_key, klen, U8);
1756     }
1757
1758     switch ((int)((slen << 3) / needents)) {
1759     case 1:
1760         bit = 1 << (off & 7);
1761         off >>= 3;
1762         return (tmps[off] & bit) != 0;
1763     case 8:
1764         return tmps[off];
1765     case 16:
1766         off <<= 1;
1767         return (tmps[off] << 8) + tmps[off + 1] ;
1768     case 32:
1769         off <<= 2;
1770         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1771     }
1772     Perl_croak(aTHX_ "panic: swash_fetch");
1773     return 0;
1774 }
1775
1776
1777 /*
1778 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1779
1780 Adds the UTF-8 representation of the Native codepoint C<uv> to the end
1781 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
1782 bytes available. The return value is the pointer to the byte after the
1783 end of the new character. In other words,
1784
1785     d = uvchr_to_utf8(d, uv);
1786
1787 is the recommended wide native character-aware way of saying
1788
1789     *(d++) = uv;
1790
1791 =cut
1792 */
1793
1794 /* On ASCII machines this is normally a macro but we want a
1795    real function in case XS code wants it
1796 */
1797 #undef Perl_uvchr_to_utf8
1798 U8 *
1799 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1800 {
1801     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
1802 }
1803
1804 U8 *
1805 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1806 {
1807     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1808 }
1809
1810 /*
1811 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1812
1813 Returns the native character value of the first character in the string C<s>
1814 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
1815 length, in bytes, of that character.
1816
1817 Allows length and flags to be passed to low level routine.
1818
1819 =cut
1820 */
1821 /* On ASCII machines this is normally a macro but we want
1822    a real function in case XS code wants it
1823 */
1824 #undef Perl_utf8n_to_uvchr
1825 UV
1826 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1827 {
1828     UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1829     return UNI_TO_NATIVE(uv);
1830 }
1831
1832 /*
1833 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1834
1835 Build to the scalar dsv a displayable version of the string spv,
1836 length len, the displayable version being at most pvlim bytes long
1837 (if longer, the rest is truncated and "..." will be appended).
1838
1839 The flags argument can have UNI_DISPLAY_ISPRINT set to display
1840 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
1841 to display the \\[nrfta\\] as the backslashed versions (like '\n')
1842 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1843 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1844 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1845
1846 The pointer to the PV of the dsv is returned.
1847
1848 =cut */
1849 char *
1850 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1851 {
1852     int truncated = 0;
1853     const char *s, *e;
1854
1855     sv_setpvn(dsv, "", 0);
1856     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1857          UV u;
1858           /* This serves double duty as a flag and a character to print after
1859              a \ when flags & UNI_DISPLAY_BACKSLASH is true.
1860           */
1861          char ok = 0;
1862
1863          if (pvlim && SvCUR(dsv) >= pvlim) {
1864               truncated++;
1865               break;
1866          }
1867          u = utf8_to_uvchr((U8*)s, 0);
1868          if (u < 256) {
1869              unsigned char c = (unsigned char)u & 0xFF;
1870              if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1871                  switch (c) {
1872                  case '\n':
1873                      ok = 'n'; break;
1874                  case '\r':
1875                      ok = 'r'; break;
1876                  case '\t':
1877                      ok = 't'; break;
1878                  case '\f':
1879                      ok = 'f'; break;
1880                  case '\a':
1881                      ok = 'a'; break;
1882                  case '\\':
1883                      ok = '\\'; break;
1884                  default: break;
1885                  }
1886                  if (ok) {
1887                      Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
1888                  }
1889              }
1890              /* isPRINT() is the locale-blind version. */
1891              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
1892                  Perl_sv_catpvf(aTHX_ dsv, "%c", c);
1893                  ok = 1;
1894              }
1895          }
1896          if (!ok)
1897              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1898     }
1899     if (truncated)
1900          sv_catpvn(dsv, "...", 3);
1901
1902     return SvPVX(dsv);
1903 }
1904
1905 /*
1906 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1907
1908 Build to the scalar dsv a displayable version of the scalar sv,
1909 the displayable version being at most pvlim bytes long
1910 (if longer, the rest is truncated and "..." will be appended).
1911
1912 The flags argument is as in pv_uni_display().
1913
1914 The pointer to the PV of the dsv is returned.
1915
1916 =cut */
1917 char *
1918 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1919 {
1920      return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1921                                 pvlim, flags);
1922 }
1923
1924 /*
1925 =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
1926
1927 Return true if the strings s1 and s2 differ case-insensitively, false
1928 if not (if they are equal case-insensitively).  If u1 is true, the
1929 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
1930 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
1931 are false, the respective string is assumed to be in native 8-bit
1932 encoding.
1933
1934 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1935 in there (they will point at the beginning of the I<next> character).
1936 If the pointers behind pe1 or pe2 are non-NULL, they are the end
1937 pointers beyond which scanning will not continue under any
1938 circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
1939 s2+l2 will be used as goal end pointers that will also stop the scan,
1940 and which qualify towards defining a successful match: all the scans
1941 that define an explicit length must reach their goal pointers for
1942 a match to succeed).
1943
1944 For case-insensitiveness, the "casefolding" of Unicode is used
1945 instead of upper/lowercasing both the characters, see
1946 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1947
1948 =cut */
1949 I32
1950 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
1951 {
1952      register const U8 *p1  = (const U8*)s1;
1953      register const U8 *p2  = (const U8*)s2;
1954      register const U8 *f1 = 0, *f2 = 0;
1955      register U8 *e1 = 0, *q1 = 0;
1956      register U8 *e2 = 0, *q2 = 0;
1957      STRLEN n1 = 0, n2 = 0;
1958      U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
1959      U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
1960      U8 natbuf[1+1];
1961      STRLEN foldlen1, foldlen2;
1962      bool match;
1963
1964      if (pe1)
1965           e1 = *(U8**)pe1;
1966      if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
1967           f1 = (const U8*)s1 + l1;
1968      if (pe2)
1969           e2 = *(U8**)pe2;
1970      if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
1971           f2 = (const U8*)s2 + l2;
1972
1973      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1974           return 1; /* mismatch; possible infinite loop or false positive */
1975
1976      if (!u1 || !u2)
1977           natbuf[1] = 0; /* Need to terminate the buffer. */
1978
1979      while ((e1 == 0 || p1 < e1) &&
1980             (f1 == 0 || p1 < f1) &&
1981             (e2 == 0 || p2 < e2) &&
1982             (f2 == 0 || p2 < f2)) {
1983           if (n1 == 0) {
1984                if (u1)
1985                     to_utf8_fold(p1, foldbuf1, &foldlen1);
1986                else {
1987                     natbuf[0] = *p1;
1988                     to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1989                }
1990                q1 = foldbuf1;
1991                n1 = foldlen1;
1992           }
1993           if (n2 == 0) {
1994                if (u2)
1995                     to_utf8_fold(p2, foldbuf2, &foldlen2);
1996                else {
1997                     natbuf[0] = *p2;
1998                     to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1999                }
2000                q2 = foldbuf2;
2001                n2 = foldlen2;
2002           }
2003           while (n1 && n2) {
2004                if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2005                    (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2006                     memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2007                    return 1; /* mismatch */
2008                n1 -= UTF8SKIP(q1);
2009                q1 += UTF8SKIP(q1);
2010                n2 -= UTF8SKIP(q2);
2011                q2 += UTF8SKIP(q2);
2012           }
2013           if (n1 == 0)
2014                p1 += u1 ? UTF8SKIP(p1) : 1;
2015           if (n2 == 0)
2016                p2 += u2 ? UTF8SKIP(p2) : 1;
2017
2018      }
2019
2020      /* A match is defined by all the scans that specified
2021       * an explicit length reaching their final goals. */
2022      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2023
2024      if (match) {
2025           if (pe1)
2026                *pe1 = (char*)p1;
2027           if (pe2)
2028                *pe2 = (char*)p2;
2029      }
2030
2031      return match ? 0 : 1; /* 0 match, 1 mismatch */
2032 }
2033
2034 /*
2035  * Local variables:
2036  * c-indentation-style: bsd
2037  * c-basic-offset: 4
2038  * indent-tabs-mode: t
2039  * End:
2040  *
2041  * vim: shiftwidth=4:
2042 */