utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007
   4  *    by Larry Wall and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  13  * heard of that we don't want to see any closer; and that's the one place
  14  * we're trying to get to!  And that's just where we can't get, nohow.'
  15  *
  16  * 'Well do I understand your speech,' he answered in the same language;
  17  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  18  * as is the custom in the West, if you wish to be answered?'
  19  *
  20  * ...the travellers perceived that the floor was paved with stones of many
  21  * hues; branching runes and strange devices intertwined beneath their feet.
  22  */
  23
  24 #include "EXTERN.h"
  25 #define PERL_IN_UTF8_C
  26 #include "perl.h"
  27
  28 #ifndef EBCDIC
  29 /* Separate prototypes needed because in ASCII systems these
  30  * usually macros but they still are compiled as code, too. */
  31 PERL_CALLCONV UV        Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
  32 PERL_CALLCONV U8*       Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
  33 #endif
  34
  35 static const char unees[] =
  36     "Malformed UTF-8 character (unexpected end of string)";
  37
  38 /*
  39 =head1 Unicode Support
  40
  41 This file contains various utility functions for manipulating UTF8-encoded
  42 strings. For the uninitiated, this is a method of representing arbitrary
  43 Unicode characters as a variable number of bytes, in such a way that
  44 characters in the ASCII range are unmodified, and a zero byte never appears
  45 within non-zero characters.
  46
  47 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
  48
  49 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
  50 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
  51 bytes available. The return value is the pointer to the byte after the
  52 end of the new character. In other words,
  53
  54     d = uvuni_to_utf8_flags(d, uv, flags);
  55
  56 or, in most cases,
  57
  58     d = uvuni_to_utf8(d, uv);
  59
  60 (which is equivalent to)
  61
  62     d = uvuni_to_utf8_flags(d, uv, 0);
  63
  64 is the recommended Unicode-aware way of saying
  65
  66     *(d++) = uv;
  67
  68 =cut
  69 */
  70
  71 U8 *
  72 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  73 {
  74     if (ckWARN(WARN_UTF8)) {
  75          if (UNICODE_IS_SURROGATE(uv) &&
  76              !(flags & UNICODE_ALLOW_SURROGATE))
  77               Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
  78          else if (
  79                   ((uv >= 0xFDD0 && uv <= 0xFDEF &&
  80                     !(flags & UNICODE_ALLOW_FDD0))
  81                    ||
  82                    ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
  83                     !(flags & UNICODE_ALLOW_FFFF))) &&
  84                   /* UNICODE_ALLOW_SUPER includes
  85                    * FFFEs and FFFFs beyond 0x10FFFF. */
  86                   ((uv <= PERL_UNICODE_MAX) ||
  87                    !(flags & UNICODE_ALLOW_SUPER))
  88                   )
  89               Perl_warner(aTHX_ packWARN(WARN_UTF8),
  90                          "Unicode character 0x%04"UVxf" is illegal", uv);
  91     }
  92     if (UNI_IS_INVARIANT(uv)) {
  93         *d++ = (U8)UTF_TO_NATIVE(uv);
  94         return d;
  95     }
  96 #if defined(EBCDIC)
  97     else {
  98         STRLEN len  = UNISKIP(uv);
  99         U8 *p = d+len-1;
 100         while (p > d) {
 101             *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
 102             uv >>= UTF_ACCUMULATION_SHIFT;
 103         }
 104         *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
 105         return d+len;
 106     }
 107 #else /* Non loop style */
 108     if (uv < 0x800) {
 109         *d++ = (U8)(( uv >>  6)         | 0xc0);
 110         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 111         return d;
 112     }
 113     if (uv < 0x10000) {
 114         *d++ = (U8)(( uv >> 12)         | 0xe0);
 115         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 116         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 117         return d;
 118     }
 119     if (uv < 0x200000) {
 120         *d++ = (U8)(( uv >> 18)         | 0xf0);
 121         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 122         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 123         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 124         return d;
 125     }
 126     if (uv < 0x4000000) {
 127         *d++ = (U8)(( uv >> 24)         | 0xf8);
 128         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 129         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 130         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 131         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 132         return d;
 133     }
 134     if (uv < 0x80000000) {
 135         *d++ = (U8)(( uv >> 30)         | 0xfc);
 136         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 137         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 138         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 139         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 140         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 141         return d;
 142     }
 143 #ifdef HAS_QUAD
 144     if (uv < UTF8_QUAD_MAX)
 145 #endif
 146     {
 147         *d++ =                            0xfe; /* Can't match U+FEFF! */
 148         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 149         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 150         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 151         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 152         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 153         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 154         return d;
 155     }
 156 #ifdef HAS_QUAD
 157     {
 158         *d++ =                            0xff;         /* Can't match U+FFFE! */
 159         *d++ =                            0x80;         /* 6 Reserved bits */
 160         *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);        /* 2 Reserved bits */
 161         *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
 162         *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
 163         *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
 164         *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
 165         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 166         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 167         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 168         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 169         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 170         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 171         return d;
 172     }
 173 #endif
 174 #endif /* Loop style */
 175 }
 176
 177 /*
 178
 179 Tests if some arbitrary number of bytes begins in a valid UTF-8
 180 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 181 UTF-8 character.  The actual number of bytes in the UTF-8 character
 182 will be returned if it is valid, otherwise 0.
 183
 184 This is the "slow" version as opposed to the "fast" version which is
 185 the "unrolled" IS_UTF8_CHAR().  E.g. for t/uni/class.t the speed
 186 difference is a factor of 2 to 3.  For lengths (UTF8SKIP(s)) of four
 187 or less you should use the IS_UTF8_CHAR(), for lengths of five or more
 188 you should use the _slow().  In practice this means that the _slow()
 189 will be used very rarely, since the maximum Unicode code point (as of
 190 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes.  Only
 191 the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
 192 five bytes or more.
 193
 194 =cut */
 195 STATIC STRLEN
 196 S_is_utf8_char_slow(const U8 *s, const STRLEN len)
 197 {
 198     U8 u = *s;
 199     STRLEN slen;
 200     UV uv, ouv;
 201
 202     if (UTF8_IS_INVARIANT(u))
 203         return 1;
 204
 205     if (!UTF8_IS_START(u))
 206         return 0;
 207
 208     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 209         return 0;
 210
 211     slen = len - 1;
 212     s++;
 213 #ifdef EBCDIC
 214     u = NATIVE_TO_UTF(u);
 215 #endif
 216     u &= UTF_START_MASK(len);
 217     uv  = u;
 218     ouv = uv;
 219     while (slen--) {
 220         if (!UTF8_IS_CONTINUATION(*s))
 221             return 0;
 222         uv = UTF8_ACCUMULATE(uv, *s);
 223         if (uv < ouv)
 224             return 0;
 225         ouv = uv;
 226         s++;
 227     }
 228
 229     if ((STRLEN)UNISKIP(uv) < len)
 230         return 0;
 231
 232     return len;
 233 }
 234
 235 /*
 236 =for apidoc A|STRLEN|is_utf8_char|const U8 *s
 237
 238 Tests if some arbitrary number of bytes begins in a valid UTF-8
 239 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 240 UTF-8 character.  The actual number of bytes in the UTF-8 character
 241 will be returned if it is valid, otherwise 0.
 242
 243 =cut */
 244 STRLEN
 245 Perl_is_utf8_char(pTHX_ const U8 *s)
 246 {
 247     const STRLEN len = UTF8SKIP(s);
 248     PERL_UNUSED_CONTEXT;
 249 #ifdef IS_UTF8_CHAR
 250     if (IS_UTF8_CHAR_FAST(len))
 251         return IS_UTF8_CHAR(s, len) ? len : 0;
 252 #endif /* #ifdef IS_UTF8_CHAR */
 253     return is_utf8_char_slow(s, len);
 254 }
 255
 256 /*
 257 =for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
 258
 259 Returns true if first C<len> bytes of the given string form a valid
 260 UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
 261 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
 262 because a valid ASCII string is a valid UTF-8 string.
 263
 264 See also is_utf8_string_loclen() and is_utf8_string_loc().
 265
 266 =cut
 267 */
 268
 269 bool
 270 Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
 271 {
 272     const U8* const send = s + (len ? len : strlen((const char *)s));
 273     const U8* x = s;
 274
 275     PERL_UNUSED_CONTEXT;
 276
 277     while (x < send) {
 278         STRLEN c;
 279          /* Inline the easy bits of is_utf8_char() here for speed... */
 280          if (UTF8_IS_INVARIANT(*x))
 281               c = 1;
 282          else if (!UTF8_IS_START(*x))
 283              goto out;
 284          else {
 285               /* ... and call is_utf8_char() only if really needed. */
 286 #ifdef IS_UTF8_CHAR
 287              c = UTF8SKIP(x);
 288              if (IS_UTF8_CHAR_FAST(c)) {
 289                  if (!IS_UTF8_CHAR(x, c))
 290                      c = 0;
 291              }
 292              else
 293                 c = is_utf8_char_slow(x, c);
 294 #else
 295              c = is_utf8_char(x);
 296 #endif /* #ifdef IS_UTF8_CHAR */
 297               if (!c)
 298                   goto out;
 299          }
 300         x += c;
 301     }
 302
 303  out:
 304     if (x != send)
 305         return FALSE;
 306
 307     return TRUE;
 308 }
 309
 310 /*
 311 Implemented as a macro in utf8.h
 312
 313 =for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep
 314
 315 Like is_utf8_string() but stores the location of the failure (in the
 316 case of "utf8ness failure") or the location s+len (in the case of
 317 "utf8ness success") in the C<ep>.
 318
 319 See also is_utf8_string_loclen() and is_utf8_string().
 320
 321 =for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
 322
 323 Like is_utf8_string() but stores the location of the failure (in the
 324 case of "utf8ness failure") or the location s+len (in the case of
 325 "utf8ness success") in the C<ep>, and the number of UTF-8
 326 encoded characters in the C<el>.
 327
 328 See also is_utf8_string_loc() and is_utf8_string().
 329
 330 =cut
 331 */
 332
 333 bool
 334 Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
 335 {
 336     const U8* const send = s + (len ? len : strlen((const char *)s));
 337     const U8* x = s;
 338     STRLEN c;
 339     STRLEN outlen = 0;
 340     PERL_UNUSED_CONTEXT;
 341
 342     while (x < send) {
 343          /* Inline the easy bits of is_utf8_char() here for speed... */
 344          if (UTF8_IS_INVARIANT(*x))
 345              c = 1;
 346          else if (!UTF8_IS_START(*x))
 347              goto out;
 348          else {
 349              /* ... and call is_utf8_char() only if really needed. */
 350 #ifdef IS_UTF8_CHAR
 351              c = UTF8SKIP(x);
 352              if (IS_UTF8_CHAR_FAST(c)) {
 353                  if (!IS_UTF8_CHAR(x, c))
 354                      c = 0;
 355              } else
 356                  c = is_utf8_char_slow(x, c);
 357 #else
 358              c = is_utf8_char(x);
 359 #endif /* #ifdef IS_UTF8_CHAR */
 360              if (!c)
 361                  goto out;
 362          }
 363          x += c;
 364          outlen++;
 365     }
 366
 367  out:
 368     if (el)
 369         *el = outlen;
 370
 371     if (ep)
 372         *ep = x;
 373     return (x == send);
 374 }
 375
 376 /*
 377
 378 =for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 379
 380 Bottom level UTF-8 decode routine.
 381 Returns the unicode code point value of the first character in the string C<s>
 382 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
 383 C<retlen> will be set to the length, in bytes, of that character.
 384
 385 If C<s> does not point to a well-formed UTF-8 character, the behaviour
 386 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 387 it is assumed that the caller will raise a warning, and this function
 388 will silently just set C<retlen> to C<-1> and return zero.  If the
 389 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 390 malformations will be given, C<retlen> will be set to the expected
 391 length of the UTF-8 character in bytes, and zero will be returned.
 392
 393 The C<flags> can also contain various flags to allow deviations from
 394 the strict UTF-8 encoding (see F<utf8.h>).
 395
 396 Most code should use utf8_to_uvchr() rather than call this directly.
 397
 398 =cut
 399 */
 400
 401 UV
 402 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 403 {
 404     dVAR;
 405     const U8 * const s0 = s;
 406     UV uv = *s, ouv = 0;
 407     STRLEN len = 1;
 408     const bool dowarn = ckWARN_d(WARN_UTF8);
 409     const UV startbyte = *s;
 410     STRLEN expectlen = 0;
 411     U32 warning = 0;
 412
 413 /* This list is a superset of the UTF8_ALLOW_XXX. */
 414
 415 #define UTF8_WARN_EMPTY                          1
 416 #define UTF8_WARN_CONTINUATION                   2
 417 #define UTF8_WARN_NON_CONTINUATION               3
 418 #define UTF8_WARN_FE_FF                          4
 419 #define UTF8_WARN_SHORT                          5
 420 #define UTF8_WARN_OVERFLOW                       6
 421 #define UTF8_WARN_SURROGATE                      7
 422 #define UTF8_WARN_LONG                           8
 423 #define UTF8_WARN_FFFF                           9 /* Also FFFE. */
 424
 425     if (curlen == 0 &&
 426         !(flags & UTF8_ALLOW_EMPTY)) {
 427         warning = UTF8_WARN_EMPTY;
 428         goto malformed;
 429     }
 430
 431     if (UTF8_IS_INVARIANT(uv)) {
 432         if (retlen)
 433             *retlen = 1;
 434         return (UV) (NATIVE_TO_UTF(*s));
 435     }
 436
 437     if (UTF8_IS_CONTINUATION(uv) &&
 438         !(flags & UTF8_ALLOW_CONTINUATION)) {
 439         warning = UTF8_WARN_CONTINUATION;
 440         goto malformed;
 441     }
 442
 443     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 444         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 445         warning = UTF8_WARN_NON_CONTINUATION;
 446         goto malformed;
 447     }
 448
 449 #ifdef EBCDIC
 450     uv = NATIVE_TO_UTF(uv);
 451 #else
 452     if ((uv == 0xfe || uv == 0xff) &&
 453         !(flags & UTF8_ALLOW_FE_FF)) {
 454         warning = UTF8_WARN_FE_FF;
 455         goto malformed;
 456     }
 457 #endif
 458
 459     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 460     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 461     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 462     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 463 #ifdef EBCDIC
 464     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 465     else                        { len =  7; uv &= 0x01; }
 466 #else
 467     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 468     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 469     else                        { len = 13; uv = 0; } /* whoa! */
 470 #endif
 471
 472     if (retlen)
 473         *retlen = len;
 474
 475     expectlen = len;
 476
 477     if ((curlen < expectlen) &&
 478         !(flags & UTF8_ALLOW_SHORT)) {
 479         warning = UTF8_WARN_SHORT;
 480         goto malformed;
 481     }
 482
 483     len--;
 484     s++;
 485     ouv = uv;
 486
 487     while (len--) {
 488         if (!UTF8_IS_CONTINUATION(*s) &&
 489             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 490             s--;
 491             warning = UTF8_WARN_NON_CONTINUATION;
 492             goto malformed;
 493         }
 494         else
 495             uv = UTF8_ACCUMULATE(uv, *s);
 496         if (!(uv > ouv)) {
 497             /* These cannot be allowed. */
 498             if (uv == ouv) {
 499                 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
 500                     warning = UTF8_WARN_LONG;
 501                     goto malformed;
 502                 }
 503             }
 504             else { /* uv < ouv */
 505                 /* This cannot be allowed. */
 506                 warning = UTF8_WARN_OVERFLOW;
 507                 goto malformed;
 508             }
 509         }
 510         s++;
 511         ouv = uv;
 512     }
 513
 514     if (UNICODE_IS_SURROGATE(uv) &&
 515         !(flags & UTF8_ALLOW_SURROGATE)) {
 516         warning = UTF8_WARN_SURROGATE;
 517         goto malformed;
 518     } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
 519                !(flags & UTF8_ALLOW_LONG)) {
 520         warning = UTF8_WARN_LONG;
 521         goto malformed;
 522     } else if (UNICODE_IS_ILLEGAL(uv) &&
 523                !(flags & UTF8_ALLOW_FFFF)) {
 524         warning = UTF8_WARN_FFFF;
 525         goto malformed;
 526     }
 527
 528     return uv;
 529
 530 malformed:
 531
 532     if (flags & UTF8_CHECK_ONLY) {
 533         if (retlen)
 534             *retlen = ((STRLEN) -1);
 535         return 0;
 536     }
 537
 538     if (dowarn) {
 539         SV* const sv = sv_2mortal(newSVpvs("Malformed UTF-8 character "));
 540
 541         switch (warning) {
 542         case 0: /* Intentionally empty. */ break;
 543         case UTF8_WARN_EMPTY:
 544             sv_catpvs(sv, "(empty string)");
 545             break;
 546         case UTF8_WARN_CONTINUATION:
 547             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 548             break;
 549         case UTF8_WARN_NON_CONTINUATION:
 550             if (s == s0)
 551                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 552                            (UV)s[1], startbyte);
 553             else {
 554                 const int len = (int)(s-s0);
 555                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 556                            (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
 557             }
 558
 559             break;
 560         case UTF8_WARN_FE_FF:
 561             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 562             break;
 563         case UTF8_WARN_SHORT:
 564             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 565                            (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
 566             expectlen = curlen;         /* distance for caller to skip */
 567             break;
 568         case UTF8_WARN_OVERFLOW:
 569             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 570                            ouv, *s, startbyte);
 571             break;
 572         case UTF8_WARN_SURROGATE:
 573             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 574             break;
 575         case UTF8_WARN_LONG:
 576             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 577                            (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 578             break;
 579         case UTF8_WARN_FFFF:
 580             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 581             break;
 582         default:
 583             sv_catpvs(sv, "(unknown reason)");
 584             break;
 585         }
 586
 587         if (warning) {
 588             const char * const s = SvPVX_const(sv);
 589
 590             if (PL_op)
 591                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 592                             "%s in %s", s,  OP_DESC(PL_op));
 593             else
 594                 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
 595         }
 596     }
 597
 598     if (retlen)
 599         *retlen = expectlen ? expectlen : len;
 600
 601     return 0;
 602 }
 603
 604 /*
 605 =for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
 606
 607 Returns the native character value of the first character in the string C<s>
 608 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 609 length, in bytes, of that character.
 610
 611 If C<s> does not point to a well-formed UTF-8 character, zero is
 612 returned and retlen is set, if possible, to -1.
 613
 614 =cut
 615 */
 616
 617 UV
 618 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
 619 {
 620     return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
 621                           ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 622 }
 623
 624 /*
 625 =for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
 626
 627 Returns the Unicode code point of the first character in the string C<s>
 628 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 629 length, in bytes, of that character.
 630
 631 This function should only be used when returned UV is considered
 632 an index into the Unicode semantic tables (e.g. swashes).
 633
 634 If C<s> does not point to a well-formed UTF-8 character, zero is
 635 returned and retlen is set, if possible, to -1.
 636
 637 =cut
 638 */
 639
 640 UV
 641 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
 642 {
 643     /* Call the low level routine asking for checks */
 644     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
 645                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 646 }
 647
 648 /*
 649 =for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
 650
 651 Return the length of the UTF-8 char encoded string C<s> in characters.
 652 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 653 up past C<e>, croaks.
 654
 655 =cut
 656 */
 657
 658 STRLEN
 659 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
 660 {
 661     dVAR;
 662     STRLEN len = 0;
 663     U8 t = 0;
 664
 665     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 666      * the bitops (especially ~) can create illegal UTF-8.
 667      * In other words: in Perl UTF-8 is not just for Unicode. */
 668
 669     if (e < s)
 670         goto warn_and_return;
 671     while (s < e) {
 672         t = UTF8SKIP(s);
 673         if (e - s < t) {
 674             warn_and_return:
 675             if (ckWARN_d(WARN_UTF8)) {
 676                 if (PL_op)
 677                     Perl_warner(aTHX_ packWARN(WARN_UTF8),
 678                             "%s in %s", unees, OP_DESC(PL_op));
 679                 else
 680                     Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 681             }
 682             return len;
 683         }
 684         s += t;
 685         len++;
 686     }
 687
 688     return len;
 689 }
 690
 691 /*
 692 =for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
 693
 694 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
 695 and C<b>.
 696
 697 WARNING: use only if you *know* that the pointers point inside the
 698 same UTF-8 buffer.
 699
 700 =cut
 701 */
 702
 703 IV
 704 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
 705 {
 706     return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
 707 }
 708
 709 /*
 710 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
 711
 712 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 713 forward or backward.
 714
 715 WARNING: do not use the following unless you *know* C<off> is within
 716 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 717 on the first byte of character or just after the last byte of a character.
 718
 719 =cut
 720 */
 721
 722 U8 *
 723 Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
 724 {
 725     PERL_UNUSED_CONTEXT;
 726     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 727      * the bitops (especially ~) can create illegal UTF-8.
 728      * In other words: in Perl UTF-8 is not just for Unicode. */
 729
 730     if (off >= 0) {
 731         while (off--)
 732             s += UTF8SKIP(s);
 733     }
 734     else {
 735         while (off++) {
 736             s--;
 737             while (UTF8_IS_CONTINUATION(*s))
 738                 s--;
 739         }
 740     }
 741     return (U8 *)s;
 742 }
 743
 744 /*
 745 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 746
 747 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 748 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 749 updates len to contain the new length.
 750 Returns zero on failure, setting C<len> to -1.
 751
 752 If you need a copy of the string, see C<bytes_from_utf8>.
 753
 754 =cut
 755 */
 756
 757 U8 *
 758 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 759 {
 760     U8 * const save = s;
 761     U8 * const send = s + *len;
 762     U8 *d;
 763
 764     /* ensure valid UTF-8 and chars < 256 before updating string */
 765     while (s < send) {
 766         U8 c = *s++;
 767
 768         if (!UTF8_IS_INVARIANT(c) &&
 769             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 770              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 771             *len = ((STRLEN) -1);
 772             return 0;
 773         }
 774     }
 775
 776     d = s = save;
 777     while (s < send) {
 778         STRLEN ulen;
 779         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 780         s += ulen;
 781     }
 782     *d = '\0';
 783     *len = d - save;
 784     return save;
 785 }
 786
 787 /*
 788 =for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
 789
 790 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 791 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 792 the newly-created string, and updates C<len> to contain the new
 793 length.  Returns the original string if no conversion occurs, C<len>
 794 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 795 0 if C<s> is converted or contains all 7bit characters.
 796
 797 =cut
 798 */
 799
 800 U8 *
 801 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
 802 {
 803     U8 *d;
 804     const U8 *start = s;
 805     const U8 *send;
 806     I32 count = 0;
 807
 808     PERL_UNUSED_CONTEXT;
 809     if (!*is_utf8)
 810         return (U8 *)start;
 811
 812     /* ensure valid UTF-8 and chars < 256 before converting string */
 813     for (send = s + *len; s < send;) {
 814         U8 c = *s++;
 815         if (!UTF8_IS_INVARIANT(c)) {
 816             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 817                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 818                 count++;
 819             else
 820                 return (U8 *)start;
 821         }
 822     }
 823
 824     *is_utf8 = FALSE;
 825
 826     Newx(d, (*len) - count + 1, U8);
 827     s = start; start = d;
 828     while (s < send) {
 829         U8 c = *s++;
 830         if (!UTF8_IS_INVARIANT(c)) {
 831             /* Then it is two-byte encoded */
 832             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 833             c = ASCII_TO_NATIVE(c);
 834         }
 835         *d++ = c;
 836     }
 837     *d = '\0';
 838     *len = d - start;
 839     return (U8 *)start;
 840 }
 841
 842 /*
 843 =for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
 844
 845 Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
 846 Returns a pointer to the newly-created string, and sets C<len> to
 847 reflect the new length.
 848
 849 If you want to convert to UTF-8 from other encodings than ASCII,
 850 see sv_recode_to_utf8().
 851
 852 =cut
 853 */
 854
 855 U8*
 856 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
 857 {
 858     const U8 * const send = s + (*len);
 859     U8 *d;
 860     U8 *dst;
 861     PERL_UNUSED_CONTEXT;
 862
 863     Newx(d, (*len) * 2 + 1, U8);
 864     dst = d;
 865
 866     while (s < send) {
 867         const UV uv = NATIVE_TO_ASCII(*s++);
 868         if (UNI_IS_INVARIANT(uv))
 869             *d++ = (U8)UTF_TO_NATIVE(uv);
 870         else {
 871             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
 872             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
 873         }
 874     }
 875     *d = '\0';
 876     *len = d-dst;
 877     return dst;
 878 }
 879
 880 /*
 881  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 882  *
 883  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 884  * We optimize for native, for obvious reasons. */
 885
 886 U8*
 887 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 888 {
 889     U8* pend;
 890     U8* dstart = d;
 891
 892     if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
 893          d[0] = 0;
 894          *newlen = 1;
 895          return d;
 896     }
 897
 898     if (bytelen & 1)
 899         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
 900
 901     pend = p + bytelen;
 902
 903     while (p < pend) {
 904         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 905         p += 2;
 906         if (uv < 0x80) {
 907 #ifdef EBCDIC
 908             *d++ = UNI_TO_NATIVE(uv);
 909 #else
 910             *d++ = (U8)uv;
 911 #endif
 912             continue;
 913         }
 914         if (uv < 0x800) {
 915             *d++ = (U8)(( uv >>  6)         | 0xc0);
 916             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 917             continue;
 918         }
 919         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 920             UV low = (p[0] << 8) + p[1];
 921             p += 2;
 922             if (low < 0xdc00 || low >= 0xdfff)
 923                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 924             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 925         }
 926         if (uv < 0x10000) {
 927             *d++ = (U8)(( uv >> 12)         | 0xe0);
 928             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 929             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 930             continue;
 931         }
 932         else {
 933             *d++ = (U8)(( uv >> 18)         | 0xf0);
 934             *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 935             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 936             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 937             continue;
 938         }
 939     }
 940     *newlen = d - dstart;
 941     return d;
 942 }
 943
 944 /* Note: this one is slightly destructive of the source. */
 945
 946 U8*
 947 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 948 {
 949     U8* s = (U8*)p;
 950     U8* const send = s + bytelen;
 951     while (s < send) {
 952         const U8 tmp = s[0];
 953         s[0] = s[1];
 954         s[1] = tmp;
 955         s += 2;
 956     }
 957     return utf16_to_utf8(p, d, bytelen, newlen);
 958 }
 959
 960 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 961
 962 bool
 963 Perl_is_uni_alnum(pTHX_ UV c)
 964 {
 965     U8 tmpbuf[UTF8_MAXBYTES+1];
 966     uvchr_to_utf8(tmpbuf, c);
 967     return is_utf8_alnum(tmpbuf);
 968 }
 969
 970 bool
 971 Perl_is_uni_alnumc(pTHX_ UV c)
 972 {
 973     U8 tmpbuf[UTF8_MAXBYTES+1];
 974     uvchr_to_utf8(tmpbuf, c);
 975     return is_utf8_alnumc(tmpbuf);
 976 }
 977
 978 bool
 979 Perl_is_uni_idfirst(pTHX_ UV c)
 980 {
 981     U8 tmpbuf[UTF8_MAXBYTES+1];
 982     uvchr_to_utf8(tmpbuf, c);
 983     return is_utf8_idfirst(tmpbuf);
 984 }
 985
 986 bool
 987 Perl_is_uni_alpha(pTHX_ UV c)
 988 {
 989     U8 tmpbuf[UTF8_MAXBYTES+1];
 990     uvchr_to_utf8(tmpbuf, c);
 991     return is_utf8_alpha(tmpbuf);
 992 }
 993
 994 bool
 995 Perl_is_uni_ascii(pTHX_ UV c)
 996 {
 997     U8 tmpbuf[UTF8_MAXBYTES+1];
 998     uvchr_to_utf8(tmpbuf, c);
 999     return is_utf8_ascii(tmpbuf);
1000 }
1001
1002 bool
1003 Perl_is_uni_space(pTHX_ UV c)
1004 {
1005     U8 tmpbuf[UTF8_MAXBYTES+1];
1006     uvchr_to_utf8(tmpbuf, c);
1007     return is_utf8_space(tmpbuf);
1008 }
1009
1010 bool
1011 Perl_is_uni_digit(pTHX_ UV c)
1012 {
1013     U8 tmpbuf[UTF8_MAXBYTES+1];
1014     uvchr_to_utf8(tmpbuf, c);
1015     return is_utf8_digit(tmpbuf);
1016 }
1017
1018 bool
1019 Perl_is_uni_upper(pTHX_ UV c)
1020 {
1021     U8 tmpbuf[UTF8_MAXBYTES+1];
1022     uvchr_to_utf8(tmpbuf, c);
1023     return is_utf8_upper(tmpbuf);
1024 }
1025
1026 bool
1027 Perl_is_uni_lower(pTHX_ UV c)
1028 {
1029     U8 tmpbuf[UTF8_MAXBYTES+1];
1030     uvchr_to_utf8(tmpbuf, c);
1031     return is_utf8_lower(tmpbuf);
1032 }
1033
1034 bool
1035 Perl_is_uni_cntrl(pTHX_ UV c)
1036 {
1037     U8 tmpbuf[UTF8_MAXBYTES+1];
1038     uvchr_to_utf8(tmpbuf, c);
1039     return is_utf8_cntrl(tmpbuf);
1040 }
1041
1042 bool
1043 Perl_is_uni_graph(pTHX_ UV c)
1044 {
1045     U8 tmpbuf[UTF8_MAXBYTES+1];
1046     uvchr_to_utf8(tmpbuf, c);
1047     return is_utf8_graph(tmpbuf);
1048 }
1049
1050 bool
1051 Perl_is_uni_print(pTHX_ UV c)
1052 {
1053     U8 tmpbuf[UTF8_MAXBYTES+1];
1054     uvchr_to_utf8(tmpbuf, c);
1055     return is_utf8_print(tmpbuf);
1056 }
1057
1058 bool
1059 Perl_is_uni_punct(pTHX_ UV c)
1060 {
1061     U8 tmpbuf[UTF8_MAXBYTES+1];
1062     uvchr_to_utf8(tmpbuf, c);
1063     return is_utf8_punct(tmpbuf);
1064 }
1065
1066 bool
1067 Perl_is_uni_xdigit(pTHX_ UV c)
1068 {
1069     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1070     uvchr_to_utf8(tmpbuf, c);
1071     return is_utf8_xdigit(tmpbuf);
1072 }
1073
1074 UV
1075 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1076 {
1077     uvchr_to_utf8(p, c);
1078     return to_utf8_upper(p, p, lenp);
1079 }
1080
1081 UV
1082 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1083 {
1084     uvchr_to_utf8(p, c);
1085     return to_utf8_title(p, p, lenp);
1086 }
1087
1088 UV
1089 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1090 {
1091     uvchr_to_utf8(p, c);
1092     return to_utf8_lower(p, p, lenp);
1093 }
1094
1095 UV
1096 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1097 {
1098     uvchr_to_utf8(p, c);
1099     return to_utf8_fold(p, p, lenp);
1100 }
1101
1102 /* for now these all assume no locale info available for Unicode > 255 */
1103
1104 bool
1105 Perl_is_uni_alnum_lc(pTHX_ UV c)
1106 {
1107     return is_uni_alnum(c);     /* XXX no locale support yet */
1108 }
1109
1110 bool
1111 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1112 {
1113     return is_uni_alnumc(c);    /* XXX no locale support yet */
1114 }
1115
1116 bool
1117 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1118 {
1119     return is_uni_idfirst(c);   /* XXX no locale support yet */
1120 }
1121
1122 bool
1123 Perl_is_uni_alpha_lc(pTHX_ UV c)
1124 {
1125     return is_uni_alpha(c);     /* XXX no locale support yet */
1126 }
1127
1128 bool
1129 Perl_is_uni_ascii_lc(pTHX_ UV c)
1130 {
1131     return is_uni_ascii(c);     /* XXX no locale support yet */
1132 }
1133
1134 bool
1135 Perl_is_uni_space_lc(pTHX_ UV c)
1136 {
1137     return is_uni_space(c);     /* XXX no locale support yet */
1138 }
1139
1140 bool
1141 Perl_is_uni_digit_lc(pTHX_ UV c)
1142 {
1143     return is_uni_digit(c);     /* XXX no locale support yet */
1144 }
1145
1146 bool
1147 Perl_is_uni_upper_lc(pTHX_ UV c)
1148 {
1149     return is_uni_upper(c);     /* XXX no locale support yet */
1150 }
1151
1152 bool
1153 Perl_is_uni_lower_lc(pTHX_ UV c)
1154 {
1155     return is_uni_lower(c);     /* XXX no locale support yet */
1156 }
1157
1158 bool
1159 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1160 {
1161     return is_uni_cntrl(c);     /* XXX no locale support yet */
1162 }
1163
1164 bool
1165 Perl_is_uni_graph_lc(pTHX_ UV c)
1166 {
1167     return is_uni_graph(c);     /* XXX no locale support yet */
1168 }
1169
1170 bool
1171 Perl_is_uni_print_lc(pTHX_ UV c)
1172 {
1173     return is_uni_print(c);     /* XXX no locale support yet */
1174 }
1175
1176 bool
1177 Perl_is_uni_punct_lc(pTHX_ UV c)
1178 {
1179     return is_uni_punct(c);     /* XXX no locale support yet */
1180 }
1181
1182 bool
1183 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1184 {
1185     return is_uni_xdigit(c);    /* XXX no locale support yet */
1186 }
1187
1188 U32
1189 Perl_to_uni_upper_lc(pTHX_ U32 c)
1190 {
1191     /* XXX returns only the first character -- do not use XXX */
1192     /* XXX no locale support yet */
1193     STRLEN len;
1194     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1195     return (U32)to_uni_upper(c, tmpbuf, &len);
1196 }
1197
1198 U32
1199 Perl_to_uni_title_lc(pTHX_ U32 c)
1200 {
1201     /* XXX returns only the first character XXX -- do not use XXX */
1202     /* XXX no locale support yet */
1203     STRLEN len;
1204     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1205     return (U32)to_uni_title(c, tmpbuf, &len);
1206 }
1207
1208 U32
1209 Perl_to_uni_lower_lc(pTHX_ U32 c)
1210 {
1211     /* XXX returns only the first character -- do not use XXX */
1212     /* XXX no locale support yet */
1213     STRLEN len;
1214     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1215     return (U32)to_uni_lower(c, tmpbuf, &len);
1216 }
1217
1218 static bool
1219 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
1220                  const char *const swashname)
1221 {
1222     dVAR;
1223     if (!is_utf8_char(p))
1224         return FALSE;
1225     if (!*swash)
1226         *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
1227     return swash_fetch(*swash, p, TRUE) != 0;
1228 }
1229
1230 bool
1231 Perl_is_utf8_alnum(pTHX_ const U8 *p)
1232 {
1233     dVAR;
1234     /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1235      * descendant of isalnum(3), in other words, it doesn't
1236      * contain the '_'. --jhi */
1237     return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
1238 }
1239
1240 bool
1241 Perl_is_utf8_alnumc(pTHX_ const U8 *p)
1242 {
1243     dVAR;
1244     return is_utf8_common(p, &PL_utf8_alnumc, "IsAlnumC");
1245 }
1246
1247 bool
1248 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1249 {
1250     dVAR;
1251     if (*p == '_')
1252         return TRUE;
1253     /* is_utf8_idstart would be more logical. */
1254     return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
1255 }
1256
1257 bool
1258 Perl_is_utf8_idcont(pTHX_ const U8 *p)
1259 {
1260     dVAR;
1261     if (*p == '_')
1262         return TRUE;
1263     return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
1264 }
1265
1266 bool
1267 Perl_is_utf8_alpha(pTHX_ const U8 *p)
1268 {
1269     dVAR;
1270     return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
1271 }
1272
1273 bool
1274 Perl_is_utf8_ascii(pTHX_ const U8 *p)
1275 {
1276     dVAR;
1277     return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
1278 }
1279
1280 bool
1281 Perl_is_utf8_space(pTHX_ const U8 *p)
1282 {
1283     dVAR;
1284     return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
1285 }
1286
1287 bool
1288 Perl_is_utf8_digit(pTHX_ const U8 *p)
1289 {
1290     dVAR;
1291     return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
1292 }
1293
1294 bool
1295 Perl_is_utf8_upper(pTHX_ const U8 *p)
1296 {
1297     dVAR;
1298     return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
1299 }
1300
1301 bool
1302 Perl_is_utf8_lower(pTHX_ const U8 *p)
1303 {
1304     dVAR;
1305     return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
1306 }
1307
1308 bool
1309 Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1310 {
1311     dVAR;
1312     return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
1313 }
1314
1315 bool
1316 Perl_is_utf8_graph(pTHX_ const U8 *p)
1317 {
1318     dVAR;
1319     return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
1320 }
1321
1322 bool
1323 Perl_is_utf8_print(pTHX_ const U8 *p)
1324 {
1325     dVAR;
1326     return is_utf8_common(p, &PL_utf8_print, "IsPrint");
1327 }
1328
1329 bool
1330 Perl_is_utf8_punct(pTHX_ const U8 *p)
1331 {
1332     dVAR;
1333     return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
1334 }
1335
1336 bool
1337 Perl_is_utf8_xdigit(pTHX_ const U8 *p)
1338 {
1339     dVAR;
1340     return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit");
1341 }
1342
1343 bool
1344 Perl_is_utf8_mark(pTHX_ const U8 *p)
1345 {
1346     dVAR;
1347     return is_utf8_common(p, &PL_utf8_mark, "IsM");
1348 }
1349
1350 /*
1351 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1352
1353 The "p" contains the pointer to the UTF-8 string encoding
1354 the character that is being converted.
1355
1356 The "ustrp" is a pointer to the character buffer to put the
1357 conversion result to.  The "lenp" is a pointer to the length
1358 of the result.
1359
1360 The "swashp" is a pointer to the swash to use.
1361
1362 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1363 and loaded by SWASHNEW, using lib/utf8_heavy.pl.  The special (usually,
1364 but not always, a multicharacter mapping), is tried first.
1365
1366 The "special" is a string like "utf8::ToSpecLower", which means the
1367 hash %utf8::ToSpecLower.  The access to the hash is through
1368 Perl_to_utf8_case().
1369
1370 The "normal" is a string like "ToLower" which means the swash
1371 %utf8::ToLower.
1372
1373 =cut */
1374
1375 UV
1376 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1377                         SV **swashp, const char *normal, const char *special)
1378 {
1379     dVAR;
1380     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1381     STRLEN len = 0;
1382
1383     const UV uv0 = utf8_to_uvchr(p, NULL);
1384     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1385      * are necessary in EBCDIC, they are redundant no-ops
1386      * in ASCII-ish platforms, and hopefully optimized away. */
1387     const UV uv1 = NATIVE_TO_UNI(uv0);
1388     uvuni_to_utf8(tmpbuf, uv1);
1389
1390     if (!*swashp) /* load on-demand */
1391          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1392
1393     /* The 0xDF is the only special casing Unicode code point below 0x100. */
1394     if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1395          /* It might be "special" (sometimes, but not always,
1396           * a multicharacter mapping) */
1397          HV * const hv = get_hv(special, FALSE);
1398          SV **svp;
1399
1400          if (hv &&
1401              (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1402              (*svp)) {
1403              const char *s;
1404
1405               s = SvPV_const(*svp, len);
1406               if (len == 1)
1407                    len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1408               else {
1409 #ifdef EBCDIC
1410                    /* If we have EBCDIC we need to remap the characters
1411                     * since any characters in the low 256 are Unicode
1412                     * code points, not EBCDIC. */
1413                    U8 *t = (U8*)s, *tend = t + len, *d;
1414
1415                    d = tmpbuf;
1416                    if (SvUTF8(*svp)) {
1417                         STRLEN tlen = 0;
1418
1419                         while (t < tend) {
1420                              const UV c = utf8_to_uvchr(t, &tlen);
1421                              if (tlen > 0) {
1422                                   d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1423                                   t += tlen;
1424                              }
1425                              else
1426                                   break;
1427                         }
1428                    }
1429                    else {
1430                         while (t < tend) {
1431                              d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1432                              t++;
1433                         }
1434                    }
1435                    len = d - tmpbuf;
1436                    Copy(tmpbuf, ustrp, len, U8);
1437 #else
1438                    Copy(s, ustrp, len, U8);
1439 #endif
1440               }
1441          }
1442     }
1443
1444     if (!len && *swashp) {
1445         const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1446
1447          if (uv2) {
1448               /* It was "normal" (a single character mapping). */
1449               const UV uv3 = UNI_TO_NATIVE(uv2);
1450               len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1451          }
1452     }
1453
1454     if (!len) /* Neither: just copy. */
1455          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1456
1457     if (lenp)
1458          *lenp = len;
1459
1460     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1461 }
1462
1463 /*
1464 =for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
1465
1466 Convert the UTF-8 encoded character at p to its uppercase version and
1467 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1468 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1469 the uppercase version may be longer than the original character.
1470
1471 The first character of the uppercased version is returned
1472 (but note, as explained above, that there may be more.)
1473
1474 =cut */
1475
1476 UV
1477 Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1478 {
1479     dVAR;
1480     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1481                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1482 }
1483
1484 /*
1485 =for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
1486
1487 Convert the UTF-8 encoded character at p to its titlecase version and
1488 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1489 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1490 titlecase version may be longer than the original character.
1491
1492 The first character of the titlecased version is returned
1493 (but note, as explained above, that there may be more.)
1494
1495 =cut */
1496
1497 UV
1498 Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1499 {
1500     dVAR;
1501     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1502                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1503 }
1504
1505 /*
1506 =for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
1507
1508 Convert the UTF-8 encoded character at p to its lowercase version and
1509 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1510 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1511 lowercase version may be longer than the original character.
1512
1513 The first character of the lowercased version is returned
1514 (but note, as explained above, that there may be more.)
1515
1516 =cut */
1517
1518 UV
1519 Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1520 {
1521     dVAR;
1522     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1523                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1524 }
1525
1526 /*
1527 =for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
1528
1529 Convert the UTF-8 encoded character at p to its foldcase version and
1530 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1531 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1532 foldcase version may be longer than the original character (up to
1533 three characters).
1534
1535 The first character of the foldcased version is returned
1536 (but note, as explained above, that there may be more.)
1537
1538 =cut */
1539
1540 UV
1541 Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1542 {
1543     dVAR;
1544     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1545                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1546 }
1547
1548 /* Note:
1549  * A "swash" is a swatch hash.
1550  * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1551  * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1552  * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1553  */
1554 SV*
1555 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
1556 {
1557     dVAR;
1558     SV* retval;
1559     SV* const tokenbufsv = sv_newmortal();
1560     dSP;
1561     const size_t pkg_len = strlen(pkg);
1562     const size_t name_len = strlen(name);
1563     HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
1564     SV* errsv_save;
1565
1566     PUSHSTACKi(PERLSI_MAGIC);
1567     ENTER;
1568     SAVEI32(PL_hints);
1569     PL_hints = 0;
1570     save_re_context();
1571     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1572         ENTER;
1573         errsv_save = newSVsv(ERRSV);
1574         /* It is assumed that callers of this routine are not passing in any
1575            user derived data.  */
1576         /* Need to do this after save_re_context() as it will set PL_tainted to
1577            1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1578            Even line to create errsv_save can turn on PL_tainted.  */
1579         SAVEBOOL(PL_tainted);
1580         PL_tainted = 0;
1581         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1582                          NULL);
1583         if (!SvTRUE(ERRSV))
1584             sv_setsv(ERRSV, errsv_save);
1585         SvREFCNT_dec(errsv_save);
1586         LEAVE;
1587     }
1588     SPAGAIN;
1589     PUSHMARK(SP);
1590     EXTEND(SP,5);
1591     PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1592     PUSHs(sv_2mortal(newSVpvn(name, name_len)));
1593     PUSHs(listsv);
1594     PUSHs(sv_2mortal(newSViv(minbits)));
1595     PUSHs(sv_2mortal(newSViv(none)));
1596     PUTBACK;
1597     if (IN_PERL_COMPILETIME) {
1598         /* XXX ought to be handled by lex_start */
1599         SAVEI16(PL_in_my);
1600         PL_in_my = 0;
1601         sv_setpv(tokenbufsv, PL_tokenbuf);
1602     }
1603     errsv_save = newSVsv(ERRSV);
1604     if (call_method("SWASHNEW", G_SCALAR))
1605         retval = newSVsv(*PL_stack_sp--);
1606     else
1607         retval = &PL_sv_undef;
1608     if (!SvTRUE(ERRSV))
1609         sv_setsv(ERRSV, errsv_save);
1610     SvREFCNT_dec(errsv_save);
1611     LEAVE;
1612     POPSTACK;
1613     if (IN_PERL_COMPILETIME) {
1614         STRLEN len;
1615         const char* const pv = SvPV_const(tokenbufsv, len);
1616
1617         Copy(pv, PL_tokenbuf, len+1, char);
1618         CopHINTS_set(PL_curcop, PL_hints);
1619     }
1620     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1621         if (SvPOK(retval))
1622             Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1623                        SVfARG(retval));
1624         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1625     }
1626     return retval;
1627 }
1628
1629
1630 /* This API is wrong for special case conversions since we may need to
1631  * return several Unicode characters for a single Unicode character
1632  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1633  * the lower-level routine, and it is similarly broken for returning
1634  * multiple values.  --jhi */
1635 /* Now SWASHGET is recasted into S_swash_get in this file. */
1636
1637 /* Note:
1638  * Returns the value of property/mapping C<swash> for the first character
1639  * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1640  * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1641  * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1642  */
1643 UV
1644 Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
1645 {
1646     dVAR;
1647     HV* const hv = (HV*)SvRV(swash);
1648     U32 klen;
1649     U32 off;
1650     STRLEN slen;
1651     STRLEN needents;
1652     const U8 *tmps = NULL;
1653     U32 bit;
1654     SV *swatch;
1655     U8 tmputf8[2];
1656     const UV c = NATIVE_TO_ASCII(*ptr);
1657
1658     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1659         tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1660         tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1661         ptr = tmputf8;
1662     }
1663     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1664      * then the "swatch" is a vec() for al the chars which start
1665      * with 0xAA..0xYY
1666      * So the key in the hash (klen) is length of encoded char -1
1667      */
1668     klen = UTF8SKIP(ptr) - 1;
1669     off  = ptr[klen];
1670
1671     if (klen == 0) {
1672       /* If char in invariant then swatch is for all the invariant chars
1673        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1674        */
1675         needents = UTF_CONTINUATION_MARK;
1676         off      = NATIVE_TO_UTF(ptr[klen]);
1677     }
1678     else {
1679       /* If char is encoded then swatch is for the prefix */
1680         needents = (1 << UTF_ACCUMULATION_SHIFT);
1681         off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1682     }
1683
1684     /*
1685      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1686      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1687      * it's nothing to sniff at.)  Pity we usually come through at least
1688      * two function calls to get here...
1689      *
1690      * NB: this code assumes that swatches are never modified, once generated!
1691      */
1692
1693     if (hv   == PL_last_swash_hv &&
1694         klen == PL_last_swash_klen &&
1695         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1696     {
1697         tmps = PL_last_swash_tmps;
1698         slen = PL_last_swash_slen;
1699     }
1700     else {
1701         /* Try our second-level swatch cache, kept in a hash. */
1702         SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1703
1704         /* If not cached, generate it via swash_get */
1705         if (!svp || !SvPOK(*svp)
1706                  || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
1707             /* We use utf8n_to_uvuni() as we want an index into
1708                Unicode tables, not a native character number.
1709              */
1710             const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1711                                            ckWARN(WARN_UTF8) ?
1712                                            0 : UTF8_ALLOW_ANY);
1713             swatch = swash_get(swash,
1714                     /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1715                                 (klen) ? (code_point & ~(needents - 1)) : 0,
1716                                 needents);
1717
1718             if (IN_PERL_COMPILETIME)
1719                 CopHINTS_set(PL_curcop, PL_hints);
1720
1721             svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
1722
1723             if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1724                      || (slen << 3) < needents)
1725                 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
1726         }
1727
1728         PL_last_swash_hv = hv;
1729         assert(klen <= sizeof(PL_last_swash_key));
1730         PL_last_swash_klen = (U8)klen;
1731         /* FIXME change interpvar.h?  */
1732         PL_last_swash_tmps = (U8 *) tmps;
1733         PL_last_swash_slen = slen;
1734         if (klen)
1735             Copy(ptr, PL_last_swash_key, klen, U8);
1736     }
1737
1738     switch ((int)((slen << 3) / needents)) {
1739     case 1:
1740         bit = 1 << (off & 7);
1741         off >>= 3;
1742         return (tmps[off] & bit) != 0;
1743     case 8:
1744         return tmps[off];
1745     case 16:
1746         off <<= 1;
1747         return (tmps[off] << 8) + tmps[off + 1] ;
1748     case 32:
1749         off <<= 2;
1750         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1751     }
1752     Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
1753     NORETURN_FUNCTION_END;
1754 }
1755
1756 /* Note:
1757  * Returns a swatch (a bit vector string) for a code point sequence
1758  * that starts from the value C<start> and comprises the number C<span>.
1759  * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1760  * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1761  */
1762 STATIC SV*
1763 S_swash_get(pTHX_ SV* swash, UV start, UV span)
1764 {
1765     SV *swatch;
1766     U8 *l, *lend, *x, *xend, *s;
1767     STRLEN lcur, xcur, scur;
1768
1769     HV* const hv = (HV*)SvRV(swash);
1770     SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
1771     SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
1772     SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
1773     SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
1774     SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
1775     const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1776     const int  typeto  = typestr[0] == 'T' && typestr[1] == 'o';
1777     const STRLEN bits  = SvUV(*bitssvp);
1778     const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1779     const UV     none  = SvUV(*nonesvp);
1780     const UV     end   = start + span;
1781
1782     if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
1783         Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1784                                                  (UV)bits);
1785     }
1786
1787     /* create and initialize $swatch */
1788     swatch = newSVpvs("");
1789     scur   = octets ? (span * octets) : (span + 7) / 8;
1790     SvGROW(swatch, scur + 1);
1791     s = (U8*)SvPVX(swatch);
1792     if (octets && none) {
1793         const U8* const e = s + scur;
1794         while (s < e) {
1795             if (bits == 8)
1796                 *s++ = (U8)(none & 0xff);
1797             else if (bits == 16) {
1798                 *s++ = (U8)((none >>  8) & 0xff);
1799                 *s++ = (U8)( none        & 0xff);
1800             }
1801             else if (bits == 32) {
1802                 *s++ = (U8)((none >> 24) & 0xff);
1803                 *s++ = (U8)((none >> 16) & 0xff);
1804                 *s++ = (U8)((none >>  8) & 0xff);
1805                 *s++ = (U8)( none        & 0xff);
1806             }
1807         }
1808         *s = '\0';
1809     }
1810     else {
1811         (void)memzero((U8*)s, scur + 1);
1812     }
1813     SvCUR_set(swatch, scur);
1814     s = (U8*)SvPVX(swatch);
1815
1816     /* read $swash->{LIST} */
1817     l = (U8*)SvPV(*listsvp, lcur);
1818     lend = l + lcur;
1819     while (l < lend) {
1820         UV min, max, val;
1821         STRLEN numlen;
1822         I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1823
1824         U8* const nl = (U8*)memchr(l, '\n', lend - l);
1825
1826         numlen = lend - l;
1827         min = grok_hex((char *)l, &numlen, &flags, NULL);
1828         if (numlen)
1829             l += numlen;
1830         else if (nl) {
1831             l = nl + 1; /* 1 is length of "\n" */
1832             continue;
1833         }
1834         else {
1835             l = lend; /* to LIST's end at which \n is not found */
1836             break;
1837         }
1838
1839         if (isBLANK(*l)) {
1840             ++l;
1841             flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1842             numlen = lend - l;
1843             max = grok_hex((char *)l, &numlen, &flags, NULL);
1844             if (numlen)
1845                 l += numlen;
1846             else
1847                 max = min;
1848
1849             if (octets) {
1850                 if (isBLANK(*l)) {
1851                     ++l;
1852                     flags = PERL_SCAN_SILENT_ILLDIGIT |
1853                             PERL_SCAN_DISALLOW_PREFIX;
1854                     numlen = lend - l;
1855                     val = grok_hex((char *)l, &numlen, &flags, NULL);
1856                     if (numlen)
1857                         l += numlen;
1858                     else
1859                         val = 0;
1860                 }
1861                 else {
1862                     val = 0;
1863                     if (typeto) {
1864                         Perl_croak(aTHX_ "%s: illegal mapping '%s'",
1865                                          typestr, l);
1866                     }
1867                 }
1868             }
1869             else
1870                 val = 0; /* bits == 1, then val should be ignored */
1871         }
1872         else {
1873             max = min;
1874             if (octets) {
1875                 val = 0;
1876                 if (typeto) {
1877                     Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
1878                 }
1879             }
1880             else
1881                 val = 0; /* bits == 1, then val should be ignored */
1882         }
1883
1884         if (nl)
1885             l = nl + 1;
1886         else
1887             l = lend;
1888
1889         if (max < start)
1890             continue;
1891
1892         if (octets) {
1893             UV key;
1894             if (min < start) {
1895                 if (!none || val < none) {
1896                     val += start - min;
1897                 }
1898                 min = start;
1899             }
1900             for (key = min; key <= max; key++) {
1901                 STRLEN offset;
1902                 if (key >= end)
1903                     goto go_out_list;
1904                 /* offset must be non-negative (start <= min <= key < end) */
1905                 offset = octets * (key - start);
1906                 if (bits == 8)
1907                     s[offset] = (U8)(val & 0xff);
1908                 else if (bits == 16) {
1909                     s[offset    ] = (U8)((val >>  8) & 0xff);
1910                     s[offset + 1] = (U8)( val        & 0xff);
1911                 }
1912                 else if (bits == 32) {
1913                     s[offset    ] = (U8)((val >> 24) & 0xff);
1914                     s[offset + 1] = (U8)((val >> 16) & 0xff);
1915                     s[offset + 2] = (U8)((val >>  8) & 0xff);
1916                     s[offset + 3] = (U8)( val        & 0xff);
1917                 }
1918
1919                 if (!none || val < none)
1920                     ++val;
1921             }
1922         }
1923         else { /* bits == 1, then val should be ignored */
1924             UV key;
1925             if (min < start)
1926                 min = start;
1927             for (key = min; key <= max; key++) {
1928                 const STRLEN offset = (STRLEN)(key - start);
1929                 if (key >= end)
1930                     goto go_out_list;
1931                 s[offset >> 3] |= 1 << (offset & 7);
1932             }
1933         }
1934     } /* while */
1935   go_out_list:
1936
1937     /* read $swash->{EXTRAS} */
1938     x = (U8*)SvPV(*extssvp, xcur);
1939     xend = x + xcur;
1940     while (x < xend) {
1941         STRLEN namelen;
1942         U8 *namestr;
1943         SV** othersvp;
1944         HV* otherhv;
1945         STRLEN otherbits;
1946         SV **otherbitssvp, *other;
1947         U8 *s, *o, *nl;
1948         STRLEN slen, olen;
1949
1950         const U8 opc = *x++;
1951         if (opc == '\n')
1952             continue;
1953
1954         nl = (U8*)memchr(x, '\n', xend - x);
1955
1956         if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
1957             if (nl) {
1958                 x = nl + 1; /* 1 is length of "\n" */
1959                 continue;
1960             }
1961             else {
1962                 x = xend; /* to EXTRAS' end at which \n is not found */
1963                 break;
1964             }
1965         }
1966
1967         namestr = x;
1968         if (nl) {
1969             namelen = nl - namestr;
1970             x = nl + 1;
1971         }
1972         else {
1973             namelen = xend - namestr;
1974             x = xend;
1975         }
1976
1977         othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
1978         otherhv = (HV*)SvRV(*othersvp);
1979         otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
1980         otherbits = (STRLEN)SvUV(*otherbitssvp);
1981         if (bits < otherbits)
1982             Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
1983
1984         /* The "other" swatch must be destroyed after. */
1985         other = swash_get(*othersvp, start, span);
1986         o = (U8*)SvPV(other, olen);
1987
1988         if (!olen)
1989             Perl_croak(aTHX_ "panic: swash_get got improper swatch");
1990
1991         s = (U8*)SvPV(swatch, slen);
1992         if (bits == 1 && otherbits == 1) {
1993             if (slen != olen)
1994                 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
1995
1996             switch (opc) {
1997             case '+':
1998                 while (slen--)
1999                     *s++ |= *o++;
2000                 break;
2001             case '!':
2002                 while (slen--)
2003                     *s++ |= ~*o++;
2004                 break;
2005             case '-':
2006                 while (slen--)
2007                     *s++ &= ~*o++;
2008                 break;
2009             case '&':
2010                 while (slen--)
2011                     *s++ &= *o++;
2012                 break;
2013             default:
2014                 break;
2015             }
2016         }
2017         else {
2018             STRLEN otheroctets = otherbits >> 3;
2019             STRLEN offset = 0;
2020             U8* const send = s + slen;
2021
2022             while (s < send) {
2023                 UV otherval = 0;
2024
2025                 if (otherbits == 1) {
2026                     otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2027                     ++offset;
2028                 }
2029                 else {
2030                     STRLEN vlen = otheroctets;
2031                     otherval = *o++;
2032                     while (--vlen) {
2033                         otherval <<= 8;
2034                         otherval |= *o++;
2035                     }
2036                 }
2037
2038                 if (opc == '+' && otherval)
2039                     NOOP;   /* replace with otherval */
2040                 else if (opc == '!' && !otherval)
2041                     otherval = 1;
2042                 else if (opc == '-' && otherval)
2043                     otherval = 0;
2044                 else if (opc == '&' && !otherval)
2045                     otherval = 0;
2046                 else {
2047                     s += octets; /* no replacement */
2048                     continue;
2049                 }
2050
2051                 if (bits == 8)
2052                     *s++ = (U8)( otherval & 0xff);
2053                 else if (bits == 16) {
2054                     *s++ = (U8)((otherval >>  8) & 0xff);
2055                     *s++ = (U8)( otherval        & 0xff);
2056                 }
2057                 else if (bits == 32) {
2058                     *s++ = (U8)((otherval >> 24) & 0xff);
2059                     *s++ = (U8)((otherval >> 16) & 0xff);
2060                     *s++ = (U8)((otherval >>  8) & 0xff);
2061                     *s++ = (U8)( otherval        & 0xff);
2062                 }
2063             }
2064         }
2065         sv_free(other); /* through with it! */
2066     } /* while */
2067     return swatch;
2068 }
2069
2070 /*
2071 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2072
2073 Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2074 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2075 bytes available. The return value is the pointer to the byte after the
2076 end of the new character. In other words,
2077
2078     d = uvchr_to_utf8(d, uv);
2079
2080 is the recommended wide native character-aware way of saying
2081
2082     *(d++) = uv;
2083
2084 =cut
2085 */
2086
2087 /* On ASCII machines this is normally a macro but we want a
2088    real function in case XS code wants it
2089 */
2090 U8 *
2091 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2092 {
2093     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2094 }
2095
2096 U8 *
2097 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2098 {
2099     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2100 }
2101
2102 /*
2103 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32
2104 flags
2105
2106 Returns the native character value of the first character in the string
2107 C<s>
2108 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2109 length, in bytes, of that character.
2110
2111 Allows length and flags to be passed to low level routine.
2112
2113 =cut
2114 */
2115 /* On ASCII machines this is normally a macro but we want
2116    a real function in case XS code wants it
2117 */
2118 UV
2119 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2120 U32 flags)
2121 {
2122     const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2123     return UNI_TO_NATIVE(uv);
2124 }
2125
2126 /*
2127 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
2128
2129 Build to the scalar dsv a displayable version of the string spv,
2130 length len, the displayable version being at most pvlim bytes long
2131 (if longer, the rest is truncated and "..." will be appended).
2132
2133 The flags argument can have UNI_DISPLAY_ISPRINT set to display
2134 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
2135 to display the \\[nrfta\\] as the backslashed versions (like '\n')
2136 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2137 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2138 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2139
2140 The pointer to the PV of the dsv is returned.
2141
2142 =cut */
2143 char *
2144 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
2145 {
2146     int truncated = 0;
2147     const char *s, *e;
2148
2149     sv_setpvn(dsv, "", 0);
2150     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
2151          UV u;
2152           /* This serves double duty as a flag and a character to print after
2153              a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2154           */
2155          char ok = 0;
2156
2157          if (pvlim && SvCUR(dsv) >= pvlim) {
2158               truncated++;
2159               break;
2160          }
2161          u = utf8_to_uvchr((U8*)s, 0);
2162          if (u < 256) {
2163              const unsigned char c = (unsigned char)u & 0xFF;
2164              if (flags & UNI_DISPLAY_BACKSLASH) {
2165                  switch (c) {
2166                  case '\n':
2167                      ok = 'n'; break;
2168                  case '\r':
2169                      ok = 'r'; break;
2170                  case '\t':
2171                      ok = 't'; break;
2172                  case '\f':
2173                      ok = 'f'; break;
2174                  case '\a':
2175                      ok = 'a'; break;
2176                  case '\\':
2177                      ok = '\\'; break;
2178                  default: break;
2179                  }
2180                  if (ok) {
2181                      Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
2182                  }
2183              }
2184              /* isPRINT() is the locale-blind version. */
2185              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2186                  Perl_sv_catpvf(aTHX_ dsv, "%c", c);
2187                  ok = 1;
2188              }
2189          }
2190          if (!ok)
2191              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
2192     }
2193     if (truncated)
2194          sv_catpvs(dsv, "...");
2195
2196     return SvPVX(dsv);
2197 }
2198
2199 /*
2200 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
2201
2202 Build to the scalar dsv a displayable version of the scalar sv,
2203 the displayable version being at most pvlim bytes long
2204 (if longer, the rest is truncated and "..." will be appended).
2205
2206 The flags argument is as in pv_uni_display().
2207
2208 The pointer to the PV of the dsv is returned.
2209
2210 =cut
2211 */
2212 char *
2213 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2214 {
2215      return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2216                                 SvCUR(ssv), pvlim, flags);
2217 }
2218
2219 /*
2220 =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
2221
2222 Return true if the strings s1 and s2 differ case-insensitively, false
2223 if not (if they are equal case-insensitively).  If u1 is true, the
2224 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
2225 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
2226 are false, the respective string is assumed to be in native 8-bit
2227 encoding.
2228
2229 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2230 in there (they will point at the beginning of the I<next> character).
2231 If the pointers behind pe1 or pe2 are non-NULL, they are the end
2232 pointers beyond which scanning will not continue under any
2233 circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
2234 s2+l2 will be used as goal end pointers that will also stop the scan,
2235 and which qualify towards defining a successful match: all the scans
2236 that define an explicit length must reach their goal pointers for
2237 a match to succeed).
2238
2239 For case-insensitiveness, the "casefolding" of Unicode is used
2240 instead of upper/lowercasing both the characters, see
2241 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2242
2243 =cut */
2244 I32
2245 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
2246 {
2247      dVAR;
2248      register const U8 *p1  = (const U8*)s1;
2249      register const U8 *p2  = (const U8*)s2;
2250      register const U8 *f1 = NULL;
2251      register const U8 *f2 = NULL;
2252      register U8 *e1 = NULL;
2253      register U8 *q1 = NULL;
2254      register U8 *e2 = NULL;
2255      register U8 *q2 = NULL;
2256      STRLEN n1 = 0, n2 = 0;
2257      U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2258      U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
2259      U8 natbuf[1+1];
2260      STRLEN foldlen1, foldlen2;
2261      bool match;
2262
2263      if (pe1)
2264           e1 = *(U8**)pe1;
2265      if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2266           f1 = (const U8*)s1 + l1;
2267      if (pe2)
2268           e2 = *(U8**)pe2;
2269      if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2270           f2 = (const U8*)s2 + l2;
2271
2272      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2273           return 1; /* mismatch; possible infinite loop or false positive */
2274
2275      if (!u1 || !u2)
2276           natbuf[1] = 0; /* Need to terminate the buffer. */
2277
2278      while ((e1 == 0 || p1 < e1) &&
2279             (f1 == 0 || p1 < f1) &&
2280             (e2 == 0 || p2 < e2) &&
2281             (f2 == 0 || p2 < f2)) {
2282           if (n1 == 0) {
2283                if (u1)
2284                     to_utf8_fold(p1, foldbuf1, &foldlen1);
2285                else {
2286                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
2287                     to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2288                }
2289                q1 = foldbuf1;
2290                n1 = foldlen1;
2291           }
2292           if (n2 == 0) {
2293                if (u2)
2294                     to_utf8_fold(p2, foldbuf2, &foldlen2);
2295                else {
2296                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
2297                     to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2298                }
2299                q2 = foldbuf2;
2300                n2 = foldlen2;
2301           }
2302           while (n1 && n2) {
2303                if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2304                    (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2305                     memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2306                    return 1; /* mismatch */
2307                n1 -= UTF8SKIP(q1);
2308                q1 += UTF8SKIP(q1);
2309                n2 -= UTF8SKIP(q2);
2310                q2 += UTF8SKIP(q2);
2311           }
2312           if (n1 == 0)
2313                p1 += u1 ? UTF8SKIP(p1) : 1;
2314           if (n2 == 0)
2315                p2 += u2 ? UTF8SKIP(p2) : 1;
2316
2317      }
2318
2319      /* A match is defined by all the scans that specified
2320       * an explicit length reaching their final goals. */
2321      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2322
2323      if (match) {
2324           if (pe1)
2325                *pe1 = (char*)p1;
2326           if (pe2)
2327                *pe2 = (char*)p2;
2328      }
2329
2330      return match ? 0 : 1; /* 0 match, 1 mismatch */
2331 }
2332
2333 /*
2334  * Local variables:
2335  * c-indentation-style: bsd
2336  * c-basic-offset: 4
2337  * indent-tabs-mode: t
2338  * End:
2339  *
2340  * ex: set ts=8 sts=4 sw=4 noet:
2341  */