utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006,
   4  *    by Larry Wall and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  13  * heard of that we don't want to see any closer; and that's the one place
  14  * we're trying to get to!  And that's just where we can't get, nohow.'
  15  *
  16  * 'Well do I understand your speech,' he answered in the same language;
  17  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  18  * as is the custom in the West, if you wish to be answered?'
  19  *
  20  * ...the travellers perceived that the floor was paved with stones of many
  21  * hues; branching runes and strange devices intertwined beneath their feet.
  22  */
  23
  24 #include "EXTERN.h"
  25 #define PERL_IN_UTF8_C
  26 #include "perl.h"
  27
  28 static const char unees[] =
  29     "Malformed UTF-8 character (unexpected end of string)";
  30
  31 /*
  32 =head1 Unicode Support
  33
  34 This file contains various utility functions for manipulating UTF8-encoded
  35 strings. For the uninitiated, this is a method of representing arbitrary
  36 Unicode characters as a variable number of bytes, in such a way that
  37 characters in the ASCII range are unmodified, and a zero byte never appears
  38 within non-zero characters.
  39
  40 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
  41
  42 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
  43 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
  44 bytes available. The return value is the pointer to the byte after the
  45 end of the new character. In other words,
  46
  47     d = uvuni_to_utf8_flags(d, uv, flags);
  48
  49 or, in most cases,
  50
  51     d = uvuni_to_utf8(d, uv);
  52
  53 (which is equivalent to)
  54
  55     d = uvuni_to_utf8_flags(d, uv, 0);
  56
  57 is the recommended Unicode-aware way of saying
  58
  59     *(d++) = uv;
  60
  61 =cut
  62 */
  63
  64 U8 *
  65 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  66 {
  67     if (ckWARN(WARN_UTF8)) {
  68          if (UNICODE_IS_SURROGATE(uv) &&
  69              !(flags & UNICODE_ALLOW_SURROGATE))
  70               Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
  71          else if (
  72                   ((uv >= 0xFDD0 && uv <= 0xFDEF &&
  73                     !(flags & UNICODE_ALLOW_FDD0))
  74                    ||
  75                    ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
  76                     !(flags & UNICODE_ALLOW_FFFF))) &&
  77                   /* UNICODE_ALLOW_SUPER includes
  78                    * FFFEs and FFFFs beyond 0x10FFFF. */
  79                   ((uv <= PERL_UNICODE_MAX) ||
  80                    !(flags & UNICODE_ALLOW_SUPER))
  81                   )
  82               Perl_warner(aTHX_ packWARN(WARN_UTF8),
  83                          "Unicode character 0x%04"UVxf" is illegal", uv);
  84     }
  85     if (UNI_IS_INVARIANT(uv)) {
  86         *d++ = (U8)UTF_TO_NATIVE(uv);
  87         return d;
  88     }
  89 #if defined(EBCDIC)
  90     else {
  91         STRLEN len  = UNISKIP(uv);
  92         U8 *p = d+len-1;
  93         while (p > d) {
  94             *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
  95             uv >>= UTF_ACCUMULATION_SHIFT;
  96         }
  97         *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
  98         return d+len;
  99     }
 100 #else /* Non loop style */
 101     if (uv < 0x800) {
 102         *d++ = (U8)(( uv >>  6)         | 0xc0);
 103         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 104         return d;
 105     }
 106     if (uv < 0x10000) {
 107         *d++ = (U8)(( uv >> 12)         | 0xe0);
 108         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 109         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 110         return d;
 111     }
 112     if (uv < 0x200000) {
 113         *d++ = (U8)(( uv >> 18)         | 0xf0);
 114         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 115         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 116         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 117         return d;
 118     }
 119     if (uv < 0x4000000) {
 120         *d++ = (U8)(( uv >> 24)         | 0xf8);
 121         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 122         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 123         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 124         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 125         return d;
 126     }
 127     if (uv < 0x80000000) {
 128         *d++ = (U8)(( uv >> 30)         | 0xfc);
 129         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 130         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 131         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 132         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 133         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 134         return d;
 135     }
 136 #ifdef HAS_QUAD
 137     if (uv < UTF8_QUAD_MAX)
 138 #endif
 139     {
 140         *d++ =                            0xfe; /* Can't match U+FEFF! */
 141         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 142         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 143         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 144         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 145         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 146         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 147         return d;
 148     }
 149 #ifdef HAS_QUAD
 150     {
 151         *d++ =                            0xff;         /* Can't match U+FFFE! */
 152         *d++ =                            0x80;         /* 6 Reserved bits */
 153         *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);        /* 2 Reserved bits */
 154         *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
 155         *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
 156         *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
 157         *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
 158         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 159         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 160         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 161         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 162         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 163         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 164         return d;
 165     }
 166 #endif
 167 #endif /* Loop style */
 168 }
 169
 170 /*
 171
 172 Tests if some arbitrary number of bytes begins in a valid UTF-8
 173 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 174 UTF-8 character.  The actual number of bytes in the UTF-8 character
 175 will be returned if it is valid, otherwise 0.
 176
 177 This is the "slow" version as opposed to the "fast" version which is
 178 the "unrolled" IS_UTF8_CHAR().  E.g. for t/uni/class.t the speed
 179 difference is a factor of 2 to 3.  For lengths (UTF8SKIP(s)) of four
 180 or less you should use the IS_UTF8_CHAR(), for lengths of five or more
 181 you should use the _slow().  In practice this means that the _slow()
 182 will be used very rarely, since the maximum Unicode code point (as of
 183 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes.  Only
 184 the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
 185 five bytes or more.
 186
 187 =cut */
 188 STATIC STRLEN
 189 S_is_utf8_char_slow(const U8 *s, const STRLEN len)
 190 {
 191     U8 u = *s;
 192     STRLEN slen;
 193     UV uv, ouv;
 194
 195     if (UTF8_IS_INVARIANT(u))
 196         return 1;
 197
 198     if (!UTF8_IS_START(u))
 199         return 0;
 200
 201     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 202         return 0;
 203
 204     slen = len - 1;
 205     s++;
 206 #ifdef EBCDIC
 207     u = NATIVE_TO_UTF(u);
 208 #endif
 209     u &= UTF_START_MASK(len);
 210     uv  = u;
 211     ouv = uv;
 212     while (slen--) {
 213         if (!UTF8_IS_CONTINUATION(*s))
 214             return 0;
 215         uv = UTF8_ACCUMULATE(uv, *s);
 216         if (uv < ouv)
 217             return 0;
 218         ouv = uv;
 219         s++;
 220     }
 221
 222     if ((STRLEN)UNISKIP(uv) < len)
 223         return 0;
 224
 225     return len;
 226 }
 227
 228 /*
 229 =for apidoc A|STRLEN|is_utf8_char|const U8 *s
 230
 231 Tests if some arbitrary number of bytes begins in a valid UTF-8
 232 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 233 UTF-8 character.  The actual number of bytes in the UTF-8 character
 234 will be returned if it is valid, otherwise 0.
 235
 236 =cut */
 237 STRLEN
 238 Perl_is_utf8_char(pTHX_ const U8 *s)
 239 {
 240     const STRLEN len = UTF8SKIP(s);
 241     PERL_UNUSED_CONTEXT;
 242 #ifdef IS_UTF8_CHAR
 243     if (IS_UTF8_CHAR_FAST(len))
 244         return IS_UTF8_CHAR(s, len) ? len : 0;
 245 #endif /* #ifdef IS_UTF8_CHAR */
 246     return is_utf8_char_slow(s, len);
 247 }
 248
 249 /*
 250 =for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
 251
 252 Returns true if first C<len> bytes of the given string form a valid
 253 UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
 254 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
 255 because a valid ASCII string is a valid UTF-8 string.
 256
 257 See also is_utf8_string_loclen() and is_utf8_string_loc().
 258
 259 =cut
 260 */
 261
 262 bool
 263 Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
 264 {
 265     const U8* x = s;
 266     const U8* send;
 267
 268     PERL_UNUSED_CONTEXT;
 269     if (!len)
 270         len = strlen((const char *)s);
 271     send = s + len;
 272
 273     while (x < send) {
 274         STRLEN c;
 275          /* Inline the easy bits of is_utf8_char() here for speed... */
 276          if (UTF8_IS_INVARIANT(*x))
 277               c = 1;
 278          else if (!UTF8_IS_START(*x))
 279              goto out;
 280          else {
 281               /* ... and call is_utf8_char() only if really needed. */
 282 #ifdef IS_UTF8_CHAR
 283              c = UTF8SKIP(x);
 284              if (IS_UTF8_CHAR_FAST(c)) {
 285                  if (!IS_UTF8_CHAR(x, c))
 286                      c = 0;
 287              }
 288              else
 289                 c = is_utf8_char_slow(x, c);
 290 #else
 291              c = is_utf8_char(x);
 292 #endif /* #ifdef IS_UTF8_CHAR */
 293               if (!c)
 294                   goto out;
 295          }
 296         x += c;
 297     }
 298
 299  out:
 300     if (x != send)
 301         return FALSE;
 302
 303     return TRUE;
 304 }
 305
 306 /*
 307 Implemented as a macro in utf8.h
 308
 309 =for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep
 310
 311 Like is_utf8_string() but stores the location of the failure (in the
 312 case of "utf8ness failure") or the location s+len (in the case of
 313 "utf8ness success") in the C<ep>.
 314
 315 See also is_utf8_string_loclen() and is_utf8_string().
 316
 317 =for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
 318
 319 Like is_utf8_string() but stores the location of the failure (in the
 320 case of "utf8ness failure") or the location s+len (in the case of
 321 "utf8ness success") in the C<ep>, and the number of UTF-8
 322 encoded characters in the C<el>.
 323
 324 See also is_utf8_string_loc() and is_utf8_string().
 325
 326 =cut
 327 */
 328
 329 bool
 330 Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
 331 {
 332     const U8* x = s;
 333     const U8* send;
 334     STRLEN c;
 335     PERL_UNUSED_CONTEXT;
 336
 337     if (!len)
 338         len = strlen((const char *)s);
 339     send = s + len;
 340     if (el)
 341         *el = 0;
 342
 343     while (x < send) {
 344          /* Inline the easy bits of is_utf8_char() here for speed... */
 345          if (UTF8_IS_INVARIANT(*x))
 346              c = 1;
 347          else if (!UTF8_IS_START(*x))
 348              goto out;
 349          else {
 350              /* ... and call is_utf8_char() only if really needed. */
 351 #ifdef IS_UTF8_CHAR
 352              c = UTF8SKIP(x);
 353              if (IS_UTF8_CHAR_FAST(c)) {
 354                  if (!IS_UTF8_CHAR(x, c))
 355                      c = 0;
 356              } else
 357                  c = is_utf8_char_slow(x, c);
 358 #else
 359              c = is_utf8_char(x);
 360 #endif /* #ifdef IS_UTF8_CHAR */
 361              if (!c)
 362                  goto out;
 363          }
 364          x += c;
 365          if (el)
 366              (*el)++;
 367     }
 368
 369  out:
 370     if (ep)
 371         *ep = x;
 372     if (x != send)
 373         return FALSE;
 374
 375     return TRUE;
 376 }
 377
 378 /*
 379
 380 =for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 381
 382 Bottom level UTF-8 decode routine.
 383 Returns the unicode code point value of the first character in the string C<s>
 384 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
 385 C<retlen> will be set to the length, in bytes, of that character.
 386
 387 If C<s> does not point to a well-formed UTF-8 character, the behaviour
 388 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 389 it is assumed that the caller will raise a warning, and this function
 390 will silently just set C<retlen> to C<-1> and return zero.  If the
 391 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 392 malformations will be given, C<retlen> will be set to the expected
 393 length of the UTF-8 character in bytes, and zero will be returned.
 394
 395 The C<flags> can also contain various flags to allow deviations from
 396 the strict UTF-8 encoding (see F<utf8.h>).
 397
 398 Most code should use utf8_to_uvchr() rather than call this directly.
 399
 400 =cut
 401 */
 402
 403 UV
 404 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 405 {
 406     dVAR;
 407     const U8 * const s0 = s;
 408     UV uv = *s, ouv = 0;
 409     STRLEN len = 1;
 410     const bool dowarn = ckWARN_d(WARN_UTF8);
 411     const UV startbyte = *s;
 412     STRLEN expectlen = 0;
 413     U32 warning = 0;
 414
 415 /* This list is a superset of the UTF8_ALLOW_XXX. */
 416
 417 #define UTF8_WARN_EMPTY                          1
 418 #define UTF8_WARN_CONTINUATION                   2
 419 #define UTF8_WARN_NON_CONTINUATION               3
 420 #define UTF8_WARN_FE_FF                          4
 421 #define UTF8_WARN_SHORT                          5
 422 #define UTF8_WARN_OVERFLOW                       6
 423 #define UTF8_WARN_SURROGATE                      7
 424 #define UTF8_WARN_LONG                           8
 425 #define UTF8_WARN_FFFF                           9 /* Also FFFE. */
 426
 427     if (curlen == 0 &&
 428         !(flags & UTF8_ALLOW_EMPTY)) {
 429         warning = UTF8_WARN_EMPTY;
 430         goto malformed;
 431     }
 432
 433     if (UTF8_IS_INVARIANT(uv)) {
 434         if (retlen)
 435             *retlen = 1;
 436         return (UV) (NATIVE_TO_UTF(*s));
 437     }
 438
 439     if (UTF8_IS_CONTINUATION(uv) &&
 440         !(flags & UTF8_ALLOW_CONTINUATION)) {
 441         warning = UTF8_WARN_CONTINUATION;
 442         goto malformed;
 443     }
 444
 445     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 446         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 447         warning = UTF8_WARN_NON_CONTINUATION;
 448         goto malformed;
 449     }
 450
 451 #ifdef EBCDIC
 452     uv = NATIVE_TO_UTF(uv);
 453 #else
 454     if ((uv == 0xfe || uv == 0xff) &&
 455         !(flags & UTF8_ALLOW_FE_FF)) {
 456         warning = UTF8_WARN_FE_FF;
 457         goto malformed;
 458     }
 459 #endif
 460
 461     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 462     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 463     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 464     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 465 #ifdef EBCDIC
 466     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 467     else                        { len =  7; uv &= 0x01; }
 468 #else
 469     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 470     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 471     else                        { len = 13; uv = 0; } /* whoa! */
 472 #endif
 473
 474     if (retlen)
 475         *retlen = len;
 476
 477     expectlen = len;
 478
 479     if ((curlen < expectlen) &&
 480         !(flags & UTF8_ALLOW_SHORT)) {
 481         warning = UTF8_WARN_SHORT;
 482         goto malformed;
 483     }
 484
 485     len--;
 486     s++;
 487     ouv = uv;
 488
 489     while (len--) {
 490         if (!UTF8_IS_CONTINUATION(*s) &&
 491             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 492             s--;
 493             warning = UTF8_WARN_NON_CONTINUATION;
 494             goto malformed;
 495         }
 496         else
 497             uv = UTF8_ACCUMULATE(uv, *s);
 498         if (!(uv > ouv)) {
 499             /* These cannot be allowed. */
 500             if (uv == ouv) {
 501                 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
 502                     warning = UTF8_WARN_LONG;
 503                     goto malformed;
 504                 }
 505             }
 506             else { /* uv < ouv */
 507                 /* This cannot be allowed. */
 508                 warning = UTF8_WARN_OVERFLOW;
 509                 goto malformed;
 510             }
 511         }
 512         s++;
 513         ouv = uv;
 514     }
 515
 516     if (UNICODE_IS_SURROGATE(uv) &&
 517         !(flags & UTF8_ALLOW_SURROGATE)) {
 518         warning = UTF8_WARN_SURROGATE;
 519         goto malformed;
 520     } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
 521                !(flags & UTF8_ALLOW_LONG)) {
 522         warning = UTF8_WARN_LONG;
 523         goto malformed;
 524     } else if (UNICODE_IS_ILLEGAL(uv) &&
 525                !(flags & UTF8_ALLOW_FFFF)) {
 526         warning = UTF8_WARN_FFFF;
 527         goto malformed;
 528     }
 529
 530     return uv;
 531
 532 malformed:
 533
 534     if (flags & UTF8_CHECK_ONLY) {
 535         if (retlen)
 536             *retlen = -1;
 537         return 0;
 538     }
 539
 540     if (dowarn) {
 541         SV* const sv = sv_2mortal(newSVpvs("Malformed UTF-8 character "));
 542
 543         switch (warning) {
 544         case 0: /* Intentionally empty. */ break;
 545         case UTF8_WARN_EMPTY:
 546             sv_catpvs(sv, "(empty string)");
 547             break;
 548         case UTF8_WARN_CONTINUATION:
 549             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 550             break;
 551         case UTF8_WARN_NON_CONTINUATION:
 552             if (s == s0)
 553                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 554                            (UV)s[1], startbyte);
 555             else {
 556                 const int len = (int)(s-s0);
 557                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 558                            (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
 559             }
 560
 561             break;
 562         case UTF8_WARN_FE_FF:
 563             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 564             break;
 565         case UTF8_WARN_SHORT:
 566             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 567                            (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
 568             expectlen = curlen;         /* distance for caller to skip */
 569             break;
 570         case UTF8_WARN_OVERFLOW:
 571             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 572                            ouv, *s, startbyte);
 573             break;
 574         case UTF8_WARN_SURROGATE:
 575             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 576             break;
 577         case UTF8_WARN_LONG:
 578             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 579                            (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 580             break;
 581         case UTF8_WARN_FFFF:
 582             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 583             break;
 584         default:
 585             sv_catpvs(sv, "(unknown reason)");
 586             break;
 587         }
 588
 589         if (warning) {
 590             const char * const s = SvPVX_const(sv);
 591
 592             if (PL_op)
 593                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 594                             "%s in %s", s,  OP_DESC(PL_op));
 595             else
 596                 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
 597         }
 598     }
 599
 600     if (retlen)
 601         *retlen = expectlen ? expectlen : len;
 602
 603     return 0;
 604 }
 605
 606 /*
 607 =for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
 608
 609 Returns the native character value of the first character in the string C<s>
 610 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 611 length, in bytes, of that character.
 612
 613 If C<s> does not point to a well-formed UTF-8 character, zero is
 614 returned and retlen is set, if possible, to -1.
 615
 616 =cut
 617 */
 618
 619 UV
 620 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
 621 {
 622     return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
 623                           ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 624 }
 625
 626 /*
 627 =for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
 628
 629 Returns the Unicode code point of the first character in the string C<s>
 630 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 631 length, in bytes, of that character.
 632
 633 This function should only be used when returned UV is considered
 634 an index into the Unicode semantic tables (e.g. swashes).
 635
 636 If C<s> does not point to a well-formed UTF-8 character, zero is
 637 returned and retlen is set, if possible, to -1.
 638
 639 =cut
 640 */
 641
 642 UV
 643 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
 644 {
 645     /* Call the low level routine asking for checks */
 646     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
 647                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 648 }
 649
 650 /*
 651 =for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
 652
 653 Return the length of the UTF-8 char encoded string C<s> in characters.
 654 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 655 up past C<e>, croaks.
 656
 657 =cut
 658 */
 659
 660 STRLEN
 661 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
 662 {
 663     dVAR;
 664     STRLEN len = 0;
 665
 666     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 667      * the bitops (especially ~) can create illegal UTF-8.
 668      * In other words: in Perl UTF-8 is not just for Unicode. */
 669
 670     if (e < s)
 671         goto warn_and_return;
 672     while (s < e) {
 673         const U8 t = UTF8SKIP(s);
 674         if (e - s < t) {
 675             warn_and_return:
 676             if (ckWARN_d(WARN_UTF8)) {
 677                 if (PL_op)
 678                     Perl_warner(aTHX_ packWARN(WARN_UTF8),
 679                             "%s in %s", unees, OP_DESC(PL_op));
 680                 else
 681                     Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 682             }
 683             return len;
 684         }
 685         s += t;
 686         len++;
 687     }
 688
 689     return len;
 690 }
 691
 692 /*
 693 =for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
 694
 695 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
 696 and C<b>.
 697
 698 WARNING: use only if you *know* that the pointers point inside the
 699 same UTF-8 buffer.
 700
 701 =cut
 702 */
 703
 704 IV
 705 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
 706 {
 707     return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
 708 }
 709
 710 /*
 711 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
 712
 713 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 714 forward or backward.
 715
 716 WARNING: do not use the following unless you *know* C<off> is within
 717 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 718 on the first byte of character or just after the last byte of a character.
 719
 720 =cut
 721 */
 722
 723 U8 *
 724 Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
 725 {
 726     PERL_UNUSED_CONTEXT;
 727     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 728      * the bitops (especially ~) can create illegal UTF-8.
 729      * In other words: in Perl UTF-8 is not just for Unicode. */
 730
 731     if (off >= 0) {
 732         while (off--)
 733             s += UTF8SKIP(s);
 734     }
 735     else {
 736         while (off++) {
 737             s--;
 738             while (UTF8_IS_CONTINUATION(*s))
 739                 s--;
 740         }
 741     }
 742     return (U8 *)s;
 743 }
 744
 745 /*
 746 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 747
 748 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 749 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 750 updates len to contain the new length.
 751 Returns zero on failure, setting C<len> to -1.
 752
 753 =cut
 754 */
 755
 756 U8 *
 757 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 758 {
 759     U8 * const save = s;
 760     U8 * const send = s + *len;
 761     U8 *d;
 762
 763     /* ensure valid UTF-8 and chars < 256 before updating string */
 764     while (s < send) {
 765         U8 c = *s++;
 766
 767         if (!UTF8_IS_INVARIANT(c) &&
 768             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 769              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 770             *len = -1;
 771             return 0;
 772         }
 773     }
 774
 775     d = s = save;
 776     while (s < send) {
 777         STRLEN ulen;
 778         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 779         s += ulen;
 780     }
 781     *d = '\0';
 782     *len = d - save;
 783     return save;
 784 }
 785
 786 /*
 787 =for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
 788
 789 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 790 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 791 the newly-created string, and updates C<len> to contain the new
 792 length.  Returns the original string if no conversion occurs, C<len>
 793 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 794 0 if C<s> is converted or contains all 7bit characters.
 795
 796 =cut
 797 */
 798
 799 U8 *
 800 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
 801 {
 802     U8 *d;
 803     const U8 *start = s;
 804     const U8 *send;
 805     I32 count = 0;
 806
 807     PERL_UNUSED_CONTEXT;
 808     if (!*is_utf8)
 809         return (U8 *)start;
 810
 811     /* ensure valid UTF-8 and chars < 256 before converting string */
 812     for (send = s + *len; s < send;) {
 813         U8 c = *s++;
 814         if (!UTF8_IS_INVARIANT(c)) {
 815             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 816                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 817                 count++;
 818             else
 819                 return (U8 *)start;
 820         }
 821     }
 822
 823     *is_utf8 = 0;
 824
 825     Newx(d, (*len) - count + 1, U8);
 826     s = start; start = d;
 827     while (s < send) {
 828         U8 c = *s++;
 829         if (!UTF8_IS_INVARIANT(c)) {
 830             /* Then it is two-byte encoded */
 831             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 832             c = ASCII_TO_NATIVE(c);
 833         }
 834         *d++ = c;
 835     }
 836     *d = '\0';
 837     *len = d - start;
 838     return (U8 *)start;
 839 }
 840
 841 /*
 842 =for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
 843
 844 Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
 845 Returns a pointer to the newly-created string, and sets C<len> to
 846 reflect the new length.
 847
 848 If you want to convert to UTF-8 from other encodings than ASCII,
 849 see sv_recode_to_utf8().
 850
 851 =cut
 852 */
 853
 854 U8*
 855 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
 856 {
 857     const U8 * const send = s + (*len);
 858     U8 *d;
 859     U8 *dst;
 860     PERL_UNUSED_CONTEXT;
 861
 862     Newx(d, (*len) * 2 + 1, U8);
 863     dst = d;
 864
 865     while (s < send) {
 866         const UV uv = NATIVE_TO_ASCII(*s++);
 867         if (UNI_IS_INVARIANT(uv))
 868             *d++ = (U8)UTF_TO_NATIVE(uv);
 869         else {
 870             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
 871             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
 872         }
 873     }
 874     *d = '\0';
 875     *len = d-dst;
 876     return dst;
 877 }
 878
 879 /*
 880  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 881  *
 882  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 883  * We optimize for native, for obvious reasons. */
 884
 885 U8*
 886 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 887 {
 888     U8* pend;
 889     U8* dstart = d;
 890
 891     if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
 892          d[0] = 0;
 893          *newlen = 1;
 894          return d;
 895     }
 896
 897     if (bytelen & 1)
 898         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
 899
 900     pend = p + bytelen;
 901
 902     while (p < pend) {
 903         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 904         p += 2;
 905         if (uv < 0x80) {
 906             *d++ = (U8)uv;
 907             continue;
 908         }
 909         if (uv < 0x800) {
 910             *d++ = (U8)(( uv >>  6)         | 0xc0);
 911             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 912             continue;
 913         }
 914         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 915             UV low = (p[0] << 8) + p[1];
 916             p += 2;
 917             if (low < 0xdc00 || low >= 0xdfff)
 918                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 919             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 920         }
 921         if (uv < 0x10000) {
 922             *d++ = (U8)(( uv >> 12)         | 0xe0);
 923             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 924             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 925             continue;
 926         }
 927         else {
 928             *d++ = (U8)(( uv >> 18)         | 0xf0);
 929             *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 930             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 931             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 932             continue;
 933         }
 934     }
 935     *newlen = d - dstart;
 936     return d;
 937 }
 938
 939 /* Note: this one is slightly destructive of the source. */
 940
 941 U8*
 942 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 943 {
 944     U8* s = (U8*)p;
 945     U8* const send = s + bytelen;
 946     while (s < send) {
 947         const U8 tmp = s[0];
 948         s[0] = s[1];
 949         s[1] = tmp;
 950         s += 2;
 951     }
 952     return utf16_to_utf8(p, d, bytelen, newlen);
 953 }
 954
 955 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 956
 957 bool
 958 Perl_is_uni_alnum(pTHX_ UV c)
 959 {
 960     U8 tmpbuf[UTF8_MAXBYTES+1];
 961     uvchr_to_utf8(tmpbuf, c);
 962     return is_utf8_alnum(tmpbuf);
 963 }
 964
 965 bool
 966 Perl_is_uni_alnumc(pTHX_ UV c)
 967 {
 968     U8 tmpbuf[UTF8_MAXBYTES+1];
 969     uvchr_to_utf8(tmpbuf, c);
 970     return is_utf8_alnumc(tmpbuf);
 971 }
 972
 973 bool
 974 Perl_is_uni_idfirst(pTHX_ UV c)
 975 {
 976     U8 tmpbuf[UTF8_MAXBYTES+1];
 977     uvchr_to_utf8(tmpbuf, c);
 978     return is_utf8_idfirst(tmpbuf);
 979 }
 980
 981 bool
 982 Perl_is_uni_alpha(pTHX_ UV c)
 983 {
 984     U8 tmpbuf[UTF8_MAXBYTES+1];
 985     uvchr_to_utf8(tmpbuf, c);
 986     return is_utf8_alpha(tmpbuf);
 987 }
 988
 989 bool
 990 Perl_is_uni_ascii(pTHX_ UV c)
 991 {
 992     U8 tmpbuf[UTF8_MAXBYTES+1];
 993     uvchr_to_utf8(tmpbuf, c);
 994     return is_utf8_ascii(tmpbuf);
 995 }
 996
 997 bool
 998 Perl_is_uni_space(pTHX_ UV c)
 999 {
1000     U8 tmpbuf[UTF8_MAXBYTES+1];
1001     uvchr_to_utf8(tmpbuf, c);
1002     return is_utf8_space(tmpbuf);
1003 }
1004
1005 bool
1006 Perl_is_uni_digit(pTHX_ UV c)
1007 {
1008     U8 tmpbuf[UTF8_MAXBYTES+1];
1009     uvchr_to_utf8(tmpbuf, c);
1010     return is_utf8_digit(tmpbuf);
1011 }
1012
1013 bool
1014 Perl_is_uni_upper(pTHX_ UV c)
1015 {
1016     U8 tmpbuf[UTF8_MAXBYTES+1];
1017     uvchr_to_utf8(tmpbuf, c);
1018     return is_utf8_upper(tmpbuf);
1019 }
1020
1021 bool
1022 Perl_is_uni_lower(pTHX_ UV c)
1023 {
1024     U8 tmpbuf[UTF8_MAXBYTES+1];
1025     uvchr_to_utf8(tmpbuf, c);
1026     return is_utf8_lower(tmpbuf);
1027 }
1028
1029 bool
1030 Perl_is_uni_cntrl(pTHX_ UV c)
1031 {
1032     U8 tmpbuf[UTF8_MAXBYTES+1];
1033     uvchr_to_utf8(tmpbuf, c);
1034     return is_utf8_cntrl(tmpbuf);
1035 }
1036
1037 bool
1038 Perl_is_uni_graph(pTHX_ UV c)
1039 {
1040     U8 tmpbuf[UTF8_MAXBYTES+1];
1041     uvchr_to_utf8(tmpbuf, c);
1042     return is_utf8_graph(tmpbuf);
1043 }
1044
1045 bool
1046 Perl_is_uni_print(pTHX_ UV c)
1047 {
1048     U8 tmpbuf[UTF8_MAXBYTES+1];
1049     uvchr_to_utf8(tmpbuf, c);
1050     return is_utf8_print(tmpbuf);
1051 }
1052
1053 bool
1054 Perl_is_uni_punct(pTHX_ UV c)
1055 {
1056     U8 tmpbuf[UTF8_MAXBYTES+1];
1057     uvchr_to_utf8(tmpbuf, c);
1058     return is_utf8_punct(tmpbuf);
1059 }
1060
1061 bool
1062 Perl_is_uni_xdigit(pTHX_ UV c)
1063 {
1064     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1065     uvchr_to_utf8(tmpbuf, c);
1066     return is_utf8_xdigit(tmpbuf);
1067 }
1068
1069 UV
1070 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1071 {
1072     uvchr_to_utf8(p, c);
1073     return to_utf8_upper(p, p, lenp);
1074 }
1075
1076 UV
1077 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1078 {
1079     uvchr_to_utf8(p, c);
1080     return to_utf8_title(p, p, lenp);
1081 }
1082
1083 UV
1084 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1085 {
1086     uvchr_to_utf8(p, c);
1087     return to_utf8_lower(p, p, lenp);
1088 }
1089
1090 UV
1091 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1092 {
1093     uvchr_to_utf8(p, c);
1094     return to_utf8_fold(p, p, lenp);
1095 }
1096
1097 /* for now these all assume no locale info available for Unicode > 255 */
1098
1099 bool
1100 Perl_is_uni_alnum_lc(pTHX_ UV c)
1101 {
1102     return is_uni_alnum(c);     /* XXX no locale support yet */
1103 }
1104
1105 bool
1106 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1107 {
1108     return is_uni_alnumc(c);    /* XXX no locale support yet */
1109 }
1110
1111 bool
1112 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1113 {
1114     return is_uni_idfirst(c);   /* XXX no locale support yet */
1115 }
1116
1117 bool
1118 Perl_is_uni_alpha_lc(pTHX_ UV c)
1119 {
1120     return is_uni_alpha(c);     /* XXX no locale support yet */
1121 }
1122
1123 bool
1124 Perl_is_uni_ascii_lc(pTHX_ UV c)
1125 {
1126     return is_uni_ascii(c);     /* XXX no locale support yet */
1127 }
1128
1129 bool
1130 Perl_is_uni_space_lc(pTHX_ UV c)
1131 {
1132     return is_uni_space(c);     /* XXX no locale support yet */
1133 }
1134
1135 bool
1136 Perl_is_uni_digit_lc(pTHX_ UV c)
1137 {
1138     return is_uni_digit(c);     /* XXX no locale support yet */
1139 }
1140
1141 bool
1142 Perl_is_uni_upper_lc(pTHX_ UV c)
1143 {
1144     return is_uni_upper(c);     /* XXX no locale support yet */
1145 }
1146
1147 bool
1148 Perl_is_uni_lower_lc(pTHX_ UV c)
1149 {
1150     return is_uni_lower(c);     /* XXX no locale support yet */
1151 }
1152
1153 bool
1154 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1155 {
1156     return is_uni_cntrl(c);     /* XXX no locale support yet */
1157 }
1158
1159 bool
1160 Perl_is_uni_graph_lc(pTHX_ UV c)
1161 {
1162     return is_uni_graph(c);     /* XXX no locale support yet */
1163 }
1164
1165 bool
1166 Perl_is_uni_print_lc(pTHX_ UV c)
1167 {
1168     return is_uni_print(c);     /* XXX no locale support yet */
1169 }
1170
1171 bool
1172 Perl_is_uni_punct_lc(pTHX_ UV c)
1173 {
1174     return is_uni_punct(c);     /* XXX no locale support yet */
1175 }
1176
1177 bool
1178 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1179 {
1180     return is_uni_xdigit(c);    /* XXX no locale support yet */
1181 }
1182
1183 U32
1184 Perl_to_uni_upper_lc(pTHX_ U32 c)
1185 {
1186     /* XXX returns only the first character -- do not use XXX */
1187     /* XXX no locale support yet */
1188     STRLEN len;
1189     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1190     return (U32)to_uni_upper(c, tmpbuf, &len);
1191 }
1192
1193 U32
1194 Perl_to_uni_title_lc(pTHX_ U32 c)
1195 {
1196     /* XXX returns only the first character XXX -- do not use XXX */
1197     /* XXX no locale support yet */
1198     STRLEN len;
1199     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1200     return (U32)to_uni_title(c, tmpbuf, &len);
1201 }
1202
1203 U32
1204 Perl_to_uni_lower_lc(pTHX_ U32 c)
1205 {
1206     /* XXX returns only the first character -- do not use XXX */
1207     /* XXX no locale support yet */
1208     STRLEN len;
1209     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1210     return (U32)to_uni_lower(c, tmpbuf, &len);
1211 }
1212
1213 static bool
1214 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
1215                  const char *const swashname)
1216 {
1217     dVAR;
1218     if (!is_utf8_char(p))
1219         return FALSE;
1220     if (!*swash)
1221         *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
1222     return swash_fetch(*swash, p, TRUE) != 0;
1223 }
1224
1225 bool
1226 Perl_is_utf8_alnum(pTHX_ const U8 *p)
1227 {
1228     dVAR;
1229     /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1230      * descendant of isalnum(3), in other words, it doesn't
1231      * contain the '_'. --jhi */
1232     return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
1233 }
1234
1235 bool
1236 Perl_is_utf8_alnumc(pTHX_ const U8 *p)
1237 {
1238     dVAR;
1239     return is_utf8_common(p, &PL_utf8_alnumc, "IsAlnumC");
1240 }
1241
1242 bool
1243 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1244 {
1245     dVAR;
1246     if (*p == '_')
1247         return TRUE;
1248     /* is_utf8_idstart would be more logical. */
1249     return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
1250 }
1251
1252 bool
1253 Perl_is_utf8_idcont(pTHX_ const U8 *p)
1254 {
1255     dVAR;
1256     if (*p == '_')
1257         return TRUE;
1258     return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
1259 }
1260
1261 bool
1262 Perl_is_utf8_alpha(pTHX_ const U8 *p)
1263 {
1264     dVAR;
1265     return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
1266 }
1267
1268 bool
1269 Perl_is_utf8_ascii(pTHX_ const U8 *p)
1270 {
1271     dVAR;
1272     return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
1273 }
1274
1275 bool
1276 Perl_is_utf8_space(pTHX_ const U8 *p)
1277 {
1278     dVAR;
1279     return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
1280 }
1281
1282 bool
1283 Perl_is_utf8_digit(pTHX_ const U8 *p)
1284 {
1285     dVAR;
1286     return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
1287 }
1288
1289 bool
1290 Perl_is_utf8_upper(pTHX_ const U8 *p)
1291 {
1292     dVAR;
1293     return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
1294 }
1295
1296 bool
1297 Perl_is_utf8_lower(pTHX_ const U8 *p)
1298 {
1299     dVAR;
1300     return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
1301 }
1302
1303 bool
1304 Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1305 {
1306     dVAR;
1307     return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
1308 }
1309
1310 bool
1311 Perl_is_utf8_graph(pTHX_ const U8 *p)
1312 {
1313     dVAR;
1314     return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
1315 }
1316
1317 bool
1318 Perl_is_utf8_print(pTHX_ const U8 *p)
1319 {
1320     dVAR;
1321     return is_utf8_common(p, &PL_utf8_print, "IsPrint");
1322 }
1323
1324 bool
1325 Perl_is_utf8_punct(pTHX_ const U8 *p)
1326 {
1327     dVAR;
1328     return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
1329 }
1330
1331 bool
1332 Perl_is_utf8_xdigit(pTHX_ const U8 *p)
1333 {
1334     dVAR;
1335     return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit");
1336 }
1337
1338 bool
1339 Perl_is_utf8_mark(pTHX_ const U8 *p)
1340 {
1341     dVAR;
1342     return is_utf8_common(p, &PL_utf8_mark, "IsM");
1343 }
1344
1345 /*
1346 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1347
1348 The "p" contains the pointer to the UTF-8 string encoding
1349 the character that is being converted.
1350
1351 The "ustrp" is a pointer to the character buffer to put the
1352 conversion result to.  The "lenp" is a pointer to the length
1353 of the result.
1354
1355 The "swashp" is a pointer to the swash to use.
1356
1357 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1358 and loaded by SWASHNEW, using lib/utf8_heavy.pl.  The special (usually,
1359 but not always, a multicharacter mapping), is tried first.
1360
1361 The "special" is a string like "utf8::ToSpecLower", which means the
1362 hash %utf8::ToSpecLower.  The access to the hash is through
1363 Perl_to_utf8_case().
1364
1365 The "normal" is a string like "ToLower" which means the swash
1366 %utf8::ToLower.
1367
1368 =cut */
1369
1370 UV
1371 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1372                         SV **swashp, const char *normal, const char *special)
1373 {
1374     dVAR;
1375     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1376     STRLEN len = 0;
1377
1378     const UV uv0 = utf8_to_uvchr(p, NULL);
1379     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1380      * are necessary in EBCDIC, they are redundant no-ops
1381      * in ASCII-ish platforms, and hopefully optimized away. */
1382     const UV uv1 = NATIVE_TO_UNI(uv0);
1383     uvuni_to_utf8(tmpbuf, uv1);
1384
1385     if (!*swashp) /* load on-demand */
1386          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1387
1388     /* The 0xDF is the only special casing Unicode code point below 0x100. */
1389     if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1390          /* It might be "special" (sometimes, but not always,
1391           * a multicharacter mapping) */
1392          HV *hv;
1393          SV **svp;
1394
1395          if ((hv  = get_hv(special, FALSE)) &&
1396              (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1397              (*svp)) {
1398              const char *s;
1399
1400               s = SvPV_const(*svp, len);
1401               if (len == 1)
1402                    len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1403               else {
1404 #ifdef EBCDIC
1405                    /* If we have EBCDIC we need to remap the characters
1406                     * since any characters in the low 256 are Unicode
1407                     * code points, not EBCDIC. */
1408                    U8 *t = (U8*)s, *tend = t + len, *d;
1409
1410                    d = tmpbuf;
1411                    if (SvUTF8(*svp)) {
1412                         STRLEN tlen = 0;
1413
1414                         while (t < tend) {
1415                              const UV c = utf8_to_uvchr(t, &tlen);
1416                              if (tlen > 0) {
1417                                   d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1418                                   t += tlen;
1419                              }
1420                              else
1421                                   break;
1422                         }
1423                    }
1424                    else {
1425                         while (t < tend) {
1426                              d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1427                              t++;
1428                         }
1429                    }
1430                    len = d - tmpbuf;
1431                    Copy(tmpbuf, ustrp, len, U8);
1432 #else
1433                    Copy(s, ustrp, len, U8);
1434 #endif
1435               }
1436          }
1437     }
1438
1439     if (!len && *swashp) {
1440         const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1441
1442          if (uv2) {
1443               /* It was "normal" (a single character mapping). */
1444               const UV uv3 = UNI_TO_NATIVE(uv2);
1445               len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1446          }
1447     }
1448
1449     if (!len) /* Neither: just copy. */
1450          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1451
1452     if (lenp)
1453          *lenp = len;
1454
1455     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1456 }
1457
1458 /*
1459 =for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
1460
1461 Convert the UTF-8 encoded character at p to its uppercase version and
1462 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1463 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1464 the uppercase version may be longer than the original character.
1465
1466 The first character of the uppercased version is returned
1467 (but note, as explained above, that there may be more.)
1468
1469 =cut */
1470
1471 UV
1472 Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1473 {
1474     dVAR;
1475     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1476                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1477 }
1478
1479 /*
1480 =for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
1481
1482 Convert the UTF-8 encoded character at p to its titlecase version and
1483 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1484 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1485 titlecase version may be longer than the original character.
1486
1487 The first character of the titlecased version is returned
1488 (but note, as explained above, that there may be more.)
1489
1490 =cut */
1491
1492 UV
1493 Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1494 {
1495     dVAR;
1496     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1497                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1498 }
1499
1500 /*
1501 =for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
1502
1503 Convert the UTF-8 encoded character at p to its lowercase version and
1504 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1505 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1506 lowercase version may be longer than the original character.
1507
1508 The first character of the lowercased version is returned
1509 (but note, as explained above, that there may be more.)
1510
1511 =cut */
1512
1513 UV
1514 Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1515 {
1516     dVAR;
1517     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1518                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1519 }
1520
1521 /*
1522 =for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
1523
1524 Convert the UTF-8 encoded character at p to its foldcase version and
1525 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1526 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1527 foldcase version may be longer than the original character (up to
1528 three characters).
1529
1530 The first character of the foldcased version is returned
1531 (but note, as explained above, that there may be more.)
1532
1533 =cut */
1534
1535 UV
1536 Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1537 {
1538     dVAR;
1539     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1540                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1541 }
1542
1543 /* Note:
1544  * A "swash" is a swatch hash.
1545  * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1546  * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1547  * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1548  */
1549 SV*
1550 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
1551 {
1552     dVAR;
1553     SV* retval;
1554     SV* const tokenbufsv = sv_newmortal();
1555     dSP;
1556     const size_t pkg_len = strlen(pkg);
1557     const size_t name_len = strlen(name);
1558     HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
1559     SV* errsv_save;
1560
1561     PUSHSTACKi(PERLSI_MAGIC);
1562     ENTER;
1563     SAVEI32(PL_hints);
1564     PL_hints = 0;
1565     save_re_context();
1566     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1567         ENTER;
1568         errsv_save = newSVsv(ERRSV);
1569         /* It is assumed that callers of this routine are not passing in any
1570            user derived data.  */
1571         /* Need to do this after save_re_context() as it will set PL_tainted to
1572            1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1573            Even line to create errsv_save can turn on PL_tainted.  */
1574         SAVEBOOL(PL_tainted);
1575         PL_tainted = 0;
1576         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1577                          NULL);
1578         if (!SvTRUE(ERRSV))
1579             sv_setsv(ERRSV, errsv_save);
1580         SvREFCNT_dec(errsv_save);
1581         LEAVE;
1582     }
1583     SPAGAIN;
1584     PUSHMARK(SP);
1585     EXTEND(SP,5);
1586     PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1587     PUSHs(sv_2mortal(newSVpvn(name, name_len)));
1588     PUSHs(listsv);
1589     PUSHs(sv_2mortal(newSViv(minbits)));
1590     PUSHs(sv_2mortal(newSViv(none)));
1591     PUTBACK;
1592     if (IN_PERL_COMPILETIME) {
1593         /* XXX ought to be handled by lex_start */
1594         SAVEI32(PL_in_my);
1595         PL_in_my = 0;
1596         sv_setpv(tokenbufsv, PL_tokenbuf);
1597     }
1598     errsv_save = newSVsv(ERRSV);
1599     if (call_method("SWASHNEW", G_SCALAR))
1600         retval = newSVsv(*PL_stack_sp--);
1601     else
1602         retval = &PL_sv_undef;
1603     if (!SvTRUE(ERRSV))
1604         sv_setsv(ERRSV, errsv_save);
1605     SvREFCNT_dec(errsv_save);
1606     LEAVE;
1607     POPSTACK;
1608     if (IN_PERL_COMPILETIME) {
1609         STRLEN len;
1610         const char* const pv = SvPV_const(tokenbufsv, len);
1611
1612         Copy(pv, PL_tokenbuf, len+1, char);
1613         CopHINTS_set(PL_curcop, PL_hints);
1614     }
1615     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1616         if (SvPOK(retval))
1617             Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1618                        retval);
1619         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1620     }
1621     return retval;
1622 }
1623
1624
1625 /* This API is wrong for special case conversions since we may need to
1626  * return several Unicode characters for a single Unicode character
1627  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1628  * the lower-level routine, and it is similarly broken for returning
1629  * multiple values.  --jhi */
1630 /* Now SWASHGET is recasted into S_swash_get in this file. */
1631
1632 /* Note:
1633  * Returns the value of property/mapping C<swash> for the first character
1634  * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1635  * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1636  * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1637  */
1638 UV
1639 Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
1640 {
1641     dVAR;
1642     HV* const hv = (HV*)SvRV(swash);
1643     U32 klen;
1644     U32 off;
1645     STRLEN slen;
1646     STRLEN needents;
1647     const U8 *tmps = NULL;
1648     U32 bit;
1649     SV *swatch;
1650     U8 tmputf8[2];
1651     UV c = NATIVE_TO_ASCII(*ptr);
1652
1653     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1654         tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1655         tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1656         ptr = tmputf8;
1657     }
1658     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1659      * then the "swatch" is a vec() for al the chars which start
1660      * with 0xAA..0xYY
1661      * So the key in the hash (klen) is length of encoded char -1
1662      */
1663     klen = UTF8SKIP(ptr) - 1;
1664     off  = ptr[klen];
1665
1666     if (klen == 0) {
1667       /* If char in invariant then swatch is for all the invariant chars
1668        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1669        */
1670         needents = UTF_CONTINUATION_MARK;
1671         off      = NATIVE_TO_UTF(ptr[klen]);
1672     }
1673     else {
1674       /* If char is encoded then swatch is for the prefix */
1675         needents = (1 << UTF_ACCUMULATION_SHIFT);
1676         off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1677     }
1678
1679     /*
1680      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1681      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1682      * it's nothing to sniff at.)  Pity we usually come through at least
1683      * two function calls to get here...
1684      *
1685      * NB: this code assumes that swatches are never modified, once generated!
1686      */
1687
1688     if (hv   == PL_last_swash_hv &&
1689         klen == PL_last_swash_klen &&
1690         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1691     {
1692         tmps = PL_last_swash_tmps;
1693         slen = PL_last_swash_slen;
1694     }
1695     else {
1696         /* Try our second-level swatch cache, kept in a hash. */
1697         SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1698
1699         /* If not cached, generate it via swash_get */
1700         if (!svp || !SvPOK(*svp)
1701                  || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
1702             /* We use utf8n_to_uvuni() as we want an index into
1703                Unicode tables, not a native character number.
1704              */
1705             const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1706                                            ckWARN(WARN_UTF8) ?
1707                                            0 : UTF8_ALLOW_ANY);
1708             swatch = swash_get(swash,
1709                     /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1710                                 (klen) ? (code_point & ~(needents - 1)) : 0,
1711                                 needents);
1712
1713             if (IN_PERL_COMPILETIME)
1714                 CopHINTS_set(PL_curcop, PL_hints);
1715
1716             svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
1717
1718             if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1719                      || (slen << 3) < needents)
1720                 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
1721         }
1722
1723         PL_last_swash_hv = hv;
1724         PL_last_swash_klen = klen;
1725         /* FIXME change interpvar.h?  */
1726         PL_last_swash_tmps = (U8 *) tmps;
1727         PL_last_swash_slen = slen;
1728         if (klen)
1729             Copy(ptr, PL_last_swash_key, klen, U8);
1730     }
1731
1732     switch ((int)((slen << 3) / needents)) {
1733     case 1:
1734         bit = 1 << (off & 7);
1735         off >>= 3;
1736         return (tmps[off] & bit) != 0;
1737     case 8:
1738         return tmps[off];
1739     case 16:
1740         off <<= 1;
1741         return (tmps[off] << 8) + tmps[off + 1] ;
1742     case 32:
1743         off <<= 2;
1744         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1745     }
1746     Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
1747 }
1748
1749 /* Note:
1750  * Returns a swatch (a bit vector string) for a code point sequence
1751  * that starts from the value C<start> and comprises the number C<span>.
1752  * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1753  * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1754  */
1755 STATIC SV*
1756 S_swash_get(pTHX_ SV* swash, UV start, UV span)
1757 {
1758     SV *swatch;
1759     U8 *l, *lend, *x, *xend, *s;
1760     STRLEN lcur, xcur, scur;
1761
1762     HV* const hv = (HV*)SvRV(swash);
1763     SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
1764     SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
1765     SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
1766     SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
1767     SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
1768     const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1769     const int  typeto  = typestr[0] == 'T' && typestr[1] == 'o';
1770     const STRLEN bits  = SvUV(*bitssvp);
1771     const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1772     const UV     none  = SvUV(*nonesvp);
1773     const UV     end   = start + span;
1774
1775     if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
1776         Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1777                                                  (UV)bits);
1778     }
1779
1780     /* create and initialize $swatch */
1781     swatch = newSVpvs("");
1782     scur   = octets ? (span * octets) : (span + 7) / 8;
1783     SvGROW(swatch, scur + 1);
1784     s = (U8*)SvPVX(swatch);
1785     if (octets && none) {
1786         const U8* const e = s + scur;
1787         while (s < e) {
1788             if (bits == 8)
1789                 *s++ = (U8)(none & 0xff);
1790             else if (bits == 16) {
1791                 *s++ = (U8)((none >>  8) & 0xff);
1792                 *s++ = (U8)( none        & 0xff);
1793             }
1794             else if (bits == 32) {
1795                 *s++ = (U8)((none >> 24) & 0xff);
1796                 *s++ = (U8)((none >> 16) & 0xff);
1797                 *s++ = (U8)((none >>  8) & 0xff);
1798                 *s++ = (U8)( none        & 0xff);
1799             }
1800         }
1801         *s = '\0';
1802     }
1803     else {
1804         (void)memzero((U8*)s, scur + 1);
1805     }
1806     SvCUR_set(swatch, scur);
1807     s = (U8*)SvPVX(swatch);
1808
1809     /* read $swash->{LIST} */
1810     l = (U8*)SvPV(*listsvp, lcur);
1811     lend = l + lcur;
1812     while (l < lend) {
1813         UV min, max, val, key;
1814         STRLEN numlen;
1815         I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1816
1817         U8* const nl = (U8*)memchr(l, '\n', lend - l);
1818
1819         numlen = lend - l;
1820         min = grok_hex((char *)l, &numlen, &flags, NULL);
1821         if (numlen)
1822             l += numlen;
1823         else if (nl) {
1824             l = nl + 1; /* 1 is length of "\n" */
1825             continue;
1826         }
1827         else {
1828             l = lend; /* to LIST's end at which \n is not found */
1829             break;
1830         }
1831
1832         if (isBLANK(*l)) {
1833             ++l;
1834             flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1835             numlen = lend - l;
1836             max = grok_hex((char *)l, &numlen, &flags, NULL);
1837             if (numlen)
1838                 l += numlen;
1839             else
1840                 max = min;
1841
1842             if (octets) {
1843                 if (isBLANK(*l)) {
1844                     ++l;
1845                     flags = PERL_SCAN_SILENT_ILLDIGIT |
1846                             PERL_SCAN_DISALLOW_PREFIX;
1847                     numlen = lend - l;
1848                     val = grok_hex((char *)l, &numlen, &flags, NULL);
1849                     if (numlen)
1850                         l += numlen;
1851                     else
1852                         val = 0;
1853                 }
1854                 else {
1855                     val = 0;
1856                     if (typeto) {
1857                         Perl_croak(aTHX_ "%s: illegal mapping '%s'",
1858                                          typestr, l);
1859                     }
1860                 }
1861             }
1862             else
1863                 val = 0; /* bits == 1, then val should be ignored */
1864         }
1865         else {
1866             max = min;
1867             if (octets) {
1868                 val = 0;
1869                 if (typeto) {
1870                     Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
1871                 }
1872             }
1873             else
1874                 val = 0; /* bits == 1, then val should be ignored */
1875         }
1876
1877         if (nl)
1878             l = nl + 1;
1879         else
1880             l = lend;
1881
1882         if (max < start)
1883             continue;
1884
1885         if (octets) {
1886             if (min < start) {
1887                 if (!none || val < none) {
1888                     val += start - min;
1889                 }
1890                 min = start;
1891             }
1892             for (key = min; key <= max; key++) {
1893                 STRLEN offset;
1894                 if (key >= end)
1895                     goto go_out_list;
1896                 /* offset must be non-negative (start <= min <= key < end) */
1897                 offset = octets * (key - start);
1898                 if (bits == 8)
1899                     s[offset] = (U8)(val & 0xff);
1900                 else if (bits == 16) {
1901                     s[offset    ] = (U8)((val >>  8) & 0xff);
1902                     s[offset + 1] = (U8)( val        & 0xff);
1903                 }
1904                 else if (bits == 32) {
1905                     s[offset    ] = (U8)((val >> 24) & 0xff);
1906                     s[offset + 1] = (U8)((val >> 16) & 0xff);
1907                     s[offset + 2] = (U8)((val >>  8) & 0xff);
1908                     s[offset + 3] = (U8)( val        & 0xff);
1909                 }
1910
1911                 if (!none || val < none)
1912                     ++val;
1913             }
1914         }
1915         else { /* bits == 1, then val should be ignored */
1916             if (min < start)
1917                 min = start;
1918             for (key = min; key <= max; key++) {
1919                 const STRLEN offset = (STRLEN)(key - start);
1920                 if (key >= end)
1921                     goto go_out_list;
1922                 s[offset >> 3] |= 1 << (offset & 7);
1923             }
1924         }
1925     } /* while */
1926   go_out_list:
1927
1928     /* read $swash->{EXTRAS} */
1929     x = (U8*)SvPV(*extssvp, xcur);
1930     xend = x + xcur;
1931     while (x < xend) {
1932         STRLEN namelen;
1933         U8 *namestr;
1934         SV** othersvp;
1935         HV* otherhv;
1936         STRLEN otherbits;
1937         SV **otherbitssvp, *other;
1938         U8 *s, *o, *nl;
1939         STRLEN slen, olen;
1940
1941         U8 opc = *x++;
1942         if (opc == '\n')
1943             continue;
1944
1945         nl = (U8*)memchr(x, '\n', xend - x);
1946
1947         if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
1948             if (nl) {
1949                 x = nl + 1; /* 1 is length of "\n" */
1950                 continue;
1951             }
1952             else {
1953                 x = xend; /* to EXTRAS' end at which \n is not found */
1954                 break;
1955             }
1956         }
1957
1958         namestr = x;
1959         if (nl) {
1960             namelen = nl - namestr;
1961             x = nl + 1;
1962         }
1963         else {
1964             namelen = xend - namestr;
1965             x = xend;
1966         }
1967
1968         othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
1969         otherhv = (HV*)SvRV(*othersvp);
1970         otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
1971         otherbits = (STRLEN)SvUV(*otherbitssvp);
1972         if (bits < otherbits)
1973             Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
1974
1975         /* The "other" swatch must be destroyed after. */
1976         other = swash_get(*othersvp, start, span);
1977         o = (U8*)SvPV(other, olen);
1978
1979         if (!olen)
1980             Perl_croak(aTHX_ "panic: swash_get got improper swatch");
1981
1982         s = (U8*)SvPV(swatch, slen);
1983         if (bits == 1 && otherbits == 1) {
1984             if (slen != olen)
1985                 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
1986
1987             switch (opc) {
1988             case '+':
1989                 while (slen--)
1990                     *s++ |= *o++;
1991                 break;
1992             case '!':
1993                 while (slen--)
1994                     *s++ |= ~*o++;
1995                 break;
1996             case '-':
1997                 while (slen--)
1998                     *s++ &= ~*o++;
1999                 break;
2000             case '&':
2001                 while (slen--)
2002                     *s++ &= *o++;
2003                 break;
2004             default:
2005                 break;
2006             }
2007         }
2008         else {
2009             STRLEN otheroctets = otherbits >> 3;
2010             STRLEN offset = 0;
2011             U8* send = s + slen;
2012
2013             while (s < send) {
2014                 UV otherval = 0;
2015
2016                 if (otherbits == 1) {
2017                     otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2018                     ++offset;
2019                 }
2020                 else {
2021                     STRLEN vlen = otheroctets;
2022                     otherval = *o++;
2023                     while (--vlen) {
2024                         otherval <<= 8;
2025                         otherval |= *o++;
2026                     }
2027                 }
2028
2029                 if (opc == '+' && otherval)
2030                     /*EMPTY*/;   /* replace with otherval */
2031                 else if (opc == '!' && !otherval)
2032                     otherval = 1;
2033                 else if (opc == '-' && otherval)
2034                     otherval = 0;
2035                 else if (opc == '&' && !otherval)
2036                     otherval = 0;
2037                 else {
2038                     s += octets; /* no replacement */
2039                     continue;
2040                 }
2041
2042                 if (bits == 8)
2043                     *s++ = (U8)( otherval & 0xff);
2044                 else if (bits == 16) {
2045                     *s++ = (U8)((otherval >>  8) & 0xff);
2046                     *s++ = (U8)( otherval        & 0xff);
2047                 }
2048                 else if (bits == 32) {
2049                     *s++ = (U8)((otherval >> 24) & 0xff);
2050                     *s++ = (U8)((otherval >> 16) & 0xff);
2051                     *s++ = (U8)((otherval >>  8) & 0xff);
2052                     *s++ = (U8)( otherval        & 0xff);
2053                 }
2054             }
2055         }
2056         sv_free(other); /* through with it! */
2057     } /* while */
2058     return swatch;
2059 }
2060
2061 /*
2062 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2063
2064 Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2065 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2066 bytes available. The return value is the pointer to the byte after the
2067 end of the new character. In other words,
2068
2069     d = uvchr_to_utf8(d, uv);
2070
2071 is the recommended wide native character-aware way of saying
2072
2073     *(d++) = uv;
2074
2075 =cut
2076 */
2077
2078 /* On ASCII machines this is normally a macro but we want a
2079    real function in case XS code wants it
2080 */
2081 U8 *
2082 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2083 {
2084     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2085 }
2086
2087 U8 *
2088 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2089 {
2090     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2091 }
2092
2093 /*
2094 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32
2095 flags
2096
2097 Returns the native character value of the first character in the string
2098 C<s>
2099 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2100 length, in bytes, of that character.
2101
2102 Allows length and flags to be passed to low level routine.
2103
2104 =cut
2105 */
2106 /* On ASCII machines this is normally a macro but we want
2107    a real function in case XS code wants it
2108 */
2109 UV
2110 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2111 U32 flags)
2112 {
2113     const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2114     return UNI_TO_NATIVE(uv);
2115 }
2116
2117 /*
2118 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
2119
2120 Build to the scalar dsv a displayable version of the string spv,
2121 length len, the displayable version being at most pvlim bytes long
2122 (if longer, the rest is truncated and "..." will be appended).
2123
2124 The flags argument can have UNI_DISPLAY_ISPRINT set to display
2125 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
2126 to display the \\[nrfta\\] as the backslashed versions (like '\n')
2127 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2128 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2129 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2130
2131 The pointer to the PV of the dsv is returned.
2132
2133 =cut */
2134 char *
2135 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
2136 {
2137     int truncated = 0;
2138     const char *s, *e;
2139
2140     sv_setpvn(dsv, "", 0);
2141     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
2142          UV u;
2143           /* This serves double duty as a flag and a character to print after
2144              a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2145           */
2146          char ok = 0;
2147
2148          if (pvlim && SvCUR(dsv) >= pvlim) {
2149               truncated++;
2150               break;
2151          }
2152          u = utf8_to_uvchr((U8*)s, 0);
2153          if (u < 256) {
2154              const unsigned char c = (unsigned char)u & 0xFF;
2155              if (flags & UNI_DISPLAY_BACKSLASH) {
2156                  switch (c) {
2157                  case '\n':
2158                      ok = 'n'; break;
2159                  case '\r':
2160                      ok = 'r'; break;
2161                  case '\t':
2162                      ok = 't'; break;
2163                  case '\f':
2164                      ok = 'f'; break;
2165                  case '\a':
2166                      ok = 'a'; break;
2167                  case '\\':
2168                      ok = '\\'; break;
2169                  default: break;
2170                  }
2171                  if (ok) {
2172                      Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
2173                  }
2174              }
2175              /* isPRINT() is the locale-blind version. */
2176              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2177                  Perl_sv_catpvf(aTHX_ dsv, "%c", c);
2178                  ok = 1;
2179              }
2180          }
2181          if (!ok)
2182              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
2183     }
2184     if (truncated)
2185          sv_catpvs(dsv, "...");
2186
2187     return SvPVX(dsv);
2188 }
2189
2190 /*
2191 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
2192
2193 Build to the scalar dsv a displayable version of the scalar sv,
2194 the displayable version being at most pvlim bytes long
2195 (if longer, the rest is truncated and "..." will be appended).
2196
2197 The flags argument is as in pv_uni_display().
2198
2199 The pointer to the PV of the dsv is returned.
2200
2201 =cut
2202 */
2203 char *
2204 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2205 {
2206      return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2207                                 SvCUR(ssv), pvlim, flags);
2208 }
2209
2210 /*
2211 =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
2212
2213 Return true if the strings s1 and s2 differ case-insensitively, false
2214 if not (if they are equal case-insensitively).  If u1 is true, the
2215 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
2216 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
2217 are false, the respective string is assumed to be in native 8-bit
2218 encoding.
2219
2220 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2221 in there (they will point at the beginning of the I<next> character).
2222 If the pointers behind pe1 or pe2 are non-NULL, they are the end
2223 pointers beyond which scanning will not continue under any
2224 circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
2225 s2+l2 will be used as goal end pointers that will also stop the scan,
2226 and which qualify towards defining a successful match: all the scans
2227 that define an explicit length must reach their goal pointers for
2228 a match to succeed).
2229
2230 For case-insensitiveness, the "casefolding" of Unicode is used
2231 instead of upper/lowercasing both the characters, see
2232 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2233
2234 =cut */
2235 I32
2236 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
2237 {
2238      dVAR;
2239      register const U8 *p1  = (const U8*)s1;
2240      register const U8 *p2  = (const U8*)s2;
2241      register const U8 *f1 = NULL;
2242      register const U8 *f2 = NULL;
2243      register U8 *e1 = NULL;
2244      register U8 *q1 = NULL;
2245      register U8 *e2 = NULL;
2246      register U8 *q2 = NULL;
2247      STRLEN n1 = 0, n2 = 0;
2248      U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2249      U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
2250      U8 natbuf[1+1];
2251      STRLEN foldlen1, foldlen2;
2252      bool match;
2253
2254      if (pe1)
2255           e1 = *(U8**)pe1;
2256      if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2257           f1 = (const U8*)s1 + l1;
2258      if (pe2)
2259           e2 = *(U8**)pe2;
2260      if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2261           f2 = (const U8*)s2 + l2;
2262
2263      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2264           return 1; /* mismatch; possible infinite loop or false positive */
2265
2266      if (!u1 || !u2)
2267           natbuf[1] = 0; /* Need to terminate the buffer. */
2268
2269      while ((e1 == 0 || p1 < e1) &&
2270             (f1 == 0 || p1 < f1) &&
2271             (e2 == 0 || p2 < e2) &&
2272             (f2 == 0 || p2 < f2)) {
2273           if (n1 == 0) {
2274                if (u1)
2275                     to_utf8_fold(p1, foldbuf1, &foldlen1);
2276                else {
2277                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
2278                     to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2279                }
2280                q1 = foldbuf1;
2281                n1 = foldlen1;
2282           }
2283           if (n2 == 0) {
2284                if (u2)
2285                     to_utf8_fold(p2, foldbuf2, &foldlen2);
2286                else {
2287                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
2288                     to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2289                }
2290                q2 = foldbuf2;
2291                n2 = foldlen2;
2292           }
2293           while (n1 && n2) {
2294                if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2295                    (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2296                     memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2297                    return 1; /* mismatch */
2298                n1 -= UTF8SKIP(q1);
2299                q1 += UTF8SKIP(q1);
2300                n2 -= UTF8SKIP(q2);
2301                q2 += UTF8SKIP(q2);
2302           }
2303           if (n1 == 0)
2304                p1 += u1 ? UTF8SKIP(p1) : 1;
2305           if (n2 == 0)
2306                p2 += u2 ? UTF8SKIP(p2) : 1;
2307
2308      }
2309
2310      /* A match is defined by all the scans that specified
2311       * an explicit length reaching their final goals. */
2312      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2313
2314      if (match) {
2315           if (pe1)
2316                *pe1 = (char*)p1;
2317           if (pe2)
2318                *pe2 = (char*)p2;
2319      }
2320
2321      return match ? 0 : 1; /* 0 match, 1 mismatch */
2322 }
2323
2324 /*
2325  * Local variables:
2326  * c-indentation-style: bsd
2327  * c-basic-offset: 4
2328  * indent-tabs-mode: t
2329  * End:
2330  *
2331  * ex: set ts=8 sts=4 sw=4 noet:
2332  */