utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006,
   4  *    by Larry Wall and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  13  * heard of that we don't want to see any closer; and that's the one place
  14  * we're trying to get to!  And that's just where we can't get, nohow.'
  15  *
  16  * 'Well do I understand your speech,' he answered in the same language;
  17  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  18  * as is the custom in the West, if you wish to be answered?'
  19  *
  20  * ...the travellers perceived that the floor was paved with stones of many
  21  * hues; branching runes and strange devices intertwined beneath their feet.
  22  */
  23
  24 #include "EXTERN.h"
  25 #define PERL_IN_UTF8_C
  26 #include "perl.h"
  27
  28 static const char unees[] =
  29     "Malformed UTF-8 character (unexpected end of string)";
  30
  31 /*
  32 =head1 Unicode Support
  33
  34 This file contains various utility functions for manipulating UTF8-encoded
  35 strings. For the uninitiated, this is a method of representing arbitrary
  36 Unicode characters as a variable number of bytes, in such a way that
  37 characters in the ASCII range are unmodified, and a zero byte never appears
  38 within non-zero characters.
  39
  40 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
  41
  42 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
  43 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
  44 bytes available. The return value is the pointer to the byte after the
  45 end of the new character. In other words,
  46
  47     d = uvuni_to_utf8_flags(d, uv, flags);
  48
  49 or, in most cases,
  50
  51     d = uvuni_to_utf8(d, uv);
  52
  53 (which is equivalent to)
  54
  55     d = uvuni_to_utf8_flags(d, uv, 0);
  56
  57 is the recommended Unicode-aware way of saying
  58
  59     *(d++) = uv;
  60
  61 =cut
  62 */
  63
  64 U8 *
  65 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  66 {
  67     if (ckWARN(WARN_UTF8)) {
  68          if (UNICODE_IS_SURROGATE(uv) &&
  69              !(flags & UNICODE_ALLOW_SURROGATE))
  70               Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
  71          else if (
  72                   ((uv >= 0xFDD0 && uv <= 0xFDEF &&
  73                     !(flags & UNICODE_ALLOW_FDD0))
  74                    ||
  75                    ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
  76                     !(flags & UNICODE_ALLOW_FFFF))) &&
  77                   /* UNICODE_ALLOW_SUPER includes
  78                    * FFFEs and FFFFs beyond 0x10FFFF. */
  79                   ((uv <= PERL_UNICODE_MAX) ||
  80                    !(flags & UNICODE_ALLOW_SUPER))
  81                   )
  82               Perl_warner(aTHX_ packWARN(WARN_UTF8),
  83                          "Unicode character 0x%04"UVxf" is illegal", uv);
  84     }
  85     if (UNI_IS_INVARIANT(uv)) {
  86         *d++ = (U8)UTF_TO_NATIVE(uv);
  87         return d;
  88     }
  89 #if defined(EBCDIC)
  90     else {
  91         STRLEN len  = UNISKIP(uv);
  92         U8 *p = d+len-1;
  93         while (p > d) {
  94             *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
  95             uv >>= UTF_ACCUMULATION_SHIFT;
  96         }
  97         *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
  98         return d+len;
  99     }
 100 #else /* Non loop style */
 101     if (uv < 0x800) {
 102         *d++ = (U8)(( uv >>  6)         | 0xc0);
 103         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 104         return d;
 105     }
 106     if (uv < 0x10000) {
 107         *d++ = (U8)(( uv >> 12)         | 0xe0);
 108         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 109         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 110         return d;
 111     }
 112     if (uv < 0x200000) {
 113         *d++ = (U8)(( uv >> 18)         | 0xf0);
 114         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 115         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 116         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 117         return d;
 118     }
 119     if (uv < 0x4000000) {
 120         *d++ = (U8)(( uv >> 24)         | 0xf8);
 121         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 122         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 123         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 124         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 125         return d;
 126     }
 127     if (uv < 0x80000000) {
 128         *d++ = (U8)(( uv >> 30)         | 0xfc);
 129         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 130         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 131         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 132         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 133         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 134         return d;
 135     }
 136 #ifdef HAS_QUAD
 137     if (uv < UTF8_QUAD_MAX)
 138 #endif
 139     {
 140         *d++ =                            0xfe; /* Can't match U+FEFF! */
 141         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 142         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 143         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 144         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 145         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 146         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 147         return d;
 148     }
 149 #ifdef HAS_QUAD
 150     {
 151         *d++ =                            0xff;         /* Can't match U+FFFE! */
 152         *d++ =                            0x80;         /* 6 Reserved bits */
 153         *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);        /* 2 Reserved bits */
 154         *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
 155         *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
 156         *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
 157         *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
 158         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 159         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 160         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 161         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 162         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 163         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 164         return d;
 165     }
 166 #endif
 167 #endif /* Loop style */
 168 }
 169
 170 /*
 171
 172 Tests if some arbitrary number of bytes begins in a valid UTF-8
 173 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 174 UTF-8 character.  The actual number of bytes in the UTF-8 character
 175 will be returned if it is valid, otherwise 0.
 176
 177 This is the "slow" version as opposed to the "fast" version which is
 178 the "unrolled" IS_UTF8_CHAR().  E.g. for t/uni/class.t the speed
 179 difference is a factor of 2 to 3.  For lengths (UTF8SKIP(s)) of four
 180 or less you should use the IS_UTF8_CHAR(), for lengths of five or more
 181 you should use the _slow().  In practice this means that the _slow()
 182 will be used very rarely, since the maximum Unicode code point (as of
 183 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes.  Only
 184 the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
 185 five bytes or more.
 186
 187 =cut */
 188 STATIC STRLEN
 189 S_is_utf8_char_slow(pTHX_ const U8 *s, const STRLEN len)
 190 {
 191     U8 u = *s;
 192     STRLEN slen;
 193     UV uv, ouv;
 194
 195     if (UTF8_IS_INVARIANT(u))
 196         return 1;
 197
 198     if (!UTF8_IS_START(u))
 199         return 0;
 200
 201     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 202         return 0;
 203
 204     slen = len - 1;
 205     s++;
 206 #ifdef EBCDIC
 207     u = NATIVE_TO_UTF(u);
 208 #endif
 209     u &= UTF_START_MASK(len);
 210     uv  = u;
 211     ouv = uv;
 212     while (slen--) {
 213         if (!UTF8_IS_CONTINUATION(*s))
 214             return 0;
 215         uv = UTF8_ACCUMULATE(uv, *s);
 216         if (uv < ouv)
 217             return 0;
 218         ouv = uv;
 219         s++;
 220     }
 221
 222     if ((STRLEN)UNISKIP(uv) < len)
 223         return 0;
 224
 225     return len;
 226 }
 227
 228 /*
 229 =for apidoc A|STRLEN|is_utf8_char|U8 *s
 230
 231 Tests if some arbitrary number of bytes begins in a valid UTF-8
 232 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 233 UTF-8 character.  The actual number of bytes in the UTF-8 character
 234 will be returned if it is valid, otherwise 0.
 235
 236 =cut */
 237 STRLEN
 238 Perl_is_utf8_char(pTHX_ U8 *s)
 239 {
 240     const STRLEN len = UTF8SKIP(s);
 241 #ifdef IS_UTF8_CHAR
 242     if (IS_UTF8_CHAR_FAST(len))
 243         return IS_UTF8_CHAR(s, len) ? len : 0;
 244 #endif /* #ifdef IS_UTF8_CHAR */
 245     return is_utf8_char_slow(s, len);
 246 }
 247
 248 /*
 249 =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
 250
 251 Returns true if first C<len> bytes of the given string form a valid
 252 UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
 253 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
 254 because a valid ASCII string is a valid UTF-8 string.
 255
 256 See also is_utf8_string_loclen() and is_utf8_string_loc().
 257
 258 =cut
 259 */
 260
 261 bool
 262 Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
 263 {
 264     const U8* x = s;
 265     const U8* send;
 266
 267     if (!len)
 268         len = strlen((const char *)s);
 269     send = s + len;
 270
 271     while (x < send) {
 272         STRLEN c;
 273          /* Inline the easy bits of is_utf8_char() here for speed... */
 274          if (UTF8_IS_INVARIANT(*x))
 275               c = 1;
 276          else if (!UTF8_IS_START(*x))
 277              goto out;
 278          else {
 279               /* ... and call is_utf8_char() only if really needed. */
 280 #ifdef IS_UTF8_CHAR
 281              c = UTF8SKIP(x);
 282              if (IS_UTF8_CHAR_FAST(c)) {
 283                  if (!IS_UTF8_CHAR(x, c))
 284                      goto out;
 285              } else if (!is_utf8_char_slow(x, c))
 286                  goto out;
 287 #else
 288              c = is_utf8_char(x);
 289 #endif /* #ifdef IS_UTF8_CHAR */
 290               if (!c)
 291                   goto out;
 292          }
 293         x += c;
 294     }
 295
 296  out:
 297     if (x != send)
 298         return FALSE;
 299
 300     return TRUE;
 301 }
 302
 303 /*
 304 Implemented as a macro in utf8.h
 305
 306 =for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep
 307
 308 Like is_utf8_string() but stores the location of the failure (in the
 309 case of "utf8ness failure") or the location s+len (in the case of
 310 "utf8ness success") in the C<ep>.
 311
 312 See also is_utf8_string_loclen() and is_utf8_string().
 313
 314 =for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
 315
 316 Like is_utf8_string() but stores the location of the failure (in the
 317 case of "utf8ness failure") or the location s+len (in the case of
 318 "utf8ness success") in the C<ep>, and the number of UTF-8
 319 encoded characters in the C<el>.
 320
 321 See also is_utf8_string_loc() and is_utf8_string().
 322
 323 =cut
 324 */
 325
 326 bool
 327 Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
 328 {
 329     const U8* x = s;
 330     const U8* send;
 331     STRLEN c;
 332
 333     if (!len)
 334         len = strlen((const char *)s);
 335     send = s + len;
 336     if (el)
 337         *el = 0;
 338
 339     while (x < send) {
 340          /* Inline the easy bits of is_utf8_char() here for speed... */
 341          if (UTF8_IS_INVARIANT(*x))
 342              c = 1;
 343          else if (!UTF8_IS_START(*x))
 344              goto out;
 345          else {
 346              /* ... and call is_utf8_char() only if really needed. */
 347 #ifdef IS_UTF8_CHAR
 348              c = UTF8SKIP(x);
 349              if (IS_UTF8_CHAR_FAST(c)) {
 350                  if (!IS_UTF8_CHAR(x, c))
 351                      c = 0;
 352              } else
 353                  c = is_utf8_char_slow(x, c);
 354 #else
 355              c = is_utf8_char(x);
 356 #endif /* #ifdef IS_UTF8_CHAR */
 357              if (!c)
 358                  goto out;
 359          }
 360          x += c;
 361          if (el)
 362              (*el)++;
 363     }
 364
 365  out:
 366     if (ep)
 367         *ep = x;
 368     if (x != send)
 369         return FALSE;
 370
 371     return TRUE;
 372 }
 373
 374 /*
 375
 376 =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 377
 378 Bottom level UTF-8 decode routine.
 379 Returns the unicode code point value of the first character in the string C<s>
 380 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
 381 C<retlen> will be set to the length, in bytes, of that character.
 382
 383 If C<s> does not point to a well-formed UTF-8 character, the behaviour
 384 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 385 it is assumed that the caller will raise a warning, and this function
 386 will silently just set C<retlen> to C<-1> and return zero.  If the
 387 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 388 malformations will be given, C<retlen> will be set to the expected
 389 length of the UTF-8 character in bytes, and zero will be returned.
 390
 391 The C<flags> can also contain various flags to allow deviations from
 392 the strict UTF-8 encoding (see F<utf8.h>).
 393
 394 Most code should use utf8_to_uvchr() rather than call this directly.
 395
 396 =cut
 397 */
 398
 399 UV
 400 Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 401 {
 402     const U8 *s0 = s;
 403     UV uv = *s, ouv = 0;
 404     STRLEN len = 1;
 405     const bool dowarn = ckWARN_d(WARN_UTF8);
 406     const UV startbyte = *s;
 407     STRLEN expectlen = 0;
 408     U32 warning = 0;
 409
 410 /* This list is a superset of the UTF8_ALLOW_XXX. */
 411
 412 #define UTF8_WARN_EMPTY                          1
 413 #define UTF8_WARN_CONTINUATION                   2
 414 #define UTF8_WARN_NON_CONTINUATION               3
 415 #define UTF8_WARN_FE_FF                          4
 416 #define UTF8_WARN_SHORT                          5
 417 #define UTF8_WARN_OVERFLOW                       6
 418 #define UTF8_WARN_SURROGATE                      7
 419 #define UTF8_WARN_LONG                           8
 420 #define UTF8_WARN_FFFF                           9 /* Also FFFE. */
 421
 422     if (curlen == 0 &&
 423         !(flags & UTF8_ALLOW_EMPTY)) {
 424         warning = UTF8_WARN_EMPTY;
 425         goto malformed;
 426     }
 427
 428     if (UTF8_IS_INVARIANT(uv)) {
 429         if (retlen)
 430             *retlen = 1;
 431         return (UV) (NATIVE_TO_UTF(*s));
 432     }
 433
 434     if (UTF8_IS_CONTINUATION(uv) &&
 435         !(flags & UTF8_ALLOW_CONTINUATION)) {
 436         warning = UTF8_WARN_CONTINUATION;
 437         goto malformed;
 438     }
 439
 440     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 441         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 442         warning = UTF8_WARN_NON_CONTINUATION;
 443         goto malformed;
 444     }
 445
 446 #ifdef EBCDIC
 447     uv = NATIVE_TO_UTF(uv);
 448 #else
 449     if ((uv == 0xfe || uv == 0xff) &&
 450         !(flags & UTF8_ALLOW_FE_FF)) {
 451         warning = UTF8_WARN_FE_FF;
 452         goto malformed;
 453     }
 454 #endif
 455
 456     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 457     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 458     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 459     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 460 #ifdef EBCDIC
 461     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 462     else                        { len =  7; uv &= 0x01; }
 463 #else
 464     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 465     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 466     else                        { len = 13; uv = 0; } /* whoa! */
 467 #endif
 468
 469     if (retlen)
 470         *retlen = len;
 471
 472     expectlen = len;
 473
 474     if ((curlen < expectlen) &&
 475         !(flags & UTF8_ALLOW_SHORT)) {
 476         warning = UTF8_WARN_SHORT;
 477         goto malformed;
 478     }
 479
 480     len--;
 481     s++;
 482     ouv = uv;
 483
 484     while (len--) {
 485         if (!UTF8_IS_CONTINUATION(*s) &&
 486             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 487             s--;
 488             warning = UTF8_WARN_NON_CONTINUATION;
 489             goto malformed;
 490         }
 491         else
 492             uv = UTF8_ACCUMULATE(uv, *s);
 493         if (!(uv > ouv)) {
 494             /* These cannot be allowed. */
 495             if (uv == ouv) {
 496                 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
 497                     warning = UTF8_WARN_LONG;
 498                     goto malformed;
 499                 }
 500             }
 501             else { /* uv < ouv */
 502                 /* This cannot be allowed. */
 503                 warning = UTF8_WARN_OVERFLOW;
 504                 goto malformed;
 505             }
 506         }
 507         s++;
 508         ouv = uv;
 509     }
 510
 511     if (UNICODE_IS_SURROGATE(uv) &&
 512         !(flags & UTF8_ALLOW_SURROGATE)) {
 513         warning = UTF8_WARN_SURROGATE;
 514         goto malformed;
 515     } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
 516                !(flags & UTF8_ALLOW_LONG)) {
 517         warning = UTF8_WARN_LONG;
 518         goto malformed;
 519     } else if (UNICODE_IS_ILLEGAL(uv) &&
 520                !(flags & UTF8_ALLOW_FFFF)) {
 521         warning = UTF8_WARN_FFFF;
 522         goto malformed;
 523     }
 524
 525     return uv;
 526
 527 malformed:
 528
 529     if (flags & UTF8_CHECK_ONLY) {
 530         if (retlen)
 531             *retlen = -1;
 532         return 0;
 533     }
 534
 535     if (dowarn) {
 536         SV* const sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
 537
 538         switch (warning) {
 539         case 0: /* Intentionally empty. */ break;
 540         case UTF8_WARN_EMPTY:
 541             Perl_sv_catpv(aTHX_ sv, "(empty string)");
 542             break;
 543         case UTF8_WARN_CONTINUATION:
 544             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 545             break;
 546         case UTF8_WARN_NON_CONTINUATION:
 547             if (s == s0)
 548                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 549                            (UV)s[1], startbyte);
 550             else {
 551                 const int len = (int)(s-s0);
 552                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 553                            (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
 554             }
 555
 556             break;
 557         case UTF8_WARN_FE_FF:
 558             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 559             break;
 560         case UTF8_WARN_SHORT:
 561             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 562                            (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
 563             expectlen = curlen;         /* distance for caller to skip */
 564             break;
 565         case UTF8_WARN_OVERFLOW:
 566             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 567                            ouv, *s, startbyte);
 568             break;
 569         case UTF8_WARN_SURROGATE:
 570             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 571             break;
 572         case UTF8_WARN_LONG:
 573             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 574                            (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 575             break;
 576         case UTF8_WARN_FFFF:
 577             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 578             break;
 579         default:
 580             Perl_sv_catpv(aTHX_ sv, "(unknown reason)");
 581             break;
 582         }
 583
 584         if (warning) {
 585             const char * const s = SvPVX_const(sv);
 586
 587             if (PL_op)
 588                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 589                             "%s in %s", s,  OP_DESC(PL_op));
 590             else
 591                 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
 592         }
 593     }
 594
 595     if (retlen)
 596         *retlen = expectlen ? expectlen : len;
 597
 598     return 0;
 599 }
 600
 601 /*
 602 =for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
 603
 604 Returns the native character value of the first character in the string C<s>
 605 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 606 length, in bytes, of that character.
 607
 608 If C<s> does not point to a well-formed UTF-8 character, zero is
 609 returned and retlen is set, if possible, to -1.
 610
 611 =cut
 612 */
 613
 614 UV
 615 Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
 616 {
 617     return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
 618                           ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 619 }
 620
 621 /*
 622 =for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
 623
 624 Returns the Unicode code point of the first character in the string C<s>
 625 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 626 length, in bytes, of that character.
 627
 628 This function should only be used when returned UV is considered
 629 an index into the Unicode semantic tables (e.g. swashes).
 630
 631 If C<s> does not point to a well-formed UTF-8 character, zero is
 632 returned and retlen is set, if possible, to -1.
 633
 634 =cut
 635 */
 636
 637 UV
 638 Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
 639 {
 640     /* Call the low level routine asking for checks */
 641     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
 642                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 643 }
 644
 645 /*
 646 =for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
 647
 648 Return the length of the UTF-8 char encoded string C<s> in characters.
 649 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 650 up past C<e>, croaks.
 651
 652 =cut
 653 */
 654
 655 STRLEN
 656 Perl_utf8_length(pTHX_ U8 *s, U8 *e)
 657 {
 658     STRLEN len = 0;
 659
 660     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 661      * the bitops (especially ~) can create illegal UTF-8.
 662      * In other words: in Perl UTF-8 is not just for Unicode. */
 663
 664     if (e < s)
 665         goto warn_and_return;
 666     while (s < e) {
 667         const U8 t = UTF8SKIP(s);
 668         if (e - s < t) {
 669             warn_and_return:
 670             if (ckWARN_d(WARN_UTF8)) {
 671                 if (PL_op)
 672                     Perl_warner(aTHX_ packWARN(WARN_UTF8),
 673                             "%s in %s", unees, OP_DESC(PL_op));
 674                 else
 675                     Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 676             }
 677             return len;
 678         }
 679         s += t;
 680         len++;
 681     }
 682
 683     return len;
 684 }
 685
 686 /*
 687 =for apidoc A|IV|utf8_distance|U8 *a|U8 *b
 688
 689 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
 690 and C<b>.
 691
 692 WARNING: use only if you *know* that the pointers point inside the
 693 same UTF-8 buffer.
 694
 695 =cut
 696 */
 697
 698 IV
 699 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
 700 {
 701     return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
 702 }
 703
 704 /*
 705 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
 706
 707 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 708 forward or backward.
 709
 710 WARNING: do not use the following unless you *know* C<off> is within
 711 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 712 on the first byte of character or just after the last byte of a character.
 713
 714 =cut
 715 */
 716
 717 U8 *
 718 Perl_utf8_hop(pTHX_ U8 *s, I32 off)
 719 {
 720     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 721      * the bitops (especially ~) can create illegal UTF-8.
 722      * In other words: in Perl UTF-8 is not just for Unicode. */
 723
 724     if (off >= 0) {
 725         while (off--)
 726             s += UTF8SKIP(s);
 727     }
 728     else {
 729         while (off++) {
 730             s--;
 731             while (UTF8_IS_CONTINUATION(*s))
 732                 s--;
 733         }
 734     }
 735     return (U8 *)s;
 736 }
 737
 738 /*
 739 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 740
 741 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 742 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 743 updates len to contain the new length.
 744 Returns zero on failure, setting C<len> to -1.
 745
 746 =cut
 747 */
 748
 749 U8 *
 750 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 751 {
 752     U8 *send;
 753     U8 *d;
 754     U8 *save = s;
 755
 756     /* ensure valid UTF-8 and chars < 256 before updating string */
 757     for (send = s + *len; s < send; ) {
 758         U8 c = *s++;
 759
 760         if (!UTF8_IS_INVARIANT(c) &&
 761             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 762              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 763             *len = -1;
 764             return 0;
 765         }
 766     }
 767
 768     d = s = save;
 769     while (s < send) {
 770         STRLEN ulen;
 771         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 772         s += ulen;
 773     }
 774     *d = '\0';
 775     *len = d - save;
 776     return save;
 777 }
 778
 779 /*
 780 =for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
 781
 782 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 783 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 784 the newly-created string, and updates C<len> to contain the new
 785 length.  Returns the original string if no conversion occurs, C<len>
 786 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 787 0 if C<s> is converted or contains all 7bit characters.
 788
 789 =cut
 790 */
 791
 792 U8 *
 793 Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
 794 {
 795     U8 *d;
 796     const U8 *start = s;
 797     const U8 *send;
 798     I32 count = 0;
 799     const U8 *s2;
 800
 801     if (!*is_utf8)
 802         return (U8 *)start;
 803
 804     /* ensure valid UTF-8 and chars < 256 before converting string */
 805     for (send = s + *len; s < send;) {
 806         U8 c = *s++;
 807         if (!UTF8_IS_INVARIANT(c)) {
 808             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 809                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 810                 count++;
 811             else
 812                 return (U8 *)start;
 813         }
 814     }
 815
 816     *is_utf8 = 0;
 817
 818     Newxz(d, (*len) - count + 1, U8);
 819     s2 = start; start = d;
 820     while (s2 < send) {
 821         U8 c = *s2++;
 822         if (!UTF8_IS_INVARIANT(c)) {
 823             /* Then it is two-byte encoded */
 824             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s2++);
 825             c = ASCII_TO_NATIVE(c);
 826         }
 827         *d++ = c;
 828     }
 829     *d = '\0';
 830     *len = d - start;
 831     return (U8 *)start;
 832 }
 833
 834 /*
 835 =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
 836
 837 Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
 838 Returns a pointer to the newly-created string, and sets C<len> to
 839 reflect the new length.
 840
 841 If you want to convert to UTF-8 from other encodings than ASCII,
 842 see sv_recode_to_utf8().
 843
 844 =cut
 845 */
 846
 847 U8*
 848 Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
 849 {
 850     const U8 * const send = s + (*len);
 851     U8 *d;
 852     U8 *dst;
 853
 854     Newxz(d, (*len) * 2 + 1, U8);
 855     dst = d;
 856
 857     while (s < send) {
 858         const UV uv = NATIVE_TO_ASCII(*s++);
 859         if (UNI_IS_INVARIANT(uv))
 860             *d++ = (U8)UTF_TO_NATIVE(uv);
 861         else {
 862             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
 863             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
 864         }
 865     }
 866     *d = '\0';
 867     *len = d-dst;
 868     return dst;
 869 }
 870
 871 /*
 872  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 873  *
 874  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 875  * We optimize for native, for obvious reasons. */
 876
 877 U8*
 878 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 879 {
 880     U8* pend;
 881     U8* dstart = d;
 882
 883     if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
 884          d[0] = 0;
 885          *newlen = 1;
 886          return d;
 887     }
 888
 889     if (bytelen & 1)
 890         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
 891
 892     pend = p + bytelen;
 893
 894     while (p < pend) {
 895         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 896         p += 2;
 897         if (uv < 0x80) {
 898             *d++ = (U8)uv;
 899             continue;
 900         }
 901         if (uv < 0x800) {
 902             *d++ = (U8)(( uv >>  6)         | 0xc0);
 903             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 904             continue;
 905         }
 906         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 907             UV low = (p[0] << 8) + p[1];
 908             p += 2;
 909             if (low < 0xdc00 || low >= 0xdfff)
 910                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 911             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 912         }
 913         if (uv < 0x10000) {
 914             *d++ = (U8)(( uv >> 12)         | 0xe0);
 915             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 916             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 917             continue;
 918         }
 919         else {
 920             *d++ = (U8)(( uv >> 18)         | 0xf0);
 921             *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 922             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 923             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 924             continue;
 925         }
 926     }
 927     *newlen = d - dstart;
 928     return d;
 929 }
 930
 931 /* Note: this one is slightly destructive of the source. */
 932
 933 U8*
 934 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 935 {
 936     U8* s = (U8*)p;
 937     U8* send = s + bytelen;
 938     while (s < send) {
 939         U8 tmp = s[0];
 940         s[0] = s[1];
 941         s[1] = tmp;
 942         s += 2;
 943     }
 944     return utf16_to_utf8(p, d, bytelen, newlen);
 945 }
 946
 947 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 948
 949 bool
 950 Perl_is_uni_alnum(pTHX_ UV c)
 951 {
 952     U8 tmpbuf[UTF8_MAXBYTES+1];
 953     uvchr_to_utf8(tmpbuf, c);
 954     return is_utf8_alnum(tmpbuf);
 955 }
 956
 957 bool
 958 Perl_is_uni_alnumc(pTHX_ UV c)
 959 {
 960     U8 tmpbuf[UTF8_MAXBYTES+1];
 961     uvchr_to_utf8(tmpbuf, c);
 962     return is_utf8_alnumc(tmpbuf);
 963 }
 964
 965 bool
 966 Perl_is_uni_idfirst(pTHX_ UV c)
 967 {
 968     U8 tmpbuf[UTF8_MAXBYTES+1];
 969     uvchr_to_utf8(tmpbuf, c);
 970     return is_utf8_idfirst(tmpbuf);
 971 }
 972
 973 bool
 974 Perl_is_uni_alpha(pTHX_ UV c)
 975 {
 976     U8 tmpbuf[UTF8_MAXBYTES+1];
 977     uvchr_to_utf8(tmpbuf, c);
 978     return is_utf8_alpha(tmpbuf);
 979 }
 980
 981 bool
 982 Perl_is_uni_ascii(pTHX_ UV c)
 983 {
 984     U8 tmpbuf[UTF8_MAXBYTES+1];
 985     uvchr_to_utf8(tmpbuf, c);
 986     return is_utf8_ascii(tmpbuf);
 987 }
 988
 989 bool
 990 Perl_is_uni_space(pTHX_ UV c)
 991 {
 992     U8 tmpbuf[UTF8_MAXBYTES+1];
 993     uvchr_to_utf8(tmpbuf, c);
 994     return is_utf8_space(tmpbuf);
 995 }
 996
 997 bool
 998 Perl_is_uni_digit(pTHX_ UV c)
 999 {
1000     U8 tmpbuf[UTF8_MAXBYTES+1];
1001     uvchr_to_utf8(tmpbuf, c);
1002     return is_utf8_digit(tmpbuf);
1003 }
1004
1005 bool
1006 Perl_is_uni_upper(pTHX_ UV c)
1007 {
1008     U8 tmpbuf[UTF8_MAXBYTES+1];
1009     uvchr_to_utf8(tmpbuf, c);
1010     return is_utf8_upper(tmpbuf);
1011 }
1012
1013 bool
1014 Perl_is_uni_lower(pTHX_ UV c)
1015 {
1016     U8 tmpbuf[UTF8_MAXBYTES+1];
1017     uvchr_to_utf8(tmpbuf, c);
1018     return is_utf8_lower(tmpbuf);
1019 }
1020
1021 bool
1022 Perl_is_uni_cntrl(pTHX_ UV c)
1023 {
1024     U8 tmpbuf[UTF8_MAXBYTES+1];
1025     uvchr_to_utf8(tmpbuf, c);
1026     return is_utf8_cntrl(tmpbuf);
1027 }
1028
1029 bool
1030 Perl_is_uni_graph(pTHX_ UV c)
1031 {
1032     U8 tmpbuf[UTF8_MAXBYTES+1];
1033     uvchr_to_utf8(tmpbuf, c);
1034     return is_utf8_graph(tmpbuf);
1035 }
1036
1037 bool
1038 Perl_is_uni_print(pTHX_ UV c)
1039 {
1040     U8 tmpbuf[UTF8_MAXBYTES+1];
1041     uvchr_to_utf8(tmpbuf, c);
1042     return is_utf8_print(tmpbuf);
1043 }
1044
1045 bool
1046 Perl_is_uni_punct(pTHX_ UV c)
1047 {
1048     U8 tmpbuf[UTF8_MAXBYTES+1];
1049     uvchr_to_utf8(tmpbuf, c);
1050     return is_utf8_punct(tmpbuf);
1051 }
1052
1053 bool
1054 Perl_is_uni_xdigit(pTHX_ UV c)
1055 {
1056     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1057     uvchr_to_utf8(tmpbuf, c);
1058     return is_utf8_xdigit(tmpbuf);
1059 }
1060
1061 UV
1062 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1063 {
1064     uvchr_to_utf8(p, c);
1065     return to_utf8_upper(p, p, lenp);
1066 }
1067
1068 UV
1069 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1070 {
1071     uvchr_to_utf8(p, c);
1072     return to_utf8_title(p, p, lenp);
1073 }
1074
1075 UV
1076 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1077 {
1078     uvchr_to_utf8(p, c);
1079     return to_utf8_lower(p, p, lenp);
1080 }
1081
1082 UV
1083 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1084 {
1085     uvchr_to_utf8(p, c);
1086     return to_utf8_fold(p, p, lenp);
1087 }
1088
1089 /* for now these all assume no locale info available for Unicode > 255 */
1090
1091 bool
1092 Perl_is_uni_alnum_lc(pTHX_ UV c)
1093 {
1094     return is_uni_alnum(c);     /* XXX no locale support yet */
1095 }
1096
1097 bool
1098 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1099 {
1100     return is_uni_alnumc(c);    /* XXX no locale support yet */
1101 }
1102
1103 bool
1104 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1105 {
1106     return is_uni_idfirst(c);   /* XXX no locale support yet */
1107 }
1108
1109 bool
1110 Perl_is_uni_alpha_lc(pTHX_ UV c)
1111 {
1112     return is_uni_alpha(c);     /* XXX no locale support yet */
1113 }
1114
1115 bool
1116 Perl_is_uni_ascii_lc(pTHX_ UV c)
1117 {
1118     return is_uni_ascii(c);     /* XXX no locale support yet */
1119 }
1120
1121 bool
1122 Perl_is_uni_space_lc(pTHX_ UV c)
1123 {
1124     return is_uni_space(c);     /* XXX no locale support yet */
1125 }
1126
1127 bool
1128 Perl_is_uni_digit_lc(pTHX_ UV c)
1129 {
1130     return is_uni_digit(c);     /* XXX no locale support yet */
1131 }
1132
1133 bool
1134 Perl_is_uni_upper_lc(pTHX_ UV c)
1135 {
1136     return is_uni_upper(c);     /* XXX no locale support yet */
1137 }
1138
1139 bool
1140 Perl_is_uni_lower_lc(pTHX_ UV c)
1141 {
1142     return is_uni_lower(c);     /* XXX no locale support yet */
1143 }
1144
1145 bool
1146 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1147 {
1148     return is_uni_cntrl(c);     /* XXX no locale support yet */
1149 }
1150
1151 bool
1152 Perl_is_uni_graph_lc(pTHX_ UV c)
1153 {
1154     return is_uni_graph(c);     /* XXX no locale support yet */
1155 }
1156
1157 bool
1158 Perl_is_uni_print_lc(pTHX_ UV c)
1159 {
1160     return is_uni_print(c);     /* XXX no locale support yet */
1161 }
1162
1163 bool
1164 Perl_is_uni_punct_lc(pTHX_ UV c)
1165 {
1166     return is_uni_punct(c);     /* XXX no locale support yet */
1167 }
1168
1169 bool
1170 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1171 {
1172     return is_uni_xdigit(c);    /* XXX no locale support yet */
1173 }
1174
1175 U32
1176 Perl_to_uni_upper_lc(pTHX_ U32 c)
1177 {
1178     /* XXX returns only the first character -- do not use XXX */
1179     /* XXX no locale support yet */
1180     STRLEN len;
1181     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1182     return (U32)to_uni_upper(c, tmpbuf, &len);
1183 }
1184
1185 U32
1186 Perl_to_uni_title_lc(pTHX_ U32 c)
1187 {
1188     /* XXX returns only the first character XXX -- do not use XXX */
1189     /* XXX no locale support yet */
1190     STRLEN len;
1191     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1192     return (U32)to_uni_title(c, tmpbuf, &len);
1193 }
1194
1195 U32
1196 Perl_to_uni_lower_lc(pTHX_ U32 c)
1197 {
1198     /* XXX returns only the first character -- do not use XXX */
1199     /* XXX no locale support yet */
1200     STRLEN len;
1201     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1202     return (U32)to_uni_lower(c, tmpbuf, &len);
1203 }
1204
1205 static bool
1206 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
1207                  const char *const swashname)
1208 {
1209     if (!is_utf8_char((U8 *)p))
1210         return FALSE;
1211     if (!*swash)
1212         *swash = swash_init("utf8", (char *)swashname, &PL_sv_undef, 1, 0);
1213     return swash_fetch(*swash, (U8 *)p, TRUE) != 0;
1214 }
1215
1216 bool
1217 Perl_is_utf8_alnum(pTHX_ U8 *p)
1218 {
1219     /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1220      * descendant of isalnum(3), in other words, it doesn't
1221      * contain the '_'. --jhi */
1222     return S_is_utf8_common(aTHX_ p, &PL_utf8_alnum, "IsWord");
1223 }
1224
1225 bool
1226 Perl_is_utf8_alnumc(pTHX_ U8 *p)
1227 {
1228     return S_is_utf8_common(aTHX_ p, &PL_utf8_alnumc, "IsAlnumC");
1229 }
1230
1231 bool
1232 Perl_is_utf8_idfirst(pTHX_ U8 *p) /* The naming is historical. */
1233 {
1234     if (*p == '_')
1235         return TRUE;
1236     /* is_utf8_idstart would be more logical. */
1237     return S_is_utf8_common(aTHX_ p, &PL_utf8_idstart, "IdStart");
1238 }
1239
1240 bool
1241 Perl_is_utf8_idcont(pTHX_ U8 *p)
1242 {
1243     if (*p == '_')
1244         return TRUE;
1245     return S_is_utf8_common(aTHX_ p, &PL_utf8_idcont, "IdContinue");
1246 }
1247
1248 bool
1249 Perl_is_utf8_alpha(pTHX_ U8 *p)
1250 {
1251     return S_is_utf8_common(aTHX_ p, &PL_utf8_alpha, "IsAlpha");
1252 }
1253
1254 bool
1255 Perl_is_utf8_ascii(pTHX_ U8 *p)
1256 {
1257     return S_is_utf8_common(aTHX_ p, &PL_utf8_ascii, "IsAscii");
1258 }
1259
1260 bool
1261 Perl_is_utf8_space(pTHX_ U8 *p)
1262 {
1263     return S_is_utf8_common(aTHX_ p, &PL_utf8_space, "IsSpacePerl");
1264 }
1265
1266 bool
1267 Perl_is_utf8_digit(pTHX_ U8 *p)
1268 {
1269     return S_is_utf8_common(aTHX_ p, &PL_utf8_digit, "IsDigit");
1270 }
1271
1272 bool
1273 Perl_is_utf8_upper(pTHX_ U8 *p)
1274 {
1275     return S_is_utf8_common(aTHX_ p, &PL_utf8_upper, "IsUppercase");
1276 }
1277
1278 bool
1279 Perl_is_utf8_lower(pTHX_ U8 *p)
1280 {
1281     return S_is_utf8_common(aTHX_ p, &PL_utf8_lower, "IsLowercase");
1282 }
1283
1284 bool
1285 Perl_is_utf8_cntrl(pTHX_ U8 *p)
1286 {
1287     return S_is_utf8_common(aTHX_ p, &PL_utf8_cntrl, "IsCntrl");
1288 }
1289
1290 bool
1291 Perl_is_utf8_graph(pTHX_ U8 *p)
1292 {
1293     return S_is_utf8_common(aTHX_ p, &PL_utf8_graph, "IsGraph");
1294 }
1295
1296 bool
1297 Perl_is_utf8_print(pTHX_ U8 *p)
1298 {
1299     return S_is_utf8_common(aTHX_ p, &PL_utf8_print, "IsPrint");
1300 }
1301
1302 bool
1303 Perl_is_utf8_punct(pTHX_ U8 *p)
1304 {
1305     return S_is_utf8_common(aTHX_ p, &PL_utf8_punct, "IsPunct");
1306 }
1307
1308 bool
1309 Perl_is_utf8_xdigit(pTHX_ U8 *p)
1310 {
1311     return S_is_utf8_common(aTHX_ p, &PL_utf8_xdigit, "Isxdigit");
1312 }
1313
1314 bool
1315 Perl_is_utf8_mark(pTHX_ U8 *p)
1316 {
1317     return S_is_utf8_common(aTHX_ p, &PL_utf8_mark, "IsM");
1318 }
1319
1320 /*
1321 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1322
1323 The "p" contains the pointer to the UTF-8 string encoding
1324 the character that is being converted.
1325
1326 The "ustrp" is a pointer to the character buffer to put the
1327 conversion result to.  The "lenp" is a pointer to the length
1328 of the result.
1329
1330 The "swashp" is a pointer to the swash to use.
1331
1332 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1333 and loaded by SWASHNEW, using lib/utf8_heavy.pl.  The special (usually,
1334 but not always, a multicharacter mapping), is tried first.
1335
1336 The "special" is a string like "utf8::ToSpecLower", which means the
1337 hash %utf8::ToSpecLower.  The access to the hash is through
1338 Perl_to_utf8_case().
1339
1340 The "normal" is a string like "ToLower" which means the swash
1341 %utf8::ToLower.
1342
1343 =cut */
1344
1345 UV
1346 Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
1347 {
1348     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1349     STRLEN len = 0;
1350
1351     const UV uv0 = utf8_to_uvchr(p, NULL);
1352     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1353      * are necessary in EBCDIC, they are redundant no-ops
1354      * in ASCII-ish platforms, and hopefully optimized away. */
1355     const UV uv1 = NATIVE_TO_UNI(uv0);
1356     uvuni_to_utf8(tmpbuf, uv1);
1357
1358     if (!*swashp) /* load on-demand */
1359          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1360
1361     /* The 0xDF is the only special casing Unicode code point below 0x100. */
1362     if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1363          /* It might be "special" (sometimes, but not always,
1364           * a multicharacter mapping) */
1365          HV *hv;
1366          SV **svp;
1367
1368          if ((hv  = get_hv(special, FALSE)) &&
1369              (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1370              (*svp)) {
1371              const char *s;
1372
1373               s = SvPV_const(*svp, len);
1374               if (len == 1)
1375                    len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1376               else {
1377 #ifdef EBCDIC
1378                    /* If we have EBCDIC we need to remap the characters
1379                     * since any characters in the low 256 are Unicode
1380                     * code points, not EBCDIC. */
1381                    U8 *t = (U8*)s, *tend = t + len, *d;
1382
1383                    d = tmpbuf;
1384                    if (SvUTF8(*svp)) {
1385                         STRLEN tlen = 0;
1386
1387                         while (t < tend) {
1388                              UV c = utf8_to_uvchr(t, &tlen);
1389                              if (tlen > 0) {
1390                                   d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1391                                   t += tlen;
1392                              }
1393                              else
1394                                   break;
1395                         }
1396                    }
1397                    else {
1398                         while (t < tend) {
1399                              d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1400                              t++;
1401                         }
1402                    }
1403                    len = d - tmpbuf;
1404                    Copy(tmpbuf, ustrp, len, U8);
1405 #else
1406                    Copy(s, ustrp, len, U8);
1407 #endif
1408               }
1409          }
1410     }
1411
1412     if (!len && *swashp) {
1413          UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1414
1415          if (uv2) {
1416               /* It was "normal" (a single character mapping). */
1417               UV uv3 = UNI_TO_NATIVE(uv2);
1418
1419               len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1420          }
1421     }
1422
1423     if (!len) /* Neither: just copy. */
1424          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1425
1426     if (lenp)
1427          *lenp = len;
1428
1429     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1430 }
1431
1432 /*
1433 =for apidoc A|UV|to_utf8_upper|U8 *p|U8 *ustrp|STRLEN *lenp
1434
1435 Convert the UTF-8 encoded character at p to its uppercase version and
1436 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1437 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1438 the uppercase version may be longer than the original character.
1439
1440 The first character of the uppercased version is returned
1441 (but note, as explained above, that there may be more.)
1442
1443 =cut */
1444
1445 UV
1446 Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1447 {
1448     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1449                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1450 }
1451
1452 /*
1453 =for apidoc A|UV|to_utf8_title|U8 *p|U8 *ustrp|STRLEN *lenp
1454
1455 Convert the UTF-8 encoded character at p to its titlecase version and
1456 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1457 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1458 titlecase version may be longer than the original character.
1459
1460 The first character of the titlecased version is returned
1461 (but note, as explained above, that there may be more.)
1462
1463 =cut */
1464
1465 UV
1466 Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1467 {
1468     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1469                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1470 }
1471
1472 /*
1473 =for apidoc A|UV|to_utf8_lower|U8 *p|U8 *ustrp|STRLEN *lenp
1474
1475 Convert the UTF-8 encoded character at p to its lowercase version and
1476 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1477 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1478 lowercase version may be longer than the original character.
1479
1480 The first character of the lowercased version is returned
1481 (but note, as explained above, that there may be more.)
1482
1483 =cut */
1484
1485 UV
1486 Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1487 {
1488     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1489                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1490 }
1491
1492 /*
1493 =for apidoc A|UV|to_utf8_fold|U8 *p|U8 *ustrp|STRLEN *lenp
1494
1495 Convert the UTF-8 encoded character at p to its foldcase version and
1496 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1497 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1498 foldcase version may be longer than the original character (up to
1499 three characters).
1500
1501 The first character of the foldcased version is returned
1502 (but note, as explained above, that there may be more.)
1503
1504 =cut */
1505
1506 UV
1507 Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1508 {
1509     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1510                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1511 }
1512
1513 /* Note:
1514  * A "swash" is a swatch hash.
1515  * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1516  * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1517  * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1518  */
1519 SV*
1520 Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1521 {
1522     SV* retval;
1523     SV* const tokenbufsv = sv_newmortal();
1524     dSP;
1525     const size_t pkg_len = strlen(pkg);
1526     const size_t name_len = strlen(name);
1527     HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
1528     SV* errsv_save;
1529
1530     PUSHSTACKi(PERLSI_MAGIC);
1531     ENTER;
1532     SAVEI32(PL_hints);
1533     PL_hints = 0;
1534     save_re_context();
1535     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1536         ENTER;
1537         errsv_save = newSVsv(ERRSV);
1538         /* It is assumed that callers of this routine are not passing in any
1539            user derived data.  */
1540         /* Need to do this after save_re_context() as it will set PL_tainted to
1541            1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1542            Even line to create errsv_save can turn on PL_tainted.  */
1543         SAVEBOOL(PL_tainted);
1544         PL_tainted = 0;
1545         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1546                          NULL);
1547         if (!SvTRUE(ERRSV))
1548             sv_setsv(ERRSV, errsv_save);
1549         SvREFCNT_dec(errsv_save);
1550         LEAVE;
1551     }
1552     SPAGAIN;
1553     PUSHMARK(SP);
1554     EXTEND(SP,5);
1555     PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1556     PUSHs(sv_2mortal(newSVpvn(name, name_len)));
1557     PUSHs(listsv);
1558     PUSHs(sv_2mortal(newSViv(minbits)));
1559     PUSHs(sv_2mortal(newSViv(none)));
1560     PUTBACK;
1561     if (IN_PERL_COMPILETIME) {
1562         /* XXX ought to be handled by lex_start */
1563         SAVEI32(PL_in_my);
1564         PL_in_my = 0;
1565         sv_setpv(tokenbufsv, PL_tokenbuf);
1566     }
1567     errsv_save = newSVsv(ERRSV);
1568     if (call_method("SWASHNEW", G_SCALAR))
1569         retval = newSVsv(*PL_stack_sp--);
1570     else
1571         retval = &PL_sv_undef;
1572     if (!SvTRUE(ERRSV))
1573         sv_setsv(ERRSV, errsv_save);
1574     SvREFCNT_dec(errsv_save);
1575     LEAVE;
1576     POPSTACK;
1577     if (IN_PERL_COMPILETIME) {
1578         STRLEN len;
1579         const char* const pv = SvPV_const(tokenbufsv, len);
1580
1581         Copy(pv, PL_tokenbuf, len+1, char);
1582         PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1583     }
1584     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1585         if (SvPOK(retval))
1586             Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1587                        retval);
1588         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1589     }
1590     return retval;
1591 }
1592
1593
1594 /* This API is wrong for special case conversions since we may need to
1595  * return several Unicode characters for a single Unicode character
1596  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1597  * the lower-level routine, and it is similarly broken for returning
1598  * multiple values.  --jhi */
1599 /* Now SWASHGET is recasted into S_swash_get in this file. */
1600
1601 /* Note:
1602  * Returns the value of property/mapping C<swash> for the first character
1603  * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1604  * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1605  * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1606  */
1607 UV
1608 Perl_swash_fetch(pTHX_ SV *swash, U8 *ptr, bool do_utf8)
1609 {
1610     HV* const hv = (HV*)SvRV(swash);
1611     U32 klen;
1612     U32 off;
1613     STRLEN slen;
1614     STRLEN needents;
1615     const U8 *tmps = NULL;
1616     U32 bit;
1617     SV *swatch;
1618     U8 tmputf8[2];
1619     UV c = NATIVE_TO_ASCII(*ptr);
1620
1621     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1622         tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1623         tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1624         ptr = tmputf8;
1625     }
1626     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1627      * then the "swatch" is a vec() for al the chars which start
1628      * with 0xAA..0xYY
1629      * So the key in the hash (klen) is length of encoded char -1
1630      */
1631     klen = UTF8SKIP(ptr) - 1;
1632     off  = ptr[klen];
1633
1634     if (klen == 0) {
1635       /* If char in invariant then swatch is for all the invariant chars
1636        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1637        */
1638         needents = UTF_CONTINUATION_MARK;
1639         off      = NATIVE_TO_UTF(ptr[klen]);
1640     }
1641     else {
1642       /* If char is encoded then swatch is for the prefix */
1643         needents = (1 << UTF_ACCUMULATION_SHIFT);
1644         off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1645     }
1646
1647     /*
1648      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1649      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1650      * it's nothing to sniff at.)  Pity we usually come through at least
1651      * two function calls to get here...
1652      *
1653      * NB: this code assumes that swatches are never modified, once generated!
1654      */
1655
1656     if (hv   == PL_last_swash_hv &&
1657         klen == PL_last_swash_klen &&
1658         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1659     {
1660         tmps = PL_last_swash_tmps;
1661         slen = PL_last_swash_slen;
1662     }
1663     else {
1664         /* Try our second-level swatch cache, kept in a hash. */
1665         SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1666
1667         /* If not cached, generate it via swash_get */
1668         if (!svp || !SvPOK(*svp)
1669                  || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
1670             /* We use utf8n_to_uvuni() as we want an index into
1671                Unicode tables, not a native character number.
1672              */
1673             const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1674                                            ckWARN(WARN_UTF8) ?
1675                                            0 : UTF8_ALLOW_ANY);
1676             swatch = swash_get(swash,
1677                     /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1678                                 (klen) ? (code_point & ~(needents - 1)) : 0,
1679                                 needents);
1680
1681             if (IN_PERL_COMPILETIME)
1682                 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1683
1684             svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
1685
1686             if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1687                      || (slen << 3) < needents)
1688                 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
1689         }
1690
1691         PL_last_swash_hv = hv;
1692         PL_last_swash_klen = klen;
1693         /* FIXME change interpvar.h?  */
1694         PL_last_swash_tmps = (U8 *) tmps;
1695         PL_last_swash_slen = slen;
1696         if (klen)
1697             Copy(ptr, PL_last_swash_key, klen, U8);
1698     }
1699
1700     switch ((int)((slen << 3) / needents)) {
1701     case 1:
1702         bit = 1 << (off & 7);
1703         off >>= 3;
1704         return (tmps[off] & bit) != 0;
1705     case 8:
1706         return tmps[off];
1707     case 16:
1708         off <<= 1;
1709         return (tmps[off] << 8) + tmps[off + 1] ;
1710     case 32:
1711         off <<= 2;
1712         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1713     }
1714     Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
1715     return 0;
1716 }
1717
1718 /* Note:
1719  * Returns a swatch (a bit vector string) for a code point sequence
1720  * that starts from the value C<start> and comprises the number C<span>.
1721  * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1722  * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1723  */
1724 STATIC SV*
1725 S_swash_get(pTHX_ SV* swash, UV start, UV span)
1726 {
1727     SV *swatch;
1728     U8 *l, *lend, *x, *xend, *s;
1729     STRLEN lcur, xcur, scur;
1730
1731     HV* const hv = (HV*)SvRV(swash);
1732     SV** const listsvp = hv_fetch(hv, "LIST", 4, FALSE);
1733     SV** const typesvp = hv_fetch(hv, "TYPE", 4, FALSE);
1734     SV** const bitssvp = hv_fetch(hv, "BITS", 4, FALSE);
1735     SV** const nonesvp = hv_fetch(hv, "NONE", 4, FALSE);
1736     SV** const extssvp = hv_fetch(hv, "EXTRAS", 6, FALSE);
1737     const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1738     const int  typeto  = typestr[0] == 'T' && typestr[1] == 'o';
1739     const STRLEN bits  = SvUV(*bitssvp);
1740     const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1741     const UV     none  = SvUV(*nonesvp);
1742     const UV     end   = start + span;
1743
1744     if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
1745         Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1746                                                  (UV)bits);
1747     }
1748
1749     /* create and initialize $swatch */
1750     swatch = newSVpvn("",0);
1751     scur   = octets ? (span * octets) : (span + 7) / 8;
1752     SvGROW(swatch, scur + 1);
1753     s = (U8*)SvPVX(swatch);
1754     if (octets && none) {
1755         const U8* const e = s + scur;
1756         while (s < e) {
1757             if (bits == 8)
1758                 *s++ = (U8)(none & 0xff);
1759             else if (bits == 16) {
1760                 *s++ = (U8)((none >>  8) & 0xff);
1761                 *s++ = (U8)( none        & 0xff);
1762             }
1763             else if (bits == 32) {
1764                 *s++ = (U8)((none >> 24) & 0xff);
1765                 *s++ = (U8)((none >> 16) & 0xff);
1766                 *s++ = (U8)((none >>  8) & 0xff);
1767                 *s++ = (U8)( none        & 0xff);
1768             }
1769         }
1770         *s = '\0';
1771     }
1772     else {
1773         (void)memzero((U8*)s, scur + 1);
1774     }
1775     SvCUR_set(swatch, scur);
1776     s = (U8*)SvPVX(swatch);
1777
1778     /* read $swash->{LIST} */
1779     l = (U8*)SvPV(*listsvp, lcur);
1780     lend = l + lcur;
1781     while (l < lend) {
1782         UV min, max, val, key;
1783         STRLEN numlen;
1784         I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1785
1786         U8* const nl = (U8*)memchr(l, '\n', lend - l);
1787
1788         numlen = lend - l;
1789         min = grok_hex((char *)l, &numlen, &flags, NULL);
1790         if (numlen)
1791             l += numlen;
1792         else if (nl) {
1793             l = nl + 1; /* 1 is length of "\n" */
1794             continue;
1795         }
1796         else {
1797             l = lend; /* to LIST's end at which \n is not found */
1798             break;
1799         }
1800
1801         if (isBLANK(*l)) {
1802             ++l;
1803             flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1804             numlen = lend - l;
1805             max = grok_hex((char *)l, &numlen, &flags, NULL);
1806             if (numlen)
1807                 l += numlen;
1808             else
1809                 max = min;
1810
1811             if (octets) {
1812                 if (isBLANK(*l)) {
1813                     ++l;
1814                     flags = PERL_SCAN_SILENT_ILLDIGIT |
1815                             PERL_SCAN_DISALLOW_PREFIX;
1816                     numlen = lend - l;
1817                     val = grok_hex((char *)l, &numlen, &flags, NULL);
1818                     if (numlen)
1819                         l += numlen;
1820                     else
1821                         val = 0;
1822                 }
1823                 else {
1824                     val = 0;
1825                     if (typeto) {
1826                         Perl_croak(aTHX_ "%s: illegal mapping '%s'",
1827                                          typestr, l);
1828                     }
1829                 }
1830             }
1831             else
1832                 val = 0; /* bits == 1, then val should be ignored */
1833         }
1834         else {
1835             max = min;
1836             if (octets) {
1837                 val = 0;
1838                 if (typeto) {
1839                     Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
1840                 }
1841             }
1842             else
1843                 val = 0; /* bits == 1, then val should be ignored */
1844         }
1845
1846         if (nl)
1847             l = nl + 1;
1848         else
1849             l = lend;
1850
1851         if (max < start)
1852             continue;
1853
1854         if (octets) {
1855             if (min < start) {
1856                 if (!none || val < none) {
1857                     val += start - min;
1858                 }
1859                 min = start;
1860             }
1861             for (key = min; key <= max; key++) {
1862                 STRLEN offset;
1863                 if (key >= end)
1864                     goto go_out_list;
1865                 /* offset must be non-negative (start <= min <= key < end) */
1866                 offset = octets * (key - start);
1867                 if (bits == 8)
1868                     s[offset] = (U8)(val & 0xff);
1869                 else if (bits == 16) {
1870                     s[offset    ] = (U8)((val >>  8) & 0xff);
1871                     s[offset + 1] = (U8)( val        & 0xff);
1872                 }
1873                 else if (bits == 32) {
1874                     s[offset    ] = (U8)((val >> 24) & 0xff);
1875                     s[offset + 1] = (U8)((val >> 16) & 0xff);
1876                     s[offset + 2] = (U8)((val >>  8) & 0xff);
1877                     s[offset + 3] = (U8)( val        & 0xff);
1878                 }
1879
1880                 if (!none || val < none)
1881                     ++val;
1882             }
1883         }
1884         else { /* bits == 1, then val should be ignored */
1885             if (min < start)
1886                 min = start;
1887             for (key = min; key <= max; key++) {
1888                 const STRLEN offset = (STRLEN)(key - start);
1889                 if (key >= end)
1890                     goto go_out_list;
1891                 s[offset >> 3] |= 1 << (offset & 7);
1892             }
1893         }
1894     } /* while */
1895   go_out_list:
1896
1897     /* read $swash->{EXTRAS} */
1898     x = (U8*)SvPV(*extssvp, xcur);
1899     xend = x + xcur;
1900     while (x < xend) {
1901         STRLEN namelen;
1902         U8 *namestr;
1903         SV** othersvp;
1904         HV* otherhv;
1905         STRLEN otherbits;
1906         SV **otherbitssvp, *other;
1907         U8 *s, *o, *nl;
1908         STRLEN slen, olen;
1909
1910         U8 opc = *x++;
1911         if (opc == '\n')
1912             continue;
1913
1914         nl = (U8*)memchr(x, '\n', xend - x);
1915
1916         if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
1917             if (nl) {
1918                 x = nl + 1; /* 1 is length of "\n" */
1919                 continue;
1920             }
1921             else {
1922                 x = xend; /* to EXTRAS' end at which \n is not found */
1923                 break;
1924             }
1925         }
1926
1927         namestr = x;
1928         if (nl) {
1929             namelen = nl - namestr;
1930             x = nl + 1;
1931         }
1932         else {
1933             namelen = xend - namestr;
1934             x = xend;
1935         }
1936
1937         othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
1938         otherhv = (HV*)SvRV(*othersvp);
1939         otherbitssvp = hv_fetch(otherhv, "BITS", 4, FALSE);
1940         otherbits = (STRLEN)SvUV(*otherbitssvp);
1941         if (bits < otherbits)
1942             Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
1943
1944         /* The "other" swatch must be destroyed after. */
1945         other = swash_get(*othersvp, start, span);
1946         o = (U8*)SvPV(other, olen);
1947
1948         if (!olen)
1949             Perl_croak(aTHX_ "panic: swash_get got improper swatch");
1950
1951         s = (U8*)SvPV(swatch, slen);
1952         if (bits == 1 && otherbits == 1) {
1953             if (slen != olen)
1954                 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
1955
1956             switch (opc) {
1957             case '+':
1958                 while (slen--)
1959                     *s++ |= *o++;
1960                 break;
1961             case '!':
1962                 while (slen--)
1963                     *s++ |= ~*o++;
1964                 break;
1965             case '-':
1966                 while (slen--)
1967                     *s++ &= ~*o++;
1968                 break;
1969             case '&':
1970                 while (slen--)
1971                     *s++ &= *o++;
1972                 break;
1973             default:
1974                 break;
1975             }
1976         }
1977         else {
1978             STRLEN otheroctets = otherbits >> 3;
1979             STRLEN offset = 0;
1980             U8* send = s + slen;
1981
1982             while (s < send) {
1983                 UV otherval = 0;
1984
1985                 if (otherbits == 1) {
1986                     otherval = (o[offset >> 3] >> (offset & 7)) & 1;
1987                     ++offset;
1988                 }
1989                 else {
1990                     STRLEN vlen = otheroctets;
1991                     otherval = *o++;
1992                     while (--vlen) {
1993                         otherval <<= 8;
1994                         otherval |= *o++;
1995                     }
1996                 }
1997
1998                 if (opc == '+' && otherval)
1999                     ; /* replace with otherval */
2000                 else if (opc == '!' && !otherval)
2001                     otherval = 1;
2002                 else if (opc == '-' && otherval)
2003                     otherval = 0;
2004                 else if (opc == '&' && !otherval)
2005                     otherval = 0;
2006                 else {
2007                     s += octets; /* no replacement */
2008                     continue;
2009                 }
2010
2011                 if (bits == 8)
2012                     *s++ = (U8)( otherval & 0xff);
2013                 else if (bits == 16) {
2014                     *s++ = (U8)((otherval >>  8) & 0xff);
2015                     *s++ = (U8)( otherval        & 0xff);
2016                 }
2017                 else if (bits == 32) {
2018                     *s++ = (U8)((otherval >> 24) & 0xff);
2019                     *s++ = (U8)((otherval >> 16) & 0xff);
2020                     *s++ = (U8)((otherval >>  8) & 0xff);
2021                     *s++ = (U8)( otherval        & 0xff);
2022                 }
2023             }
2024         }
2025         sv_free(other); /* through with it! */
2026     } /* while */
2027     return swatch;
2028 }
2029
2030 /*
2031 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2032
2033 Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2034 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2035 bytes available. The return value is the pointer to the byte after the
2036 end of the new character. In other words,
2037
2038     d = uvchr_to_utf8(d, uv);
2039
2040 is the recommended wide native character-aware way of saying
2041
2042     *(d++) = uv;
2043
2044 =cut
2045 */
2046
2047 /* On ASCII machines this is normally a macro but we want a
2048    real function in case XS code wants it
2049 */
2050 U8 *
2051 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2052 {
2053     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2054 }
2055
2056 U8 *
2057 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2058 {
2059     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2060 }
2061
2062 /*
2063 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32
2064 flags
2065
2066 Returns the native character value of the first character in the string
2067 C<s>
2068 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2069 length, in bytes, of that character.
2070
2071 Allows length and flags to be passed to low level routine.
2072
2073 =cut
2074 */
2075 /* On ASCII machines this is normally a macro but we want
2076    a real function in case XS code wants it
2077 */
2078 UV
2079 Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
2080 {
2081     const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2082     return UNI_TO_NATIVE(uv);
2083 }
2084
2085 /*
2086 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
2087
2088 Build to the scalar dsv a displayable version of the string spv,
2089 length len, the displayable version being at most pvlim bytes long
2090 (if longer, the rest is truncated and "..." will be appended).
2091
2092 The flags argument can have UNI_DISPLAY_ISPRINT set to display
2093 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
2094 to display the \\[nrfta\\] as the backslashed versions (like '\n')
2095 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2096 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2097 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2098
2099 The pointer to the PV of the dsv is returned.
2100
2101 =cut */
2102 char *
2103 Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
2104 {
2105     int truncated = 0;
2106     const char *s, *e;
2107
2108     sv_setpvn(dsv, "", 0);
2109     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
2110          UV u;
2111           /* This serves double duty as a flag and a character to print after
2112              a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2113           */
2114          char ok = 0;
2115
2116          if (pvlim && SvCUR(dsv) >= pvlim) {
2117               truncated++;
2118               break;
2119          }
2120          u = utf8_to_uvchr((U8*)s, 0);
2121          if (u < 256) {
2122              const unsigned char c = (unsigned char)u & 0xFF;
2123              if (flags & UNI_DISPLAY_BACKSLASH) {
2124                  switch (c) {
2125                  case '\n':
2126                      ok = 'n'; break;
2127                  case '\r':
2128                      ok = 'r'; break;
2129                  case '\t':
2130                      ok = 't'; break;
2131                  case '\f':
2132                      ok = 'f'; break;
2133                  case '\a':
2134                      ok = 'a'; break;
2135                  case '\\':
2136                      ok = '\\'; break;
2137                  default: break;
2138                  }
2139                  if (ok) {
2140                      Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
2141                  }
2142              }
2143              /* isPRINT() is the locale-blind version. */
2144              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2145                  Perl_sv_catpvf(aTHX_ dsv, "%c", c);
2146                  ok = 1;
2147              }
2148          }
2149          if (!ok)
2150              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
2151     }
2152     if (truncated)
2153          sv_catpvn(dsv, "...", 3);
2154
2155     return SvPVX(dsv);
2156 }
2157
2158 /*
2159 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
2160
2161 Build to the scalar dsv a displayable version of the scalar sv,
2162 the displayable version being at most pvlim bytes long
2163 (if longer, the rest is truncated and "..." will be appended).
2164
2165 The flags argument is as in pv_uni_display().
2166
2167 The pointer to the PV of the dsv is returned.
2168
2169 =cut */
2170 char *
2171 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2172 {
2173      return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX_const(ssv),
2174                                 SvCUR(ssv), pvlim, flags);
2175 }
2176
2177 /*
2178 =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
2179
2180 Return true if the strings s1 and s2 differ case-insensitively, false
2181 if not (if they are equal case-insensitively).  If u1 is true, the
2182 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
2183 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
2184 are false, the respective string is assumed to be in native 8-bit
2185 encoding.
2186
2187 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2188 in there (they will point at the beginning of the I<next> character).
2189 If the pointers behind pe1 or pe2 are non-NULL, they are the end
2190 pointers beyond which scanning will not continue under any
2191 circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
2192 s2+l2 will be used as goal end pointers that will also stop the scan,
2193 and which qualify towards defining a successful match: all the scans
2194 that define an explicit length must reach their goal pointers for
2195 a match to succeed).
2196
2197 For case-insensitiveness, the "casefolding" of Unicode is used
2198 instead of upper/lowercasing both the characters, see
2199 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2200
2201 =cut */
2202 I32
2203 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
2204 {
2205      register const U8 *p1  = (const U8*)s1;
2206      register const U8 *p2  = (const U8*)s2;
2207      register const U8 *f1 = NULL;
2208      register U8 *f2 = NULL;
2209      register U8 *e1 = NULL;
2210      register U8 *q1 = NULL;
2211      register U8 *e2 = NULL;
2212      register U8 *q2 = NULL;
2213      STRLEN n1 = 0, n2 = 0;
2214      U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2215      U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
2216      U8 natbuf[1+1];
2217      STRLEN foldlen1, foldlen2;
2218      bool match;
2219
2220      if (pe1)
2221           e1 = *(U8**)pe1;
2222      if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2223           f1 = (const U8*)s1 + l1;
2224      if (pe2)
2225           e2 = *(U8**)pe2;
2226      if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2227           f2 = (const U8*)s2 + l2;
2228
2229      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2230           return 1; /* mismatch; possible infinite loop or false positive */
2231
2232      if (!u1 || !u2)
2233           natbuf[1] = 0; /* Need to terminate the buffer. */
2234
2235      while ((e1 == 0 || p1 < e1) &&
2236             (f1 == 0 || p1 < f1) &&
2237             (e2 == 0 || p2 < e2) &&
2238             (f2 == 0 || p2 < f2)) {
2239           if (n1 == 0) {
2240                if (u1)
2241                     to_utf8_fold((U8 *)p1, foldbuf1, &foldlen1);
2242                else {
2243                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
2244                     to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2245                }
2246                q1 = foldbuf1;
2247                n1 = foldlen1;
2248           }
2249           if (n2 == 0) {
2250                if (u2)
2251                     to_utf8_fold((U8 *)p2, foldbuf2, &foldlen2);
2252                else {
2253                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
2254                     to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2255                }
2256                q2 = foldbuf2;
2257                n2 = foldlen2;
2258           }
2259           while (n1 && n2) {
2260                if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2261                    (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2262                     memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2263                    return 1; /* mismatch */
2264                n1 -= UTF8SKIP(q1);
2265                q1 += UTF8SKIP(q1);
2266                n2 -= UTF8SKIP(q2);
2267                q2 += UTF8SKIP(q2);
2268           }
2269           if (n1 == 0)
2270                p1 += u1 ? UTF8SKIP(p1) : 1;
2271           if (n2 == 0)
2272                p2 += u2 ? UTF8SKIP(p2) : 1;
2273
2274      }
2275
2276      /* A match is defined by all the scans that specified
2277       * an explicit length reaching their final goals. */
2278      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2279
2280      if (match) {
2281           if (pe1)
2282                *pe1 = (char*)p1;
2283           if (pe2)
2284                *pe2 = (char*)p2;
2285      }
2286
2287      return match ? 0 : 1; /* 0 match, 1 mismatch */
2288 }
2289
2290 /*
2291  * Local variables:
2292  * c-indentation-style: bsd
2293  * c-basic-offset: 4
2294  * indent-tabs-mode: t
2295  * End:
2296  *
2297  * ex: set ts=8 sts=4 sw=4 noet:
2298  */