utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006,
   4  *    by Larry Wall and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  13  * heard of that we don't want to see any closer; and that's the one place
  14  * we're trying to get to!  And that's just where we can't get, nohow.'
  15  *
  16  * 'Well do I understand your speech,' he answered in the same language;
  17  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  18  * as is the custom in the West, if you wish to be answered?'
  19  *
  20  * ...the travellers perceived that the floor was paved with stones of many
  21  * hues; branching runes and strange devices intertwined beneath their feet.
  22  */
  23
  24 #include "EXTERN.h"
  25 #define PERL_IN_UTF8_C
  26 #include "perl.h"
  27
  28 static const char unees[] =
  29     "Malformed UTF-8 character (unexpected end of string)";
  30
  31 /*
  32 =head1 Unicode Support
  33
  34 This file contains various utility functions for manipulating UTF8-encoded
  35 strings. For the uninitiated, this is a method of representing arbitrary
  36 Unicode characters as a variable number of bytes, in such a way that
  37 characters in the ASCII range are unmodified, and a zero byte never appears
  38 within non-zero characters.
  39
  40 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
  41
  42 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
  43 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
  44 bytes available. The return value is the pointer to the byte after the
  45 end of the new character. In other words,
  46
  47     d = uvuni_to_utf8_flags(d, uv, flags);
  48
  49 or, in most cases,
  50
  51     d = uvuni_to_utf8(d, uv);
  52
  53 (which is equivalent to)
  54
  55     d = uvuni_to_utf8_flags(d, uv, 0);
  56
  57 is the recommended Unicode-aware way of saying
  58
  59     *(d++) = uv;
  60
  61 =cut
  62 */
  63
  64 U8 *
  65 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  66 {
  67     if (ckWARN(WARN_UTF8)) {
  68          if (UNICODE_IS_SURROGATE(uv) &&
  69              !(flags & UNICODE_ALLOW_SURROGATE))
  70               Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
  71          else if (
  72                   ((uv >= 0xFDD0 && uv <= 0xFDEF &&
  73                     !(flags & UNICODE_ALLOW_FDD0))
  74                    ||
  75                    ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
  76                     !(flags & UNICODE_ALLOW_FFFF))) &&
  77                   /* UNICODE_ALLOW_SUPER includes
  78                    * FFFEs and FFFFs beyond 0x10FFFF. */
  79                   ((uv <= PERL_UNICODE_MAX) ||
  80                    !(flags & UNICODE_ALLOW_SUPER))
  81                   )
  82               Perl_warner(aTHX_ packWARN(WARN_UTF8),
  83                          "Unicode character 0x%04"UVxf" is illegal", uv);
  84     }
  85     if (UNI_IS_INVARIANT(uv)) {
  86         *d++ = (U8)UTF_TO_NATIVE(uv);
  87         return d;
  88     }
  89 #if defined(EBCDIC)
  90     else {
  91         STRLEN len  = UNISKIP(uv);
  92         U8 *p = d+len-1;
  93         while (p > d) {
  94             *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
  95             uv >>= UTF_ACCUMULATION_SHIFT;
  96         }
  97         *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
  98         return d+len;
  99     }
 100 #else /* Non loop style */
 101     if (uv < 0x800) {
 102         *d++ = (U8)(( uv >>  6)         | 0xc0);
 103         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 104         return d;
 105     }
 106     if (uv < 0x10000) {
 107         *d++ = (U8)(( uv >> 12)         | 0xe0);
 108         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 109         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 110         return d;
 111     }
 112     if (uv < 0x200000) {
 113         *d++ = (U8)(( uv >> 18)         | 0xf0);
 114         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 115         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 116         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 117         return d;
 118     }
 119     if (uv < 0x4000000) {
 120         *d++ = (U8)(( uv >> 24)         | 0xf8);
 121         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 122         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 123         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 124         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 125         return d;
 126     }
 127     if (uv < 0x80000000) {
 128         *d++ = (U8)(( uv >> 30)         | 0xfc);
 129         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 130         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 131         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 132         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 133         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 134         return d;
 135     }
 136 #ifdef HAS_QUAD
 137     if (uv < UTF8_QUAD_MAX)
 138 #endif
 139     {
 140         *d++ =                            0xfe; /* Can't match U+FEFF! */
 141         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 142         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 143         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 144         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 145         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 146         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 147         return d;
 148     }
 149 #ifdef HAS_QUAD
 150     {
 151         *d++ =                            0xff;         /* Can't match U+FFFE! */
 152         *d++ =                            0x80;         /* 6 Reserved bits */
 153         *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);        /* 2 Reserved bits */
 154         *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
 155         *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
 156         *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
 157         *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
 158         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 159         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 160         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 161         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 162         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 163         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 164         return d;
 165     }
 166 #endif
 167 #endif /* Loop style */
 168 }
 169
 170 /*
 171
 172 Tests if some arbitrary number of bytes begins in a valid UTF-8
 173 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 174 UTF-8 character.  The actual number of bytes in the UTF-8 character
 175 will be returned if it is valid, otherwise 0.
 176
 177 This is the "slow" version as opposed to the "fast" version which is
 178 the "unrolled" IS_UTF8_CHAR().  E.g. for t/uni/class.t the speed
 179 difference is a factor of 2 to 3.  For lengths (UTF8SKIP(s)) of four
 180 or less you should use the IS_UTF8_CHAR(), for lengths of five or more
 181 you should use the _slow().  In practice this means that the _slow()
 182 will be used very rarely, since the maximum Unicode code point (as of
 183 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes.  Only
 184 the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
 185 five bytes or more.
 186
 187 =cut */
 188 STATIC STRLEN
 189 S_is_utf8_char_slow(pTHX_ const U8 *s, const STRLEN len)
 190 {
 191     U8 u = *s;
 192     STRLEN slen;
 193     UV uv, ouv;
 194
 195     if (UTF8_IS_INVARIANT(u))
 196         return 1;
 197
 198     if (!UTF8_IS_START(u))
 199         return 0;
 200
 201     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 202         return 0;
 203
 204     slen = len - 1;
 205     s++;
 206 #ifdef EBCDIC
 207     u = NATIVE_TO_UTF(u);
 208 #endif
 209     u &= UTF_START_MASK(len);
 210     uv  = u;
 211     ouv = uv;
 212     while (slen--) {
 213         if (!UTF8_IS_CONTINUATION(*s))
 214             return 0;
 215         uv = UTF8_ACCUMULATE(uv, *s);
 216         if (uv < ouv)
 217             return 0;
 218         ouv = uv;
 219         s++;
 220     }
 221
 222     if ((STRLEN)UNISKIP(uv) < len)
 223         return 0;
 224
 225     return len;
 226 }
 227
 228 /*
 229 =for apidoc A|STRLEN|is_utf8_char|const U8 *s
 230
 231 Tests if some arbitrary number of bytes begins in a valid UTF-8
 232 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 233 UTF-8 character.  The actual number of bytes in the UTF-8 character
 234 will be returned if it is valid, otherwise 0.
 235
 236 =cut */
 237 STRLEN
 238 Perl_is_utf8_char(pTHX_ const U8 *s)
 239 {
 240     const STRLEN len = UTF8SKIP(s);
 241 #ifdef IS_UTF8_CHAR
 242     if (IS_UTF8_CHAR_FAST(len))
 243         return IS_UTF8_CHAR(s, len) ? len : 0;
 244 #endif /* #ifdef IS_UTF8_CHAR */
 245     return is_utf8_char_slow(s, len);
 246 }
 247
 248 /*
 249 =for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
 250
 251 Returns true if first C<len> bytes of the given string form a valid
 252 UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
 253 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
 254 because a valid ASCII string is a valid UTF-8 string.
 255
 256 See also is_utf8_string_loclen() and is_utf8_string_loc().
 257
 258 =cut
 259 */
 260
 261 bool
 262 Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
 263 {
 264     const U8* x = s;
 265     const U8* send;
 266
 267     if (!len)
 268         len = strlen((const char *)s);
 269     send = s + len;
 270
 271     while (x < send) {
 272         STRLEN c;
 273          /* Inline the easy bits of is_utf8_char() here for speed... */
 274          if (UTF8_IS_INVARIANT(*x))
 275               c = 1;
 276          else if (!UTF8_IS_START(*x))
 277              goto out;
 278          else {
 279               /* ... and call is_utf8_char() only if really needed. */
 280 #ifdef IS_UTF8_CHAR
 281              c = UTF8SKIP(x);
 282              if (IS_UTF8_CHAR_FAST(c)) {
 283                  if (!IS_UTF8_CHAR(x, c))
 284                      goto out;
 285              } else if (!is_utf8_char_slow(x, c))
 286                  goto out;
 287 #else
 288              c = is_utf8_char(x);
 289 #endif /* #ifdef IS_UTF8_CHAR */
 290               if (!c)
 291                   goto out;
 292          }
 293         x += c;
 294     }
 295
 296  out:
 297     if (x != send)
 298         return FALSE;
 299
 300     return TRUE;
 301 }
 302
 303 /*
 304 Implemented as a macro in utf8.h
 305
 306 =for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep
 307
 308 Like is_utf8_string() but stores the location of the failure (in the
 309 case of "utf8ness failure") or the location s+len (in the case of
 310 "utf8ness success") in the C<ep>.
 311
 312 See also is_utf8_string_loclen() and is_utf8_string().
 313
 314 =for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
 315
 316 Like is_utf8_string() but stores the location of the failure (in the
 317 case of "utf8ness failure") or the location s+len (in the case of
 318 "utf8ness success") in the C<ep>, and the number of UTF-8
 319 encoded characters in the C<el>.
 320
 321 See also is_utf8_string_loc() and is_utf8_string().
 322
 323 =cut
 324 */
 325
 326 bool
 327 Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
 328 {
 329     const U8* x = s;
 330     const U8* send;
 331     STRLEN c;
 332
 333     if (!len)
 334         len = strlen((const char *)s);
 335     send = s + len;
 336     if (el)
 337         *el = 0;
 338
 339     while (x < send) {
 340          /* Inline the easy bits of is_utf8_char() here for speed... */
 341          if (UTF8_IS_INVARIANT(*x))
 342              c = 1;
 343          else if (!UTF8_IS_START(*x))
 344              goto out;
 345          else {
 346              /* ... and call is_utf8_char() only if really needed. */
 347 #ifdef IS_UTF8_CHAR
 348              c = UTF8SKIP(x);
 349              if (IS_UTF8_CHAR_FAST(c)) {
 350                  if (!IS_UTF8_CHAR(x, c))
 351                      c = 0;
 352              } else
 353                  c = is_utf8_char_slow(x, c);
 354 #else
 355              c = is_utf8_char(x);
 356 #endif /* #ifdef IS_UTF8_CHAR */
 357              if (!c)
 358                  goto out;
 359          }
 360          x += c;
 361          if (el)
 362              (*el)++;
 363     }
 364
 365  out:
 366     if (ep)
 367         *ep = x;
 368     if (x != send)
 369         return FALSE;
 370
 371     return TRUE;
 372 }
 373
 374 /*
 375
 376 =for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 377
 378 Bottom level UTF-8 decode routine.
 379 Returns the unicode code point value of the first character in the string C<s>
 380 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
 381 C<retlen> will be set to the length, in bytes, of that character.
 382
 383 If C<s> does not point to a well-formed UTF-8 character, the behaviour
 384 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 385 it is assumed that the caller will raise a warning, and this function
 386 will silently just set C<retlen> to C<-1> and return zero.  If the
 387 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 388 malformations will be given, C<retlen> will be set to the expected
 389 length of the UTF-8 character in bytes, and zero will be returned.
 390
 391 The C<flags> can also contain various flags to allow deviations from
 392 the strict UTF-8 encoding (see F<utf8.h>).
 393
 394 Most code should use utf8_to_uvchr() rather than call this directly.
 395
 396 =cut
 397 */
 398
 399 UV
 400 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 401 {
 402     dVAR;
 403     const U8 *s0 = s;
 404     UV uv = *s, ouv = 0;
 405     STRLEN len = 1;
 406     const bool dowarn = ckWARN_d(WARN_UTF8);
 407     const UV startbyte = *s;
 408     STRLEN expectlen = 0;
 409     U32 warning = 0;
 410
 411 /* This list is a superset of the UTF8_ALLOW_XXX. */
 412
 413 #define UTF8_WARN_EMPTY                          1
 414 #define UTF8_WARN_CONTINUATION                   2
 415 #define UTF8_WARN_NON_CONTINUATION               3
 416 #define UTF8_WARN_FE_FF                          4
 417 #define UTF8_WARN_SHORT                          5
 418 #define UTF8_WARN_OVERFLOW                       6
 419 #define UTF8_WARN_SURROGATE                      7
 420 #define UTF8_WARN_LONG                           8
 421 #define UTF8_WARN_FFFF                           9 /* Also FFFE. */
 422
 423     if (curlen == 0 &&
 424         !(flags & UTF8_ALLOW_EMPTY)) {
 425         warning = UTF8_WARN_EMPTY;
 426         goto malformed;
 427     }
 428
 429     if (UTF8_IS_INVARIANT(uv)) {
 430         if (retlen)
 431             *retlen = 1;
 432         return (UV) (NATIVE_TO_UTF(*s));
 433     }
 434
 435     if (UTF8_IS_CONTINUATION(uv) &&
 436         !(flags & UTF8_ALLOW_CONTINUATION)) {
 437         warning = UTF8_WARN_CONTINUATION;
 438         goto malformed;
 439     }
 440
 441     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 442         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 443         warning = UTF8_WARN_NON_CONTINUATION;
 444         goto malformed;
 445     }
 446
 447 #ifdef EBCDIC
 448     uv = NATIVE_TO_UTF(uv);
 449 #else
 450     if ((uv == 0xfe || uv == 0xff) &&
 451         !(flags & UTF8_ALLOW_FE_FF)) {
 452         warning = UTF8_WARN_FE_FF;
 453         goto malformed;
 454     }
 455 #endif
 456
 457     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 458     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 459     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 460     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 461 #ifdef EBCDIC
 462     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 463     else                        { len =  7; uv &= 0x01; }
 464 #else
 465     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 466     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 467     else                        { len = 13; uv = 0; } /* whoa! */
 468 #endif
 469
 470     if (retlen)
 471         *retlen = len;
 472
 473     expectlen = len;
 474
 475     if ((curlen < expectlen) &&
 476         !(flags & UTF8_ALLOW_SHORT)) {
 477         warning = UTF8_WARN_SHORT;
 478         goto malformed;
 479     }
 480
 481     len--;
 482     s++;
 483     ouv = uv;
 484
 485     while (len--) {
 486         if (!UTF8_IS_CONTINUATION(*s) &&
 487             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 488             s--;
 489             warning = UTF8_WARN_NON_CONTINUATION;
 490             goto malformed;
 491         }
 492         else
 493             uv = UTF8_ACCUMULATE(uv, *s);
 494         if (!(uv > ouv)) {
 495             /* These cannot be allowed. */
 496             if (uv == ouv) {
 497                 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
 498                     warning = UTF8_WARN_LONG;
 499                     goto malformed;
 500                 }
 501             }
 502             else { /* uv < ouv */
 503                 /* This cannot be allowed. */
 504                 warning = UTF8_WARN_OVERFLOW;
 505                 goto malformed;
 506             }
 507         }
 508         s++;
 509         ouv = uv;
 510     }
 511
 512     if (UNICODE_IS_SURROGATE(uv) &&
 513         !(flags & UTF8_ALLOW_SURROGATE)) {
 514         warning = UTF8_WARN_SURROGATE;
 515         goto malformed;
 516     } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
 517                !(flags & UTF8_ALLOW_LONG)) {
 518         warning = UTF8_WARN_LONG;
 519         goto malformed;
 520     } else if (UNICODE_IS_ILLEGAL(uv) &&
 521                !(flags & UTF8_ALLOW_FFFF)) {
 522         warning = UTF8_WARN_FFFF;
 523         goto malformed;
 524     }
 525
 526     return uv;
 527
 528 malformed:
 529
 530     if (flags & UTF8_CHECK_ONLY) {
 531         if (retlen)
 532             *retlen = -1;
 533         return 0;
 534     }
 535
 536     if (dowarn) {
 537         SV* const sv = sv_2mortal(newSVpvs("Malformed UTF-8 character "));
 538
 539         switch (warning) {
 540         case 0: /* Intentionally empty. */ break;
 541         case UTF8_WARN_EMPTY:
 542             sv_catpvs(sv, "(empty string)");
 543             break;
 544         case UTF8_WARN_CONTINUATION:
 545             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 546             break;
 547         case UTF8_WARN_NON_CONTINUATION:
 548             if (s == s0)
 549                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 550                            (UV)s[1], startbyte);
 551             else {
 552                 const int len = (int)(s-s0);
 553                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 554                            (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
 555             }
 556
 557             break;
 558         case UTF8_WARN_FE_FF:
 559             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 560             break;
 561         case UTF8_WARN_SHORT:
 562             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 563                            (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
 564             expectlen = curlen;         /* distance for caller to skip */
 565             break;
 566         case UTF8_WARN_OVERFLOW:
 567             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 568                            ouv, *s, startbyte);
 569             break;
 570         case UTF8_WARN_SURROGATE:
 571             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 572             break;
 573         case UTF8_WARN_LONG:
 574             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 575                            (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 576             break;
 577         case UTF8_WARN_FFFF:
 578             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 579             break;
 580         default:
 581             sv_catpvs(sv, "(unknown reason)");
 582             break;
 583         }
 584
 585         if (warning) {
 586             const char * const s = SvPVX_const(sv);
 587
 588             if (PL_op)
 589                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
 590                             "%s in %s", s,  OP_DESC(PL_op));
 591             else
 592                 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
 593         }
 594     }
 595
 596     if (retlen)
 597         *retlen = expectlen ? expectlen : len;
 598
 599     return 0;
 600 }
 601
 602 /*
 603 =for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
 604
 605 Returns the native character value of the first character in the string C<s>
 606 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 607 length, in bytes, of that character.
 608
 609 If C<s> does not point to a well-formed UTF-8 character, zero is
 610 returned and retlen is set, if possible, to -1.
 611
 612 =cut
 613 */
 614
 615 UV
 616 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
 617 {
 618     return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
 619                           ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 620 }
 621
 622 /*
 623 =for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
 624
 625 Returns the Unicode code point of the first character in the string C<s>
 626 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 627 length, in bytes, of that character.
 628
 629 This function should only be used when returned UV is considered
 630 an index into the Unicode semantic tables (e.g. swashes).
 631
 632 If C<s> does not point to a well-formed UTF-8 character, zero is
 633 returned and retlen is set, if possible, to -1.
 634
 635 =cut
 636 */
 637
 638 UV
 639 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
 640 {
 641     /* Call the low level routine asking for checks */
 642     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
 643                                ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 644 }
 645
 646 /*
 647 =for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
 648
 649 Return the length of the UTF-8 char encoded string C<s> in characters.
 650 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 651 up past C<e>, croaks.
 652
 653 =cut
 654 */
 655
 656 STRLEN
 657 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
 658 {
 659     dVAR;
 660     STRLEN len = 0;
 661
 662     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 663      * the bitops (especially ~) can create illegal UTF-8.
 664      * In other words: in Perl UTF-8 is not just for Unicode. */
 665
 666     if (e < s)
 667         goto warn_and_return;
 668     while (s < e) {
 669         const U8 t = UTF8SKIP(s);
 670         if (e - s < t) {
 671             warn_and_return:
 672             if (ckWARN_d(WARN_UTF8)) {
 673                 if (PL_op)
 674                     Perl_warner(aTHX_ packWARN(WARN_UTF8),
 675                             "%s in %s", unees, OP_DESC(PL_op));
 676                 else
 677                     Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 678             }
 679             return len;
 680         }
 681         s += t;
 682         len++;
 683     }
 684
 685     return len;
 686 }
 687
 688 /*
 689 =for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
 690
 691 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
 692 and C<b>.
 693
 694 WARNING: use only if you *know* that the pointers point inside the
 695 same UTF-8 buffer.
 696
 697 =cut
 698 */
 699
 700 IV
 701 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
 702 {
 703     dVAR;
 704     IV off = 0;
 705
 706     /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
 707      * the bitops (especially ~) can create illegal UTF-8.
 708      * In other words: in Perl UTF-8 is not just for Unicode. */
 709
 710     if (a < b) {
 711         while (a < b) {
 712             const U8 c = UTF8SKIP(a);
 713             if (b - a < c)
 714                 goto warn_and_return;
 715             a += c;
 716             off--;
 717         }
 718     }
 719     else {
 720         while (b < a) {
 721             const U8 c = UTF8SKIP(b);
 722
 723             if (a - b < c) {
 724                 warn_and_return:
 725                 if (ckWARN_d(WARN_UTF8)) {
 726                     if (PL_op)
 727                         Perl_warner(aTHX_ packWARN(WARN_UTF8),
 728                                     "%s in %s", unees, OP_DESC(PL_op));
 729                     else
 730                         Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
 731                 }
 732                 return off;
 733             }
 734             b += c;
 735             off++;
 736         }
 737     }
 738
 739     return off;
 740 }
 741
 742 /*
 743 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
 744
 745 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 746 forward or backward.
 747
 748 WARNING: do not use the following unless you *know* C<off> is within
 749 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 750 on the first byte of character or just after the last byte of a character.
 751
 752 =cut
 753 */
 754
 755 U8 *
 756 Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
 757 {
 758     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 759      * the bitops (especially ~) can create illegal UTF-8.
 760      * In other words: in Perl UTF-8 is not just for Unicode. */
 761
 762     if (off >= 0) {
 763         while (off--)
 764             s += UTF8SKIP(s);
 765     }
 766     else {
 767         while (off++) {
 768             s--;
 769             while (UTF8_IS_CONTINUATION(*s))
 770                 s--;
 771         }
 772     }
 773     return (U8 *)s;
 774 }
 775
 776 /*
 777 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 778
 779 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 780 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 781 updates len to contain the new length.
 782 Returns zero on failure, setting C<len> to -1.
 783
 784 =cut
 785 */
 786
 787 U8 *
 788 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 789 {
 790     U8 *send;
 791     U8 *d;
 792     U8 *save = s;
 793
 794     /* ensure valid UTF-8 and chars < 256 before updating string */
 795     for (send = s + *len; s < send; ) {
 796         U8 c = *s++;
 797
 798         if (!UTF8_IS_INVARIANT(c) &&
 799             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 800              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 801             *len = -1;
 802             return 0;
 803         }
 804     }
 805
 806     d = s = save;
 807     while (s < send) {
 808         STRLEN ulen;
 809         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 810         s += ulen;
 811     }
 812     *d = '\0';
 813     *len = d - save;
 814     return save;
 815 }
 816
 817 /*
 818 =for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
 819
 820 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
 821 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 822 the newly-created string, and updates C<len> to contain the new
 823 length.  Returns the original string if no conversion occurs, C<len>
 824 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 825 0 if C<s> is converted or contains all 7bit characters.
 826
 827 =cut
 828 */
 829
 830 U8 *
 831 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
 832 {
 833     U8 *d;
 834     const U8 *start = s;
 835     const U8 *send;
 836     I32 count = 0;
 837
 838     if (!*is_utf8)
 839         return (U8 *)start;
 840
 841     /* ensure valid UTF-8 and chars < 256 before converting string */
 842     for (send = s + *len; s < send;) {
 843         U8 c = *s++;
 844         if (!UTF8_IS_INVARIANT(c)) {
 845             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 846                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 847                 count++;
 848             else
 849                 return (U8 *)start;
 850         }
 851     }
 852
 853     *is_utf8 = 0;
 854
 855     Newxz(d, (*len) - count + 1, U8);
 856     s = start; start = d;
 857     while (s < send) {
 858         U8 c = *s++;
 859         if (!UTF8_IS_INVARIANT(c)) {
 860             /* Then it is two-byte encoded */
 861             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 862             c = ASCII_TO_NATIVE(c);
 863         }
 864         *d++ = c;
 865     }
 866     *d = '\0';
 867     *len = d - start;
 868     return (U8 *)start;
 869 }
 870
 871 /*
 872 =for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
 873
 874 Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
 875 Returns a pointer to the newly-created string, and sets C<len> to
 876 reflect the new length.
 877
 878 If you want to convert to UTF-8 from other encodings than ASCII,
 879 see sv_recode_to_utf8().
 880
 881 =cut
 882 */
 883
 884 U8*
 885 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
 886 {
 887     const U8 * const send = s + (*len);
 888     U8 *d;
 889     U8 *dst;
 890
 891     Newxz(d, (*len) * 2 + 1, U8);
 892     dst = d;
 893
 894     while (s < send) {
 895         const UV uv = NATIVE_TO_ASCII(*s++);
 896         if (UNI_IS_INVARIANT(uv))
 897             *d++ = (U8)UTF_TO_NATIVE(uv);
 898         else {
 899             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
 900             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
 901         }
 902     }
 903     *d = '\0';
 904     *len = d-dst;
 905     return dst;
 906 }
 907
 908 /*
 909  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 910  *
 911  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 912  * We optimize for native, for obvious reasons. */
 913
 914 U8*
 915 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 916 {
 917     U8* pend;
 918     U8* dstart = d;
 919
 920     if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
 921          d[0] = 0;
 922          *newlen = 1;
 923          return d;
 924     }
 925
 926     if (bytelen & 1)
 927         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
 928
 929     pend = p + bytelen;
 930
 931     while (p < pend) {
 932         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 933         p += 2;
 934         if (uv < 0x80) {
 935             *d++ = (U8)uv;
 936             continue;
 937         }
 938         if (uv < 0x800) {
 939             *d++ = (U8)(( uv >>  6)         | 0xc0);
 940             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 941             continue;
 942         }
 943         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 944             UV low = (p[0] << 8) + p[1];
 945             p += 2;
 946             if (low < 0xdc00 || low >= 0xdfff)
 947                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 948             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 949         }
 950         if (uv < 0x10000) {
 951             *d++ = (U8)(( uv >> 12)         | 0xe0);
 952             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 953             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 954             continue;
 955         }
 956         else {
 957             *d++ = (U8)(( uv >> 18)         | 0xf0);
 958             *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 959             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 960             *d++ = (U8)(( uv        & 0x3f) | 0x80);
 961             continue;
 962         }
 963     }
 964     *newlen = d - dstart;
 965     return d;
 966 }
 967
 968 /* Note: this one is slightly destructive of the source. */
 969
 970 U8*
 971 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 972 {
 973     U8* s = (U8*)p;
 974     U8* send = s + bytelen;
 975     while (s < send) {
 976         U8 tmp = s[0];
 977         s[0] = s[1];
 978         s[1] = tmp;
 979         s += 2;
 980     }
 981     return utf16_to_utf8(p, d, bytelen, newlen);
 982 }
 983
 984 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 985
 986 bool
 987 Perl_is_uni_alnum(pTHX_ UV c)
 988 {
 989     U8 tmpbuf[UTF8_MAXBYTES+1];
 990     uvchr_to_utf8(tmpbuf, c);
 991     return is_utf8_alnum(tmpbuf);
 992 }
 993
 994 bool
 995 Perl_is_uni_alnumc(pTHX_ UV c)
 996 {
 997     U8 tmpbuf[UTF8_MAXBYTES+1];
 998     uvchr_to_utf8(tmpbuf, c);
 999     return is_utf8_alnumc(tmpbuf);
1000 }
1001
1002 bool
1003 Perl_is_uni_idfirst(pTHX_ UV c)
1004 {
1005     U8 tmpbuf[UTF8_MAXBYTES+1];
1006     uvchr_to_utf8(tmpbuf, c);
1007     return is_utf8_idfirst(tmpbuf);
1008 }
1009
1010 bool
1011 Perl_is_uni_alpha(pTHX_ UV c)
1012 {
1013     U8 tmpbuf[UTF8_MAXBYTES+1];
1014     uvchr_to_utf8(tmpbuf, c);
1015     return is_utf8_alpha(tmpbuf);
1016 }
1017
1018 bool
1019 Perl_is_uni_ascii(pTHX_ UV c)
1020 {
1021     U8 tmpbuf[UTF8_MAXBYTES+1];
1022     uvchr_to_utf8(tmpbuf, c);
1023     return is_utf8_ascii(tmpbuf);
1024 }
1025
1026 bool
1027 Perl_is_uni_space(pTHX_ UV c)
1028 {
1029     U8 tmpbuf[UTF8_MAXBYTES+1];
1030     uvchr_to_utf8(tmpbuf, c);
1031     return is_utf8_space(tmpbuf);
1032 }
1033
1034 bool
1035 Perl_is_uni_digit(pTHX_ UV c)
1036 {
1037     U8 tmpbuf[UTF8_MAXBYTES+1];
1038     uvchr_to_utf8(tmpbuf, c);
1039     return is_utf8_digit(tmpbuf);
1040 }
1041
1042 bool
1043 Perl_is_uni_upper(pTHX_ UV c)
1044 {
1045     U8 tmpbuf[UTF8_MAXBYTES+1];
1046     uvchr_to_utf8(tmpbuf, c);
1047     return is_utf8_upper(tmpbuf);
1048 }
1049
1050 bool
1051 Perl_is_uni_lower(pTHX_ UV c)
1052 {
1053     U8 tmpbuf[UTF8_MAXBYTES+1];
1054     uvchr_to_utf8(tmpbuf, c);
1055     return is_utf8_lower(tmpbuf);
1056 }
1057
1058 bool
1059 Perl_is_uni_cntrl(pTHX_ UV c)
1060 {
1061     U8 tmpbuf[UTF8_MAXBYTES+1];
1062     uvchr_to_utf8(tmpbuf, c);
1063     return is_utf8_cntrl(tmpbuf);
1064 }
1065
1066 bool
1067 Perl_is_uni_graph(pTHX_ UV c)
1068 {
1069     U8 tmpbuf[UTF8_MAXBYTES+1];
1070     uvchr_to_utf8(tmpbuf, c);
1071     return is_utf8_graph(tmpbuf);
1072 }
1073
1074 bool
1075 Perl_is_uni_print(pTHX_ UV c)
1076 {
1077     U8 tmpbuf[UTF8_MAXBYTES+1];
1078     uvchr_to_utf8(tmpbuf, c);
1079     return is_utf8_print(tmpbuf);
1080 }
1081
1082 bool
1083 Perl_is_uni_punct(pTHX_ UV c)
1084 {
1085     U8 tmpbuf[UTF8_MAXBYTES+1];
1086     uvchr_to_utf8(tmpbuf, c);
1087     return is_utf8_punct(tmpbuf);
1088 }
1089
1090 bool
1091 Perl_is_uni_xdigit(pTHX_ UV c)
1092 {
1093     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1094     uvchr_to_utf8(tmpbuf, c);
1095     return is_utf8_xdigit(tmpbuf);
1096 }
1097
1098 UV
1099 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1100 {
1101     uvchr_to_utf8(p, c);
1102     return to_utf8_upper(p, p, lenp);
1103 }
1104
1105 UV
1106 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1107 {
1108     uvchr_to_utf8(p, c);
1109     return to_utf8_title(p, p, lenp);
1110 }
1111
1112 UV
1113 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1114 {
1115     uvchr_to_utf8(p, c);
1116     return to_utf8_lower(p, p, lenp);
1117 }
1118
1119 UV
1120 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1121 {
1122     uvchr_to_utf8(p, c);
1123     return to_utf8_fold(p, p, lenp);
1124 }
1125
1126 /* for now these all assume no locale info available for Unicode > 255 */
1127
1128 bool
1129 Perl_is_uni_alnum_lc(pTHX_ UV c)
1130 {
1131     return is_uni_alnum(c);     /* XXX no locale support yet */
1132 }
1133
1134 bool
1135 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1136 {
1137     return is_uni_alnumc(c);    /* XXX no locale support yet */
1138 }
1139
1140 bool
1141 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1142 {
1143     return is_uni_idfirst(c);   /* XXX no locale support yet */
1144 }
1145
1146 bool
1147 Perl_is_uni_alpha_lc(pTHX_ UV c)
1148 {
1149     return is_uni_alpha(c);     /* XXX no locale support yet */
1150 }
1151
1152 bool
1153 Perl_is_uni_ascii_lc(pTHX_ UV c)
1154 {
1155     return is_uni_ascii(c);     /* XXX no locale support yet */
1156 }
1157
1158 bool
1159 Perl_is_uni_space_lc(pTHX_ UV c)
1160 {
1161     return is_uni_space(c);     /* XXX no locale support yet */
1162 }
1163
1164 bool
1165 Perl_is_uni_digit_lc(pTHX_ UV c)
1166 {
1167     return is_uni_digit(c);     /* XXX no locale support yet */
1168 }
1169
1170 bool
1171 Perl_is_uni_upper_lc(pTHX_ UV c)
1172 {
1173     return is_uni_upper(c);     /* XXX no locale support yet */
1174 }
1175
1176 bool
1177 Perl_is_uni_lower_lc(pTHX_ UV c)
1178 {
1179     return is_uni_lower(c);     /* XXX no locale support yet */
1180 }
1181
1182 bool
1183 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1184 {
1185     return is_uni_cntrl(c);     /* XXX no locale support yet */
1186 }
1187
1188 bool
1189 Perl_is_uni_graph_lc(pTHX_ UV c)
1190 {
1191     return is_uni_graph(c);     /* XXX no locale support yet */
1192 }
1193
1194 bool
1195 Perl_is_uni_print_lc(pTHX_ UV c)
1196 {
1197     return is_uni_print(c);     /* XXX no locale support yet */
1198 }
1199
1200 bool
1201 Perl_is_uni_punct_lc(pTHX_ UV c)
1202 {
1203     return is_uni_punct(c);     /* XXX no locale support yet */
1204 }
1205
1206 bool
1207 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1208 {
1209     return is_uni_xdigit(c);    /* XXX no locale support yet */
1210 }
1211
1212 U32
1213 Perl_to_uni_upper_lc(pTHX_ U32 c)
1214 {
1215     /* XXX returns only the first character -- do not use XXX */
1216     /* XXX no locale support yet */
1217     STRLEN len;
1218     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1219     return (U32)to_uni_upper(c, tmpbuf, &len);
1220 }
1221
1222 U32
1223 Perl_to_uni_title_lc(pTHX_ U32 c)
1224 {
1225     /* XXX returns only the first character XXX -- do not use XXX */
1226     /* XXX no locale support yet */
1227     STRLEN len;
1228     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1229     return (U32)to_uni_title(c, tmpbuf, &len);
1230 }
1231
1232 U32
1233 Perl_to_uni_lower_lc(pTHX_ U32 c)
1234 {
1235     /* XXX returns only the first character -- do not use XXX */
1236     /* XXX no locale support yet */
1237     STRLEN len;
1238     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1239     return (U32)to_uni_lower(c, tmpbuf, &len);
1240 }
1241
1242 static bool
1243 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
1244                  const char *const swashname)
1245 {
1246     dVAR;
1247     if (!is_utf8_char(p))
1248         return FALSE;
1249     if (!*swash)
1250         *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
1251     return swash_fetch(*swash, p, TRUE) != 0;
1252 }
1253
1254 bool
1255 Perl_is_utf8_alnum(pTHX_ const U8 *p)
1256 {
1257     dVAR;
1258     /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1259      * descendant of isalnum(3), in other words, it doesn't
1260      * contain the '_'. --jhi */
1261     return S_is_utf8_common(aTHX_ p, &PL_utf8_alnum, "IsWord");
1262 }
1263
1264 bool
1265 Perl_is_utf8_alnumc(pTHX_ const U8 *p)
1266 {
1267     dVAR;
1268     return S_is_utf8_common(aTHX_ p, &PL_utf8_alnumc, "IsAlnumC");
1269 }
1270
1271 bool
1272 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1273 {
1274     dVAR;
1275     if (*p == '_')
1276         return TRUE;
1277     /* is_utf8_idstart would be more logical. */
1278     return S_is_utf8_common(aTHX_ p, &PL_utf8_idstart, "IdStart");
1279 }
1280
1281 bool
1282 Perl_is_utf8_idcont(pTHX_ const U8 *p)
1283 {
1284     dVAR;
1285     if (*p == '_')
1286         return TRUE;
1287     return S_is_utf8_common(aTHX_ p, &PL_utf8_idcont, "IdContinue");
1288 }
1289
1290 bool
1291 Perl_is_utf8_alpha(pTHX_ const U8 *p)
1292 {
1293     dVAR;
1294     return S_is_utf8_common(aTHX_ p, &PL_utf8_alpha, "IsAlpha");
1295 }
1296
1297 bool
1298 Perl_is_utf8_ascii(pTHX_ const U8 *p)
1299 {
1300     dVAR;
1301     return S_is_utf8_common(aTHX_ p, &PL_utf8_ascii, "IsAscii");
1302 }
1303
1304 bool
1305 Perl_is_utf8_space(pTHX_ const U8 *p)
1306 {
1307     dVAR;
1308     return S_is_utf8_common(aTHX_ p, &PL_utf8_space, "IsSpacePerl");
1309 }
1310
1311 bool
1312 Perl_is_utf8_digit(pTHX_ const U8 *p)
1313 {
1314     dVAR;
1315     return S_is_utf8_common(aTHX_ p, &PL_utf8_digit, "IsDigit");
1316 }
1317
1318 bool
1319 Perl_is_utf8_upper(pTHX_ const U8 *p)
1320 {
1321     dVAR;
1322     return S_is_utf8_common(aTHX_ p, &PL_utf8_upper, "IsUppercase");
1323 }
1324
1325 bool
1326 Perl_is_utf8_lower(pTHX_ const U8 *p)
1327 {
1328     dVAR;
1329     return S_is_utf8_common(aTHX_ p, &PL_utf8_lower, "IsLowercase");
1330 }
1331
1332 bool
1333 Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1334 {
1335     dVAR;
1336     return S_is_utf8_common(aTHX_ p, &PL_utf8_cntrl, "IsCntrl");
1337 }
1338
1339 bool
1340 Perl_is_utf8_graph(pTHX_ const U8 *p)
1341 {
1342     dVAR;
1343     return S_is_utf8_common(aTHX_ p, &PL_utf8_graph, "IsGraph");
1344 }
1345
1346 bool
1347 Perl_is_utf8_print(pTHX_ const U8 *p)
1348 {
1349     dVAR;
1350     return S_is_utf8_common(aTHX_ p, &PL_utf8_print, "IsPrint");
1351 }
1352
1353 bool
1354 Perl_is_utf8_punct(pTHX_ const U8 *p)
1355 {
1356     dVAR;
1357     return S_is_utf8_common(aTHX_ p, &PL_utf8_punct, "IsPunct");
1358 }
1359
1360 bool
1361 Perl_is_utf8_xdigit(pTHX_ const U8 *p)
1362 {
1363     dVAR;
1364     return S_is_utf8_common(aTHX_ p, &PL_utf8_xdigit, "Isxdigit");
1365 }
1366
1367 bool
1368 Perl_is_utf8_mark(pTHX_ const U8 *p)
1369 {
1370     dVAR;
1371     return S_is_utf8_common(aTHX_ p, &PL_utf8_mark, "IsM");
1372 }
1373
1374 /*
1375 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1376
1377 The "p" contains the pointer to the UTF-8 string encoding
1378 the character that is being converted.
1379
1380 The "ustrp" is a pointer to the character buffer to put the
1381 conversion result to.  The "lenp" is a pointer to the length
1382 of the result.
1383
1384 The "swashp" is a pointer to the swash to use.
1385
1386 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1387 and loaded by SWASHNEW, using lib/utf8_heavy.pl.  The special (usually,
1388 but not always, a multicharacter mapping), is tried first.
1389
1390 The "special" is a string like "utf8::ToSpecLower", which means the
1391 hash %utf8::ToSpecLower.  The access to the hash is through
1392 Perl_to_utf8_case().
1393
1394 The "normal" is a string like "ToLower" which means the swash
1395 %utf8::ToLower.
1396
1397 =cut */
1398
1399 UV
1400 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1401                         SV **swashp, const char *normal, const char *special)
1402 {
1403     dVAR;
1404     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1405     STRLEN len = 0;
1406
1407     const UV uv0 = utf8_to_uvchr(p, NULL);
1408     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1409      * are necessary in EBCDIC, they are redundant no-ops
1410      * in ASCII-ish platforms, and hopefully optimized away. */
1411     const UV uv1 = NATIVE_TO_UNI(uv0);
1412     uvuni_to_utf8(tmpbuf, uv1);
1413
1414     if (!*swashp) /* load on-demand */
1415          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1416
1417     /* The 0xDF is the only special casing Unicode code point below 0x100. */
1418     if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1419          /* It might be "special" (sometimes, but not always,
1420           * a multicharacter mapping) */
1421          HV *hv;
1422          SV **svp;
1423
1424          if ((hv  = get_hv(special, FALSE)) &&
1425              (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1426              (*svp)) {
1427              const char *s;
1428
1429               s = SvPV_const(*svp, len);
1430               if (len == 1)
1431                    len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1432               else {
1433 #ifdef EBCDIC
1434                    /* If we have EBCDIC we need to remap the characters
1435                     * since any characters in the low 256 are Unicode
1436                     * code points, not EBCDIC. */
1437                    U8 *t = (U8*)s, *tend = t + len, *d;
1438
1439                    d = tmpbuf;
1440                    if (SvUTF8(*svp)) {
1441                         STRLEN tlen = 0;
1442
1443                         while (t < tend) {
1444                              UV c = utf8_to_uvchr(t, &tlen);
1445                              if (tlen > 0) {
1446                                   d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1447                                   t += tlen;
1448                              }
1449                              else
1450                                   break;
1451                         }
1452                    }
1453                    else {
1454                         while (t < tend) {
1455                              d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1456                              t++;
1457                         }
1458                    }
1459                    len = d - tmpbuf;
1460                    Copy(tmpbuf, ustrp, len, U8);
1461 #else
1462                    Copy(s, ustrp, len, U8);
1463 #endif
1464               }
1465          }
1466     }
1467
1468     if (!len && *swashp) {
1469          UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1470
1471          if (uv2) {
1472               /* It was "normal" (a single character mapping). */
1473               UV uv3 = UNI_TO_NATIVE(uv2);
1474
1475               len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1476          }
1477     }
1478
1479     if (!len) /* Neither: just copy. */
1480          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1481
1482     if (lenp)
1483          *lenp = len;
1484
1485     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1486 }
1487
1488 /*
1489 =for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
1490
1491 Convert the UTF-8 encoded character at p to its uppercase version and
1492 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1493 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1494 the uppercase version may be longer than the original character.
1495
1496 The first character of the uppercased version is returned
1497 (but note, as explained above, that there may be more.)
1498
1499 =cut */
1500
1501 UV
1502 Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1503 {
1504     dVAR;
1505     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1506                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1507 }
1508
1509 /*
1510 =for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
1511
1512 Convert the UTF-8 encoded character at p to its titlecase version and
1513 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1514 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1515 titlecase version may be longer than the original character.
1516
1517 The first character of the titlecased version is returned
1518 (but note, as explained above, that there may be more.)
1519
1520 =cut */
1521
1522 UV
1523 Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1524 {
1525     dVAR;
1526     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1527                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1528 }
1529
1530 /*
1531 =for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
1532
1533 Convert the UTF-8 encoded character at p to its lowercase version and
1534 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1535 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1536 lowercase version may be longer than the original character.
1537
1538 The first character of the lowercased version is returned
1539 (but note, as explained above, that there may be more.)
1540
1541 =cut */
1542
1543 UV
1544 Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1545 {
1546     dVAR;
1547     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1548                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1549 }
1550
1551 /*
1552 =for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
1553
1554 Convert the UTF-8 encoded character at p to its foldcase version and
1555 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1556 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1557 foldcase version may be longer than the original character (up to
1558 three characters).
1559
1560 The first character of the foldcased version is returned
1561 (but note, as explained above, that there may be more.)
1562
1563 =cut */
1564
1565 UV
1566 Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1567 {
1568     dVAR;
1569     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1570                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1571 }
1572
1573 /* Note:
1574  * A "swash" is a swatch hash.
1575  * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1576  * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1577  * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1578  */
1579 SV*
1580 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
1581 {
1582     dVAR;
1583     SV* retval;
1584     SV* const tokenbufsv = sv_newmortal();
1585     dSP;
1586     const size_t pkg_len = strlen(pkg);
1587     const size_t name_len = strlen(name);
1588     HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
1589     SV* errsv_save;
1590
1591     PUSHSTACKi(PERLSI_MAGIC);
1592     ENTER;
1593     SAVEI32(PL_hints);
1594     PL_hints = 0;
1595     save_re_context();
1596     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1597         ENTER;
1598         errsv_save = newSVsv(ERRSV);
1599         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1600                          Nullsv);
1601         if (!SvTRUE(ERRSV))
1602             sv_setsv(ERRSV, errsv_save);
1603         SvREFCNT_dec(errsv_save);
1604         LEAVE;
1605     }
1606     SPAGAIN;
1607     PUSHMARK(SP);
1608     EXTEND(SP,5);
1609     PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1610     PUSHs(sv_2mortal(newSVpvn(name, name_len)));
1611     PUSHs(listsv);
1612     PUSHs(sv_2mortal(newSViv(minbits)));
1613     PUSHs(sv_2mortal(newSViv(none)));
1614     PUTBACK;
1615     if (IN_PERL_COMPILETIME) {
1616         /* XXX ought to be handled by lex_start */
1617         SAVEI32(PL_in_my);
1618         PL_in_my = 0;
1619         sv_setpv(tokenbufsv, PL_tokenbuf);
1620     }
1621     errsv_save = newSVsv(ERRSV);
1622     if (call_method("SWASHNEW", G_SCALAR))
1623         retval = newSVsv(*PL_stack_sp--);
1624     else
1625         retval = &PL_sv_undef;
1626     if (!SvTRUE(ERRSV))
1627         sv_setsv(ERRSV, errsv_save);
1628     SvREFCNT_dec(errsv_save);
1629     LEAVE;
1630     POPSTACK;
1631     if (IN_PERL_COMPILETIME) {
1632         STRLEN len;
1633         const char* const pv = SvPV_const(tokenbufsv, len);
1634
1635         Copy(pv, PL_tokenbuf, len+1, char);
1636         PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1637     }
1638     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1639         if (SvPOK(retval))
1640             Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1641                        retval);
1642         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1643     }
1644     return retval;
1645 }
1646
1647
1648 /* This API is wrong for special case conversions since we may need to
1649  * return several Unicode characters for a single Unicode character
1650  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1651  * the lower-level routine, and it is similarly broken for returning
1652  * multiple values.  --jhi */
1653 /* Now SWASHGET is recasted into S_swash_get in this file. */
1654
1655 /* Note:
1656  * Returns the value of property/mapping C<swash> for the first character
1657  * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1658  * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1659  * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1660  */
1661 UV
1662 Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
1663 {
1664     dVAR;
1665     HV* const hv = (HV*)SvRV(swash);
1666     U32 klen;
1667     U32 off;
1668     STRLEN slen;
1669     STRLEN needents;
1670     const U8 *tmps = NULL;
1671     U32 bit;
1672     SV *swatch;
1673     U8 tmputf8[2];
1674     UV c = NATIVE_TO_ASCII(*ptr);
1675
1676     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1677         tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1678         tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1679         ptr = tmputf8;
1680     }
1681     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1682      * then the "swatch" is a vec() for al the chars which start
1683      * with 0xAA..0xYY
1684      * So the key in the hash (klen) is length of encoded char -1
1685      */
1686     klen = UTF8SKIP(ptr) - 1;
1687     off  = ptr[klen];
1688
1689     if (klen == 0) {
1690       /* If char in invariant then swatch is for all the invariant chars
1691        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1692        */
1693         needents = UTF_CONTINUATION_MARK;
1694         off      = NATIVE_TO_UTF(ptr[klen]);
1695     }
1696     else {
1697       /* If char is encoded then swatch is for the prefix */
1698         needents = (1 << UTF_ACCUMULATION_SHIFT);
1699         off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1700     }
1701
1702     /*
1703      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1704      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1705      * it's nothing to sniff at.)  Pity we usually come through at least
1706      * two function calls to get here...
1707      *
1708      * NB: this code assumes that swatches are never modified, once generated!
1709      */
1710
1711     if (hv   == PL_last_swash_hv &&
1712         klen == PL_last_swash_klen &&
1713         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1714     {
1715         tmps = PL_last_swash_tmps;
1716         slen = PL_last_swash_slen;
1717     }
1718     else {
1719         /* Try our second-level swatch cache, kept in a hash. */
1720         SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1721
1722         /* If not cached, generate it via swash_get */
1723         if (!svp || !SvPOK(*svp)
1724                  || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
1725             /* We use utf8n_to_uvuni() as we want an index into
1726                Unicode tables, not a native character number.
1727              */
1728             const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1729                                            ckWARN(WARN_UTF8) ?
1730                                            0 : UTF8_ALLOW_ANY);
1731             swatch = swash_get(swash,
1732                     /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1733                                 (klen) ? (code_point & ~(needents - 1)) : 0,
1734                                 needents);
1735
1736             if (IN_PERL_COMPILETIME)
1737                 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1738
1739             svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
1740
1741             if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1742                      || (slen << 3) < needents)
1743                 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
1744         }
1745
1746         PL_last_swash_hv = hv;
1747         PL_last_swash_klen = klen;
1748         /* FIXME change interpvar.h?  */
1749         PL_last_swash_tmps = (U8 *) tmps;
1750         PL_last_swash_slen = slen;
1751         if (klen)
1752             Copy(ptr, PL_last_swash_key, klen, U8);
1753     }
1754
1755     switch ((int)((slen << 3) / needents)) {
1756     case 1:
1757         bit = 1 << (off & 7);
1758         off >>= 3;
1759         return (tmps[off] & bit) != 0;
1760     case 8:
1761         return tmps[off];
1762     case 16:
1763         off <<= 1;
1764         return (tmps[off] << 8) + tmps[off + 1] ;
1765     case 32:
1766         off <<= 2;
1767         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1768     }
1769     Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
1770     return 0;
1771 }
1772
1773 /* Note:
1774  * Returns a swatch (a bit vector string) for a code point sequence
1775  * that starts from the value C<start> and comprises the number C<span>.
1776  * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1777  * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1778  */
1779 STATIC SV*
1780 S_swash_get(pTHX_ SV* swash, UV start, UV span)
1781 {
1782     SV *swatch;
1783     U8 *l, *lend, *x, *xend, *s;
1784     STRLEN lcur, xcur, scur;
1785
1786     HV* const hv = (HV*)SvRV(swash);
1787     SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
1788     SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
1789     SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
1790     SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
1791     SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
1792     const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1793     const int  typeto  = typestr[0] == 'T' && typestr[1] == 'o';
1794     const STRLEN bits  = SvUV(*bitssvp);
1795     const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1796     const UV     none  = SvUV(*nonesvp);
1797     const UV     end   = start + span;
1798
1799     if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
1800         Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1801                                                  (UV)bits);
1802     }
1803
1804     /* create and initialize $swatch */
1805     swatch = newSVpvs("");
1806     scur   = octets ? (span * octets) : (span + 7) / 8;
1807     SvGROW(swatch, scur + 1);
1808     s = (U8*)SvPVX(swatch);
1809     if (octets && none) {
1810         const U8* const e = s + scur;
1811         while (s < e) {
1812             if (bits == 8)
1813                 *s++ = (U8)(none & 0xff);
1814             else if (bits == 16) {
1815                 *s++ = (U8)((none >>  8) & 0xff);
1816                 *s++ = (U8)( none        & 0xff);
1817             }
1818             else if (bits == 32) {
1819                 *s++ = (U8)((none >> 24) & 0xff);
1820                 *s++ = (U8)((none >> 16) & 0xff);
1821                 *s++ = (U8)((none >>  8) & 0xff);
1822                 *s++ = (U8)( none        & 0xff);
1823             }
1824         }
1825         *s = '\0';
1826     }
1827     else {
1828         (void)memzero((U8*)s, scur + 1);
1829     }
1830     SvCUR_set(swatch, scur);
1831     s = (U8*)SvPVX(swatch);
1832
1833     /* read $swash->{LIST} */
1834     l = (U8*)SvPV(*listsvp, lcur);
1835     lend = l + lcur;
1836     while (l < lend) {
1837         UV min, max, val, key;
1838         STRLEN numlen;
1839         I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1840
1841         U8* const nl = (U8*)memchr(l, '\n', lend - l);
1842
1843         numlen = lend - l;
1844         min = grok_hex((char *)l, &numlen, &flags, NULL);
1845         if (numlen)
1846             l += numlen;
1847         else if (nl) {
1848             l = nl + 1; /* 1 is length of "\n" */
1849             continue;
1850         }
1851         else {
1852             l = lend; /* to LIST's end at which \n is not found */
1853             break;
1854         }
1855
1856         if (isBLANK(*l)) {
1857             ++l;
1858             flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1859             numlen = lend - l;
1860             max = grok_hex((char *)l, &numlen, &flags, NULL);
1861             if (numlen)
1862                 l += numlen;
1863             else
1864                 max = min;
1865
1866             if (octets) {
1867                 if (isBLANK(*l)) {
1868                     ++l;
1869                     flags = PERL_SCAN_SILENT_ILLDIGIT |
1870                             PERL_SCAN_DISALLOW_PREFIX;
1871                     numlen = lend - l;
1872                     val = grok_hex((char *)l, &numlen, &flags, NULL);
1873                     if (numlen)
1874                         l += numlen;
1875                     else
1876                         val = 0;
1877                 }
1878                 else {
1879                     val = 0;
1880                     if (typeto) {
1881                         Perl_croak(aTHX_ "%s: illegal mapping '%s'",
1882                                          typestr, l);
1883                     }
1884                 }
1885             }
1886             else
1887                 val = 0; /* bits == 1, then val should be ignored */
1888         }
1889         else {
1890             max = min;
1891             if (octets) {
1892                 val = 0;
1893                 if (typeto) {
1894                     Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
1895                 }
1896             }
1897             else
1898                 val = 0; /* bits == 1, then val should be ignored */
1899         }
1900
1901         if (nl)
1902             l = nl + 1;
1903         else
1904             l = lend;
1905
1906         if (max < start)
1907             continue;
1908
1909         if (octets) {
1910             if (min < start) {
1911                 if (!none || val < none) {
1912                     val += start - min;
1913                 }
1914                 min = start;
1915             }
1916             for (key = min; key <= max; key++) {
1917                 STRLEN offset;
1918                 if (key >= end)
1919                     goto go_out_list;
1920                 /* offset must be non-negative (start <= min <= key < end) */
1921                 offset = octets * (key - start);
1922                 if (bits == 8)
1923                     s[offset] = (U8)(val & 0xff);
1924                 else if (bits == 16) {
1925                     s[offset    ] = (U8)((val >>  8) & 0xff);
1926                     s[offset + 1] = (U8)( val        & 0xff);
1927                 }
1928                 else if (bits == 32) {
1929                     s[offset    ] = (U8)((val >> 24) & 0xff);
1930                     s[offset + 1] = (U8)((val >> 16) & 0xff);
1931                     s[offset + 2] = (U8)((val >>  8) & 0xff);
1932                     s[offset + 3] = (U8)( val        & 0xff);
1933                 }
1934
1935                 if (!none || val < none)
1936                     ++val;
1937             }
1938         }
1939         else { /* bits == 1, then val should be ignored */
1940             if (min < start)
1941                 min = start;
1942             for (key = min; key <= max; key++) {
1943                 const STRLEN offset = (STRLEN)(key - start);
1944                 if (key >= end)
1945                     goto go_out_list;
1946                 s[offset >> 3] |= 1 << (offset & 7);
1947             }
1948         }
1949     } /* while */
1950   go_out_list:
1951
1952     /* read $swash->{EXTRAS} */
1953     x = (U8*)SvPV(*extssvp, xcur);
1954     xend = x + xcur;
1955     while (x < xend) {
1956         STRLEN namelen;
1957         U8 *namestr;
1958         SV** othersvp;
1959         HV* otherhv;
1960         STRLEN otherbits;
1961         SV **otherbitssvp, *other;
1962         U8 *s, *o, *nl;
1963         STRLEN slen, olen;
1964
1965         U8 opc = *x++;
1966         if (opc == '\n')
1967             continue;
1968
1969         nl = (U8*)memchr(x, '\n', xend - x);
1970
1971         if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
1972             if (nl) {
1973                 x = nl + 1; /* 1 is length of "\n" */
1974                 continue;
1975             }
1976             else {
1977                 x = xend; /* to EXTRAS' end at which \n is not found */
1978                 break;
1979             }
1980         }
1981
1982         namestr = x;
1983         if (nl) {
1984             namelen = nl - namestr;
1985             x = nl + 1;
1986         }
1987         else {
1988             namelen = xend - namestr;
1989             x = xend;
1990         }
1991
1992         othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
1993         otherhv = (HV*)SvRV(*othersvp);
1994         otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
1995         otherbits = (STRLEN)SvUV(*otherbitssvp);
1996         if (bits < otherbits)
1997             Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
1998
1999         /* The "other" swatch must be destroyed after. */
2000         other = swash_get(*othersvp, start, span);
2001         o = (U8*)SvPV(other, olen);
2002
2003         if (!olen)
2004             Perl_croak(aTHX_ "panic: swash_get got improper swatch");
2005
2006         s = (U8*)SvPV(swatch, slen);
2007         if (bits == 1 && otherbits == 1) {
2008             if (slen != olen)
2009                 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
2010
2011             switch (opc) {
2012             case '+':
2013                 while (slen--)
2014                     *s++ |= *o++;
2015                 break;
2016             case '!':
2017                 while (slen--)
2018                     *s++ |= ~*o++;
2019                 break;
2020             case '-':
2021                 while (slen--)
2022                     *s++ &= ~*o++;
2023                 break;
2024             case '&':
2025                 while (slen--)
2026                     *s++ &= *o++;
2027                 break;
2028             default:
2029                 break;
2030             }
2031         }
2032         else {
2033             STRLEN otheroctets = otherbits >> 3;
2034             STRLEN offset = 0;
2035             U8* send = s + slen;
2036
2037             while (s < send) {
2038                 UV otherval = 0;
2039
2040                 if (otherbits == 1) {
2041                     otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2042                     ++offset;
2043                 }
2044                 else {
2045                     STRLEN vlen = otheroctets;
2046                     otherval = *o++;
2047                     while (--vlen) {
2048                         otherval <<= 8;
2049                         otherval |= *o++;
2050                     }
2051                 }
2052
2053                 if (opc == '+' && otherval)
2054                     ; /* replace with otherval */
2055                 else if (opc == '!' && !otherval)
2056                     otherval = 1;
2057                 else if (opc == '-' && otherval)
2058                     otherval = 0;
2059                 else if (opc == '&' && !otherval)
2060                     otherval = 0;
2061                 else {
2062                     s += octets; /* no replacement */
2063                     continue;
2064                 }
2065
2066                 if (bits == 8)
2067                     *s++ = (U8)( otherval & 0xff);
2068                 else if (bits == 16) {
2069                     *s++ = (U8)((otherval >>  8) & 0xff);
2070                     *s++ = (U8)( otherval        & 0xff);
2071                 }
2072                 else if (bits == 32) {
2073                     *s++ = (U8)((otherval >> 24) & 0xff);
2074                     *s++ = (U8)((otherval >> 16) & 0xff);
2075                     *s++ = (U8)((otherval >>  8) & 0xff);
2076                     *s++ = (U8)( otherval        & 0xff);
2077                 }
2078             }
2079         }
2080         sv_free(other); /* through with it! */
2081     } /* while */
2082     return swatch;
2083 }
2084
2085 /*
2086 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2087
2088 Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2089 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2090 bytes available. The return value is the pointer to the byte after the
2091 end of the new character. In other words,
2092
2093     d = uvchr_to_utf8(d, uv);
2094
2095 is the recommended wide native character-aware way of saying
2096
2097     *(d++) = uv;
2098
2099 =cut
2100 */
2101
2102 /* On ASCII machines this is normally a macro but we want a
2103    real function in case XS code wants it
2104 */
2105 U8 *
2106 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2107 {
2108     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2109 }
2110
2111 U8 *
2112 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2113 {
2114     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2115 }
2116
2117 /*
2118 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32
2119 flags
2120
2121 Returns the native character value of the first character in the string
2122 C<s>
2123 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2124 length, in bytes, of that character.
2125
2126 Allows length and flags to be passed to low level routine.
2127
2128 =cut
2129 */
2130 /* On ASCII machines this is normally a macro but we want
2131    a real function in case XS code wants it
2132 */
2133 UV
2134 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2135 U32 flags)
2136 {
2137     const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2138     return UNI_TO_NATIVE(uv);
2139 }
2140
2141 /*
2142 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
2143
2144 Build to the scalar dsv a displayable version of the string spv,
2145 length len, the displayable version being at most pvlim bytes long
2146 (if longer, the rest is truncated and "..." will be appended).
2147
2148 The flags argument can have UNI_DISPLAY_ISPRINT set to display
2149 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
2150 to display the \\[nrfta\\] as the backslashed versions (like '\n')
2151 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2152 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2153 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2154
2155 The pointer to the PV of the dsv is returned.
2156
2157 =cut */
2158 char *
2159 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
2160 {
2161     int truncated = 0;
2162     const char *s, *e;
2163
2164     sv_setpvn(dsv, "", 0);
2165     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
2166          UV u;
2167           /* This serves double duty as a flag and a character to print after
2168              a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2169           */
2170          char ok = 0;
2171
2172          if (pvlim && SvCUR(dsv) >= pvlim) {
2173               truncated++;
2174               break;
2175          }
2176          u = utf8_to_uvchr((U8*)s, 0);
2177          if (u < 256) {
2178              const unsigned char c = (unsigned char)u & 0xFF;
2179              if (flags & UNI_DISPLAY_BACKSLASH) {
2180                  switch (c) {
2181                  case '\n':
2182                      ok = 'n'; break;
2183                  case '\r':
2184                      ok = 'r'; break;
2185                  case '\t':
2186                      ok = 't'; break;
2187                  case '\f':
2188                      ok = 'f'; break;
2189                  case '\a':
2190                      ok = 'a'; break;
2191                  case '\\':
2192                      ok = '\\'; break;
2193                  default: break;
2194                  }
2195                  if (ok) {
2196                      Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
2197                  }
2198              }
2199              /* isPRINT() is the locale-blind version. */
2200              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2201                  Perl_sv_catpvf(aTHX_ dsv, "%c", c);
2202                  ok = 1;
2203              }
2204          }
2205          if (!ok)
2206              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
2207     }
2208     if (truncated)
2209          sv_catpvs(dsv, "...");
2210
2211     return SvPVX(dsv);
2212 }
2213
2214 /*
2215 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
2216
2217 Build to the scalar dsv a displayable version of the scalar sv,
2218 the displayable version being at most pvlim bytes long
2219 (if longer, the rest is truncated and "..." will be appended).
2220
2221 The flags argument is as in pv_uni_display().
2222
2223 The pointer to the PV of the dsv is returned.
2224
2225 =cut */
2226 char *
2227 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2228 {
2229      return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2230                                 SvCUR(ssv), pvlim, flags);
2231 }
2232
2233 /*
2234 =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
2235
2236 Return true if the strings s1 and s2 differ case-insensitively, false
2237 if not (if they are equal case-insensitively).  If u1 is true, the
2238 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
2239 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
2240 are false, the respective string is assumed to be in native 8-bit
2241 encoding.
2242
2243 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2244 in there (they will point at the beginning of the I<next> character).
2245 If the pointers behind pe1 or pe2 are non-NULL, they are the end
2246 pointers beyond which scanning will not continue under any
2247 circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
2248 s2+l2 will be used as goal end pointers that will also stop the scan,
2249 and which qualify towards defining a successful match: all the scans
2250 that define an explicit length must reach their goal pointers for
2251 a match to succeed).
2252
2253 For case-insensitiveness, the "casefolding" of Unicode is used
2254 instead of upper/lowercasing both the characters, see
2255 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2256
2257 =cut */
2258 I32
2259 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
2260 {
2261      dVAR;
2262      register const U8 *p1  = (const U8*)s1;
2263      register const U8 *p2  = (const U8*)s2;
2264      register const U8 *f1 = NULL;
2265      register const U8 *f2 = NULL;
2266      register U8 *e1 = NULL;
2267      register U8 *q1 = NULL;
2268      register U8 *e2 = NULL;
2269      register U8 *q2 = NULL;
2270      STRLEN n1 = 0, n2 = 0;
2271      U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2272      U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
2273      U8 natbuf[1+1];
2274      STRLEN foldlen1, foldlen2;
2275      bool match;
2276
2277      if (pe1)
2278           e1 = *(U8**)pe1;
2279      if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2280           f1 = (const U8*)s1 + l1;
2281      if (pe2)
2282           e2 = *(U8**)pe2;
2283      if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2284           f2 = (const U8*)s2 + l2;
2285
2286      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2287           return 1; /* mismatch; possible infinite loop or false positive */
2288
2289      if (!u1 || !u2)
2290           natbuf[1] = 0; /* Need to terminate the buffer. */
2291
2292      while ((e1 == 0 || p1 < e1) &&
2293             (f1 == 0 || p1 < f1) &&
2294             (e2 == 0 || p2 < e2) &&
2295             (f2 == 0 || p2 < f2)) {
2296           if (n1 == 0) {
2297                if (u1)
2298                     to_utf8_fold(p1, foldbuf1, &foldlen1);
2299                else {
2300                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
2301                     to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2302                }
2303                q1 = foldbuf1;
2304                n1 = foldlen1;
2305           }
2306           if (n2 == 0) {
2307                if (u2)
2308                     to_utf8_fold(p2, foldbuf2, &foldlen2);
2309                else {
2310                     uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
2311                     to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2312                }
2313                q2 = foldbuf2;
2314                n2 = foldlen2;
2315           }
2316           while (n1 && n2) {
2317                if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2318                    (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2319                     memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2320                    return 1; /* mismatch */
2321                n1 -= UTF8SKIP(q1);
2322                q1 += UTF8SKIP(q1);
2323                n2 -= UTF8SKIP(q2);
2324                q2 += UTF8SKIP(q2);
2325           }
2326           if (n1 == 0)
2327                p1 += u1 ? UTF8SKIP(p1) : 1;
2328           if (n2 == 0)
2329                p2 += u2 ? UTF8SKIP(p2) : 1;
2330
2331      }
2332
2333      /* A match is defined by all the scans that specified
2334       * an explicit length reaching their final goals. */
2335      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2336
2337      if (match) {
2338           if (pe1)
2339                *pe1 = (char*)p1;
2340           if (pe2)
2341                *pe2 = (char*)p2;
2342      }
2343
2344      return match ? 0 : 1; /* 0 match, 1 mismatch */
2345 }
2346
2347 /*
2348  * Local variables:
2349  * c-indentation-style: bsd
2350  * c-basic-offset: 4
2351  * indent-tabs-mode: t
2352  * End:
2353  *
2354  * ex: set ts=8 sts=4 sw=4 noet:
2355  */