utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (c) 1998-2002, Larry Wall
   4  *
   5  *    You may distribute under the terms of either the GNU General Public
   6  *    License or the Artistic License, as specified in the README file.
   7  *
   8  */
   9
  10 /*
  11  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
  12  * heard of that we don't want to see any closer; and that's the one place
  13  * we're trying to get to!  And that's just where we can't get, nohow.'
  14  *
  15  * 'Well do I understand your speech,' he answered in the same language;
  16  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  17  * as is the custom in the West, if you wish to be answered?'
  18  *
  19  * ...the travellers perceived that the floor was paved with stones of many
  20  * hues; branching runes and strange devices intertwined beneath their feet.
  21  */
  22
  23 #include "EXTERN.h"
  24 #define PERL_IN_UTF8_C
  25 #include "perl.h"
  26
  27 /*
  28 =head1 Unicode Support
  29
  30 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
  31
  32 Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
   33 of the string C<d>; C<d> should have at least C<UTF8_MAXLEN+1> free
  34 bytes available. The return value is the pointer to the byte after the
  35 end of the new character. In other words,
  36
  37     d = uvuni_to_utf8_flags(d, uv, flags);
  38
  39 or, in most cases,
  40
  41     d = uvuni_to_utf8(d, uv);
  42
  43 (which is equivalent to)
  44
  45     d = uvuni_to_utf8_flags(d, uv, 0);
  46
  47 is the recommended Unicode-aware way of saying
  48
  49     *(d++) = uv;
  50
  51 =cut
  52 */
  53
  54 U8 *
  55 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
  56 {
  57     if (ckWARN(WARN_UTF8)) {
  58          if (UNICODE_IS_SURROGATE(uv) &&
  59              !(flags & UNICODE_ALLOW_SURROGATE))
  60               Perl_warner(aTHX_ WARN_UTF8, "UTF-16 surrogate 0x%04"UVxf, uv);
  61          else if (
  62                   ((uv >= 0xFDD0 && uv <= 0xFDEF &&
  63                     !(flags & UNICODE_ALLOW_FDD0))
  64                    ||
  65                    ((uv & 0xFFFF) == 0xFFFE &&
  66                     !(flags & UNICODE_ALLOW_FFFE))
  67                    ||
  68                    ((uv & 0xFFFF) == 0xFFFF &&
  69                     !(flags & UNICODE_ALLOW_FFFF))) &&
  70                   /* UNICODE_ALLOW_SUPER includes
  71                    * FFFEs and FFFFs beyond 0x10FFFF. */
  72                   ((uv <= PERL_UNICODE_MAX) ||
  73                    !(flags & UNICODE_ALLOW_SUPER))
  74                   )
  75               Perl_warner(aTHX_ WARN_UTF8,
  76                          "Unicode character 0x%04"UVxf" is illegal", uv);
  77     }
  78     if (UNI_IS_INVARIANT(uv)) {
  79         *d++ = UTF_TO_NATIVE(uv);
  80         return d;
  81     }
  82 #if defined(EBCDIC)
  83     else {
  84         STRLEN len  = UNISKIP(uv);
  85         U8 *p = d+len-1;
  86         while (p > d) {
  87             *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
  88             uv >>= UTF_ACCUMULATION_SHIFT;
  89         }
  90         *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
  91         return d+len;
  92     }
  93 #else /* Non loop style */
  94     if (uv < 0x800) {
  95         *d++ = (( uv >>  6)         | 0xc0);
  96         *d++ = (( uv        & 0x3f) | 0x80);
  97         return d;
  98     }
  99     if (uv < 0x10000) {
 100         *d++ = (( uv >> 12)         | 0xe0);
 101         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 102         *d++ = (( uv        & 0x3f) | 0x80);
 103         return d;
 104     }
 105     if (uv < 0x200000) {
 106         *d++ = (( uv >> 18)         | 0xf0);
 107         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 108         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 109         *d++ = (( uv        & 0x3f) | 0x80);
 110         return d;
 111     }
 112     if (uv < 0x4000000) {
 113         *d++ = (( uv >> 24)         | 0xf8);
 114         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 115         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 116         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 117         *d++ = (( uv        & 0x3f) | 0x80);
 118         return d;
 119     }
 120     if (uv < 0x80000000) {
 121         *d++ = (( uv >> 30)         | 0xfc);
 122         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 123         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 124         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 125         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 126         *d++ = (( uv        & 0x3f) | 0x80);
 127         return d;
 128     }
 129 #ifdef HAS_QUAD
 130     if (uv < UTF8_QUAD_MAX)
 131 #endif
 132     {
 133         *d++ =                        0xfe;     /* Can't match U+FEFF! */
 134         *d++ = (((uv >> 30) & 0x3f) | 0x80);
 135         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 136         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 137         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 138         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 139         *d++ = (( uv        & 0x3f) | 0x80);
 140         return d;
 141     }
 142 #ifdef HAS_QUAD
 143     {
 144         *d++ =                        0xff;     /* Can't match U+FFFE! */
 145         *d++ =                        0x80;     /* 6 Reserved bits */
 146         *d++ = (((uv >> 60) & 0x0f) | 0x80);    /* 2 Reserved bits */
 147         *d++ = (((uv >> 54) & 0x3f) | 0x80);
 148         *d++ = (((uv >> 48) & 0x3f) | 0x80);
 149         *d++ = (((uv >> 42) & 0x3f) | 0x80);
 150         *d++ = (((uv >> 36) & 0x3f) | 0x80);
 151         *d++ = (((uv >> 30) & 0x3f) | 0x80);
 152         *d++ = (((uv >> 24) & 0x3f) | 0x80);
 153         *d++ = (((uv >> 18) & 0x3f) | 0x80);
 154         *d++ = (((uv >> 12) & 0x3f) | 0x80);
 155         *d++ = (((uv >>  6) & 0x3f) | 0x80);
 156         *d++ = (( uv        & 0x3f) | 0x80);
 157         return d;
 158     }
 159 #endif
 160 #endif /* Loop style */
 161 }
 162
 163 U8 *
 164 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
 165 {
 166     return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
 167 }
 168
 169
 170 /*
 171 =for apidoc A|STRLEN|is_utf8_char|U8 *s
 172
 173 Tests if some arbitrary number of bytes begins in a valid UTF-8
 174 character.  Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
 175 The actual number of bytes in the UTF-8 character will be returned if
 176 it is valid, otherwise 0.
 177
 178 =cut
 179 */
 180 STRLEN
 181 Perl_is_utf8_char(pTHX_ U8 *s)
 182 {
 183     U8 u = *s;
 184     STRLEN slen, len;
 185     UV uv, ouv;
 186
 187     if (UTF8_IS_INVARIANT(u))
 188         return 1;
 189
 190     if (!UTF8_IS_START(u))
 191         return 0;
 192
 193     len = UTF8SKIP(s);
 194
 195     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
 196         return 0;
 197
 198     slen = len - 1;
 199     s++;
 200     u &= UTF_START_MASK(len);
 201     uv  = u;
 202     ouv = uv;
 203     while (slen--) {
 204         if (!UTF8_IS_CONTINUATION(*s))
 205             return 0;
 206         uv = UTF8_ACCUMULATE(uv, *s);
 207         if (uv < ouv)
 208             return 0;
 209         ouv = uv;
 210         s++;
 211     }
 212
 213     if (UNISKIP(uv) < len)
 214         return 0;
 215
 216     return len;
 217 }
 218
 219 /*
 220 =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
 221
 222 Returns true if first C<len> bytes of the given string form a valid UTF8
 223 string, false otherwise.  Note that 'a valid UTF8 string' does not mean
 224 'a string that contains UTF8' because a valid ASCII string is a valid
 225 UTF8 string.
 226
 227 =cut
 228 */
 229
 230 bool
 231 Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
 232 {
 233     U8* x = s;
 234     U8* send;
 235     STRLEN c;
 236
 237     if (!len)
 238         len = strlen((char *)s);
 239     send = s + len;
 240
 241     while (x < send) {
 242         c = is_utf8_char(x);
 243         if (!c)
 244             return FALSE;
 245         x += c;
 246     }
 247     if (x != send)
 248         return FALSE;
 249
 250     return TRUE;
 251 }
 252
 253 /*
 254 =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
 255
 256 Bottom level UTF-8 decode routine.
 257 Returns the unicode code point value of the first character in the string C<s>
 258 which is assumed to be in UTF8 encoding and no longer than C<curlen>;
 259 C<retlen> will be set to the length, in bytes, of that character.
 260
 261 If C<s> does not point to a well-formed UTF8 character, the behaviour
 262 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
 263 it is assumed that the caller will raise a warning, and this function
 264 will silently just set C<retlen> to C<-1> and return zero.  If the
 265 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
 266 malformations will be given, C<retlen> will be set to the expected
 267 length of the UTF-8 character in bytes, and zero will be returned.
 268
 269 The C<flags> can also contain various flags to allow deviations from
 270 the strict UTF-8 encoding (see F<utf8.h>).
 271
 272 Most code should use utf8_to_uvchr() rather than call this directly.
 273
 274 =cut
 275 */
 276
 277 UV
 278 Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 279 {
 280     U8 *s0 = s;
 281     UV uv = *s, ouv = 0;
 282     STRLEN len = 1;
 283     bool dowarn = ckWARN_d(WARN_UTF8);
 284     UV startbyte = *s;
 285     STRLEN expectlen = 0;
 286     U32 warning = 0;
 287
 288 /* This list is a superset of the UTF8_ALLOW_XXX. */
 289
 290 #define UTF8_WARN_EMPTY                          1
 291 #define UTF8_WARN_CONTINUATION                   2
 292 #define UTF8_WARN_NON_CONTINUATION               3
 293 #define UTF8_WARN_FE_FF                          4
 294 #define UTF8_WARN_SHORT                          5
 295 #define UTF8_WARN_OVERFLOW                       6
 296 #define UTF8_WARN_SURROGATE                      7
 297 #define UTF8_WARN_BOM                            8
 298 #define UTF8_WARN_LONG                           9
 299 #define UTF8_WARN_FFFF                          10
 300
 301     if (curlen == 0 &&
 302         !(flags & UTF8_ALLOW_EMPTY)) {
 303         warning = UTF8_WARN_EMPTY;
 304         goto malformed;
 305     }
 306
 307     if (UTF8_IS_INVARIANT(uv)) {
 308         if (retlen)
 309             *retlen = 1;
 310         return (UV) (NATIVE_TO_UTF(*s));
 311     }
 312
 313     if (UTF8_IS_CONTINUATION(uv) &&
 314         !(flags & UTF8_ALLOW_CONTINUATION)) {
 315         warning = UTF8_WARN_CONTINUATION;
 316         goto malformed;
 317     }
 318
 319     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
 320         !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 321         warning = UTF8_WARN_NON_CONTINUATION;
 322         goto malformed;
 323     }
 324
 325 #ifdef EBCDIC
 326     uv = NATIVE_TO_UTF(uv);
 327 #else
 328     if ((uv == 0xfe || uv == 0xff) &&
 329         !(flags & UTF8_ALLOW_FE_FF)) {
 330         warning = UTF8_WARN_FE_FF;
 331         goto malformed;
 332     }
 333 #endif
 334
 335     if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
 336     else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
 337     else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
 338     else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
 339 #ifdef EBCDIC
 340     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 341     else                        { len =  7; uv &= 0x01; }
 342 #else
 343     else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
 344     else if (!(uv & 0x01))      { len =  7; uv = 0; }
 345     else                        { len = 13; uv = 0; } /* whoa! */
 346 #endif
 347
 348     if (retlen)
 349         *retlen = len;
 350
 351     expectlen = len;
 352
 353     if ((curlen < expectlen) &&
 354         !(flags & UTF8_ALLOW_SHORT)) {
 355         warning = UTF8_WARN_SHORT;
 356         goto malformed;
 357     }
 358
 359     len--;
 360     s++;
 361     ouv = uv;
 362
 363     while (len--) {
 364         if (!UTF8_IS_CONTINUATION(*s) &&
 365             !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 366             s--;
 367             warning = UTF8_WARN_NON_CONTINUATION;
 368             goto malformed;
 369         }
 370         else
 371             uv = UTF8_ACCUMULATE(uv, *s);
 372         if (!(uv > ouv)) {
 373             /* These cannot be allowed. */
 374             if (uv == ouv) {
 375                 if (!(flags & UTF8_ALLOW_LONG)) {
 376                     warning = UTF8_WARN_LONG;
 377                     goto malformed;
 378                 }
 379             }
 380             else { /* uv < ouv */
 381                 /* This cannot be allowed. */
 382                 warning = UTF8_WARN_OVERFLOW;
 383                 goto malformed;
 384             }
 385         }
 386         s++;
 387         ouv = uv;
 388     }
 389
 390     if (UNICODE_IS_SURROGATE(uv) &&
 391         !(flags & UTF8_ALLOW_SURROGATE)) {
 392         warning = UTF8_WARN_SURROGATE;
 393         goto malformed;
 394     } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
 395                !(flags & UTF8_ALLOW_BOM)) {
 396         warning = UTF8_WARN_BOM;
 397         goto malformed;
 398     } else if ((expectlen > UNISKIP(uv)) &&
 399                !(flags & UTF8_ALLOW_LONG)) {
 400         warning = UTF8_WARN_LONG;
 401         goto malformed;
 402     } else if (UNICODE_IS_ILLEGAL(uv) &&
 403                !(flags & UTF8_ALLOW_FFFF)) {
 404         warning = UTF8_WARN_FFFF;
 405         goto malformed;
 406     }
 407
 408     return uv;
 409
 410 malformed:
 411
 412     if (flags & UTF8_CHECK_ONLY) {
 413         if (retlen)
 414             *retlen = -1;
 415         return 0;
 416     }
 417
 418     if (dowarn) {
 419         SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
 420
 421         switch (warning) {
 422         case 0: /* Intentionally empty. */ break;
 423         case UTF8_WARN_EMPTY:
 424             Perl_sv_catpvf(aTHX_ sv, "(empty string)");
 425             break;
 426         case UTF8_WARN_CONTINUATION:
 427             Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
 428             break;
 429         case UTF8_WARN_NON_CONTINUATION:
 430             if (s == s0)
 431                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
 432                            (UV)s[1], startbyte);
 433             else
 434                 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
 435                            (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
 436
 437             break;
 438         case UTF8_WARN_FE_FF:
 439             Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
 440             break;
 441         case UTF8_WARN_SHORT:
 442             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 443                            curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
 444             expectlen = curlen;         /* distance for caller to skip */
 445             break;
 446         case UTF8_WARN_OVERFLOW:
 447             Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
 448                            ouv, *s, startbyte);
 449             break;
 450         case UTF8_WARN_SURROGATE:
 451             Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
 452             break;
 453         case UTF8_WARN_BOM:
 454             Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
 455             break;
 456         case UTF8_WARN_LONG:
 457             Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
 458                            expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
 459             break;
 460         case UTF8_WARN_FFFF:
 461             Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
 462             break;
 463         default:
 464             Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
 465             break;
 466         }
 467
 468         if (warning) {
 469             char *s = SvPVX(sv);
 470
 471             if (PL_op)
 472                 Perl_warner(aTHX_ WARN_UTF8,
 473                             "%s in %s", s,  OP_DESC(PL_op));
 474             else
 475                 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
 476         }
 477     }
 478
 479     if (retlen)
 480         *retlen = expectlen ? expectlen : len;
 481
 482     return 0;
 483 }
 484
 485 /*
 486 =for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
 487
 488 Returns the native character value of the first character in the string C<s>
 489 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 490 length, in bytes, of that character.
 491
 492 If C<s> does not point to a well-formed UTF8 character, zero is
 493 returned and retlen is set, if possible, to -1.
 494
 495 =cut
 496 */
 497
 498 UV
 499 Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
 500 {
 501     return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
 502 }
 503
 504 /*
 505 =for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
 506
 507 Returns the Unicode code point of the first character in the string C<s>
 508 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
 509 length, in bytes, of that character.
 510
 511 This function should only be used when returned UV is considered
 512 an index into the Unicode semantic tables (e.g. swashes).
 513
 514 If C<s> does not point to a well-formed UTF8 character, zero is
 515 returned and retlen is set, if possible, to -1.
 516
 517 =cut
 518 */
 519
 520 UV
 521 Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
 522 {
 523     /* Call the low level routine asking for checks */
 524     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
 525 }
 526
 527 /*
 528 =for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
 529
 530 Return the length of the UTF-8 char encoded string C<s> in characters.
 531 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
 532 up past C<e>, croaks.
 533
 534 =cut
 535 */
 536
 537 STRLEN
 538 Perl_utf8_length(pTHX_ U8 *s, U8 *e)
 539 {
 540     STRLEN len = 0;
 541
 542     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
 543      * the bitops (especially ~) can create illegal UTF-8.
 544      * In other words: in Perl UTF-8 is not just for Unicode. */
 545
 546     if (e < s)
 547         Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
 548     while (s < e) {
 549         U8 t = UTF8SKIP(s);
 550
 551         if (e - s < t)
 552             Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
 553         s += t;
 554         len++;
 555     }
 556
 557     return len;
 558 }
 559
 560 /*
 561 =for apidoc A|IV|utf8_distance|U8 *a|U8 *b
 562
 563 Returns the number of UTF8 characters between the UTF-8 pointers C<a>
 564 and C<b>.
 565
 566 WARNING: use only if you *know* that the pointers point inside the
 567 same UTF-8 buffer.
 568
 569 =cut
 570 */
 571
 572 IV
 573 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
 574 {
 575     IV off = 0;
 576
 577     /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
 578      * the bitops (especially ~) can create illegal UTF-8.
 579      * In other words: in Perl UTF-8 is not just for Unicode. */
 580
 581     if (a < b) {
 582         while (a < b) {
 583             U8 c = UTF8SKIP(a);
 584
 585             if (b - a < c)
 586                 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
 587             a += c;
 588             off--;
 589         }
 590     }
 591     else {
 592         while (b < a) {
 593             U8 c = UTF8SKIP(b);
 594
 595             if (a - b < c)
 596                 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
 597             b += c;
 598             off++;
 599         }
 600     }
 601
 602     return off;
 603 }
 604
 605 /*
 606 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
 607
 608 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
 609 forward or backward.
 610
 611 WARNING: do not use the following unless you *know* C<off> is within
 612 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
 613 on the first byte of character or just after the last byte of a character.
 614
 615 =cut
 616 */
 617
 618 U8 *
 619 Perl_utf8_hop(pTHX_ U8 *s, I32 off)
 620 {
 621     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
 622      * the bitops (especially ~) can create illegal UTF-8.
 623      * In other words: in Perl UTF-8 is not just for Unicode. */
 624
 625     if (off >= 0) {
 626         while (off--)
 627             s += UTF8SKIP(s);
 628     }
 629     else {
 630         while (off++) {
 631             s--;
 632             while (UTF8_IS_CONTINUATION(*s))
 633                 s--;
 634         }
 635     }
 636     return s;
 637 }
 638
 639 /*
 640 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
 641
 642 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
 643 Unlike C<bytes_to_utf8>, this over-writes the original string, and
 644 updates len to contain the new length.
 645 Returns zero on failure, setting C<len> to -1.
 646
 647 =cut
 648 */
 649
 650 U8 *
 651 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
 652 {
 653     U8 *send;
 654     U8 *d;
 655     U8 *save = s;
 656
 657     /* ensure valid UTF8 and chars < 256 before updating string */
 658     for (send = s + *len; s < send; ) {
 659         U8 c = *s++;
 660
 661         if (!UTF8_IS_INVARIANT(c) &&
 662             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
 663              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
 664             *len = -1;
 665             return 0;
 666         }
 667     }
 668
 669     d = s = save;
 670     while (s < send) {
 671         STRLEN ulen;
 672         *d++ = (U8)utf8_to_uvchr(s, &ulen);
 673         s += ulen;
 674     }
 675     *d = '\0';
 676     *len = d - save;
 677     return save;
 678 }
 679
 680 /*
 681 =for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
 682
 683 Converts a string C<s> of length C<len> from UTF8 into byte encoding.
  684 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
 685 the newly-created string, and updates C<len> to contain the new
 686 length.  Returns the original string if no conversion occurs, C<len>
 687 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
 688 0 if C<s> is converted or contains all 7bit characters.
 689
 690 =cut
 691 */
 692
 693 U8 *
 694 Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
 695 {
 696     U8 *d;
 697     U8 *start = s;
 698     U8 *send;
 699     I32 count = 0;
 700
 701     if (!*is_utf8)
 702         return start;
 703
 704     /* ensure valid UTF8 and chars < 256 before converting string */
 705     for (send = s + *len; s < send;) {
 706         U8 c = *s++;
 707         if (!UTF8_IS_INVARIANT(c)) {
 708             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
 709                 (c = *s++) && UTF8_IS_CONTINUATION(c))
 710                 count++;
 711             else
 712                 return start;
 713         }
 714     }
 715
 716     *is_utf8 = 0;
 717
 718     Newz(801, d, (*len) - count + 1, U8);
 719     s = start; start = d;
 720     while (s < send) {
 721         U8 c = *s++;
 722         if (!UTF8_IS_INVARIANT(c)) {
 723             /* Then it is two-byte encoded */
 724             c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
 725             c = ASCII_TO_NATIVE(c);
 726         }
 727         *d++ = c;
 728     }
 729     *d = '\0';
 730     *len = d - start;
 731     return start;
 732 }
 733
 734 /*
 735 =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
 736
  737 Converts a string C<s> of length C<len> from native 8-bit bytes into UTF8 encoding.
 738 Returns a pointer to the newly-created string, and sets C<len> to
 739 reflect the new length.
 740
 741 =cut
 742 */
 743
 744 U8*
 745 Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
 746 {
 747     U8 *send;
 748     U8 *d;
 749     U8 *dst;
 750     send = s + (*len);
 751
 752     Newz(801, d, (*len) * 2 + 1, U8);
 753     dst = d;
 754
 755     while (s < send) {
 756         UV uv = NATIVE_TO_ASCII(*s++);
 757         if (UNI_IS_INVARIANT(uv))
 758             *d++ = UTF_TO_NATIVE(uv);
 759         else {
 760             *d++ = UTF8_EIGHT_BIT_HI(uv);
 761             *d++ = UTF8_EIGHT_BIT_LO(uv);
 762         }
 763     }
 764     *d = '\0';
 765     *len = d-dst;
 766     return dst;
 767 }
 768
 769 /*
 770  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
 771  *
 772  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
 773  * We optimize for native, for obvious reasons. */
 774
 775 U8*
 776 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 777 {
 778     U8* pend;
 779     U8* dstart = d;
 780
 781     if (bytelen & 1)
 782         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
 783
 784     pend = p + bytelen;
 785
 786     while (p < pend) {
 787         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
 788         p += 2;
 789         if (uv < 0x80) {
 790             *d++ = uv;
 791             continue;
 792         }
 793         if (uv < 0x800) {
 794             *d++ = (( uv >>  6)         | 0xc0);
 795             *d++ = (( uv        & 0x3f) | 0x80);
 796             continue;
 797         }
 798         if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
 799             UV low = *p++;
 800             if (low < 0xdc00 || low >= 0xdfff)
 801                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
 802             uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
 803         }
 804         if (uv < 0x10000) {
 805             *d++ = (( uv >> 12)         | 0xe0);
 806             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 807             *d++ = (( uv        & 0x3f) | 0x80);
 808             continue;
 809         }
 810         else {
 811             *d++ = (( uv >> 18)         | 0xf0);
 812             *d++ = (((uv >> 12) & 0x3f) | 0x80);
 813             *d++ = (((uv >>  6) & 0x3f) | 0x80);
 814             *d++ = (( uv        & 0x3f) | 0x80);
 815             continue;
 816         }
 817     }
 818     *newlen = d - dstart;
 819     return d;
 820 }
 821
 822 /* Note: this one is slightly destructive of the source. */
 823
 824 U8*
 825 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
 826 {
 827     U8* s = (U8*)p;
 828     U8* send = s + bytelen;
 829     while (s < send) {
 830         U8 tmp = s[0];
 831         s[0] = s[1];
 832         s[1] = tmp;
 833         s += 2;
 834     }
 835     return utf16_to_utf8(p, d, bytelen, newlen);
 836 }
 837
 838 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
 839
 840 bool
 841 Perl_is_uni_alnum(pTHX_ UV c)
 842 {
 843     U8 tmpbuf[UTF8_MAXLEN+1];
 844     uvchr_to_utf8(tmpbuf, c);
 845     return is_utf8_alnum(tmpbuf);
 846 }
 847
 848 bool
 849 Perl_is_uni_alnumc(pTHX_ UV c)
 850 {
 851     U8 tmpbuf[UTF8_MAXLEN+1];
 852     uvchr_to_utf8(tmpbuf, c);
 853     return is_utf8_alnumc(tmpbuf);
 854 }
 855
 856 bool
 857 Perl_is_uni_idfirst(pTHX_ UV c)
 858 {
 859     U8 tmpbuf[UTF8_MAXLEN+1];
 860     uvchr_to_utf8(tmpbuf, c);
 861     return is_utf8_idfirst(tmpbuf);
 862 }
 863
 864 bool
 865 Perl_is_uni_alpha(pTHX_ UV c)
 866 {
 867     U8 tmpbuf[UTF8_MAXLEN+1];
 868     uvchr_to_utf8(tmpbuf, c);
 869     return is_utf8_alpha(tmpbuf);
 870 }
 871
 872 bool
 873 Perl_is_uni_ascii(pTHX_ UV c)
 874 {
 875     U8 tmpbuf[UTF8_MAXLEN+1];
 876     uvchr_to_utf8(tmpbuf, c);
 877     return is_utf8_ascii(tmpbuf);
 878 }
 879
 880 bool
 881 Perl_is_uni_space(pTHX_ UV c)
 882 {
 883     U8 tmpbuf[UTF8_MAXLEN+1];
 884     uvchr_to_utf8(tmpbuf, c);
 885     return is_utf8_space(tmpbuf);
 886 }
 887
 888 bool
 889 Perl_is_uni_digit(pTHX_ UV c)
 890 {
 891     U8 tmpbuf[UTF8_MAXLEN+1];
 892     uvchr_to_utf8(tmpbuf, c);
 893     return is_utf8_digit(tmpbuf);
 894 }
 895
 896 bool
 897 Perl_is_uni_upper(pTHX_ UV c)
 898 {
 899     U8 tmpbuf[UTF8_MAXLEN+1];
 900     uvchr_to_utf8(tmpbuf, c);
 901     return is_utf8_upper(tmpbuf);
 902 }
 903
 904 bool
 905 Perl_is_uni_lower(pTHX_ UV c)
 906 {
 907     U8 tmpbuf[UTF8_MAXLEN+1];
 908     uvchr_to_utf8(tmpbuf, c);
 909     return is_utf8_lower(tmpbuf);
 910 }
 911
 912 bool
 913 Perl_is_uni_cntrl(pTHX_ UV c)
 914 {
 915     U8 tmpbuf[UTF8_MAXLEN+1];
 916     uvchr_to_utf8(tmpbuf, c);
 917     return is_utf8_cntrl(tmpbuf);
 918 }
 919
 920 bool
 921 Perl_is_uni_graph(pTHX_ UV c)
 922 {
 923     U8 tmpbuf[UTF8_MAXLEN+1];
 924     uvchr_to_utf8(tmpbuf, c);
 925     return is_utf8_graph(tmpbuf);
 926 }
 927
 928 bool
 929 Perl_is_uni_print(pTHX_ UV c)
 930 {
 931     U8 tmpbuf[UTF8_MAXLEN+1];
 932     uvchr_to_utf8(tmpbuf, c);
 933     return is_utf8_print(tmpbuf);
 934 }
 935
 936 bool
 937 Perl_is_uni_punct(pTHX_ UV c)
 938 {
 939     U8 tmpbuf[UTF8_MAXLEN+1];
 940     uvchr_to_utf8(tmpbuf, c);
 941     return is_utf8_punct(tmpbuf);
 942 }
 943
 944 bool
 945 Perl_is_uni_xdigit(pTHX_ UV c)
 946 {
 947     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
 948     uvchr_to_utf8(tmpbuf, c);
 949     return is_utf8_xdigit(tmpbuf);
 950 }
 951
 952 UV
 953 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
 954 {
 955     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
 956     uvchr_to_utf8(tmpbuf, c);
 957     return to_utf8_upper(tmpbuf, p, lenp);
 958 }
 959
 960 UV
 961 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
 962 {
 963     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
 964     uvchr_to_utf8(tmpbuf, c);
 965     return to_utf8_title(tmpbuf, p, lenp);
 966 }
 967
 968 UV
 969 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
 970 {
 971     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
 972     uvchr_to_utf8(tmpbuf, c);
 973     return to_utf8_lower(tmpbuf, p, lenp);
 974 }
 975
 976 UV
 977 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
 978 {
 979     U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
 980     uvchr_to_utf8(tmpbuf, c);
 981     return to_utf8_fold(tmpbuf, p, lenp);
 982 }
 983
 984 /* for now these all assume no locale info available for Unicode > 255 */
 985
 986 bool
 987 Perl_is_uni_alnum_lc(pTHX_ UV c)
 988 {
 989     return is_uni_alnum(c);     /* XXX no locale support yet */
 990 }
 991
 992 bool
 993 Perl_is_uni_alnumc_lc(pTHX_ UV c)
 994 {
 995     return is_uni_alnumc(c);    /* XXX no locale support yet */
 996 }
 997
 998 bool
 999 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1000 {
1001     return is_uni_idfirst(c);   /* XXX no locale support yet */
1002 }
1003
1004 bool
1005 Perl_is_uni_alpha_lc(pTHX_ UV c)
1006 {
1007     return is_uni_alpha(c);     /* XXX no locale support yet */
1008 }
1009
1010 bool
1011 Perl_is_uni_ascii_lc(pTHX_ UV c)
1012 {
1013     return is_uni_ascii(c);     /* XXX no locale support yet */
1014 }
1015
1016 bool
1017 Perl_is_uni_space_lc(pTHX_ UV c)
1018 {
1019     return is_uni_space(c);     /* XXX no locale support yet */
1020 }
1021
1022 bool
1023 Perl_is_uni_digit_lc(pTHX_ UV c)
1024 {
1025     return is_uni_digit(c);     /* XXX no locale support yet */
1026 }
1027
1028 bool
1029 Perl_is_uni_upper_lc(pTHX_ UV c)
1030 {
1031     return is_uni_upper(c);     /* XXX no locale support yet */
1032 }
1033
1034 bool
1035 Perl_is_uni_lower_lc(pTHX_ UV c)
1036 {
1037     return is_uni_lower(c);     /* XXX no locale support yet */
1038 }
1039
1040 bool
1041 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1042 {
1043     return is_uni_cntrl(c);     /* XXX no locale support yet */
1044 }
1045
1046 bool
1047 Perl_is_uni_graph_lc(pTHX_ UV c)
1048 {
1049     return is_uni_graph(c);     /* XXX no locale support yet */
1050 }
1051
1052 bool
1053 Perl_is_uni_print_lc(pTHX_ UV c)
1054 {
1055     return is_uni_print(c);     /* XXX no locale support yet */
1056 }
1057
1058 bool
1059 Perl_is_uni_punct_lc(pTHX_ UV c)
1060 {
1061     return is_uni_punct(c);     /* XXX no locale support yet */
1062 }
1063
1064 bool
1065 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1066 {
1067     return is_uni_xdigit(c);    /* XXX no locale support yet */
1068 }
1069
1070 U32
1071 Perl_to_uni_upper_lc(pTHX_ U32 c)
1072 {
1073     /* XXX returns only the first character -- do not use XXX */
1074     /* XXX no locale support yet */
1075     STRLEN len;
1076     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1077     return (U32)to_uni_upper(c, tmpbuf, &len);
1078 }
1079
1080 U32
1081 Perl_to_uni_title_lc(pTHX_ U32 c)
1082 {
1083     /* XXX returns only the first character XXX -- do not use XXX */
1084     /* XXX no locale support yet */
1085     STRLEN len;
1086     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1087     return (U32)to_uni_title(c, tmpbuf, &len);
1088 }
1089
1090 U32
1091 Perl_to_uni_lower_lc(pTHX_ U32 c)
1092 {
1093     /* XXX returns only the first character -- do not use XXX */
1094     /* XXX no locale support yet */
1095     STRLEN len;
1096     U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1097     return (U32)to_uni_lower(c, tmpbuf, &len);
1098 }
1099
1100 bool
1101 Perl_is_utf8_alnum(pTHX_ U8 *p)
1102 {
1103     if (!is_utf8_char(p))
1104         return FALSE;
1105     if (!PL_utf8_alnum)
1106         /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1107          * descendant of isalnum(3), in other words, it doesn't
1108          * contain the '_'. --jhi */
1109         PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1110     return swash_fetch(PL_utf8_alnum, p, TRUE);
1111 /*    return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1112 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1113     if (!PL_utf8_alnum)
1114         PL_utf8_alnum = swash_init("utf8", "",
1115             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1116     return swash_fetch(PL_utf8_alnum, p, TRUE);
1117 #endif
1118 }
1119
1120 bool
1121 Perl_is_utf8_alnumc(pTHX_ U8 *p)
1122 {
1123     if (!is_utf8_char(p))
1124         return FALSE;
1125     if (!PL_utf8_alnum)
1126         PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1127     return swash_fetch(PL_utf8_alnum, p, TRUE);
1128 /*    return is_utf8_alpha(p) || is_utf8_digit(p); */
1129 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1130     if (!PL_utf8_alnum)
1131         PL_utf8_alnum = swash_init("utf8", "",
1132             sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1133     return swash_fetch(PL_utf8_alnum, p, TRUE);
1134 #endif
1135 }
1136
1137 bool
1138 Perl_is_utf8_idfirst(pTHX_ U8 *p)
1139 {
1140     return *p == '_' || is_utf8_alpha(p);
1141 }
1142
1143 bool
1144 Perl_is_utf8_alpha(pTHX_ U8 *p)
1145 {
1146     if (!is_utf8_char(p))
1147         return FALSE;
1148     if (!PL_utf8_alpha)
1149         PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1150     return swash_fetch(PL_utf8_alpha, p, TRUE);
1151 }
1152
1153 bool
1154 Perl_is_utf8_ascii(pTHX_ U8 *p)
1155 {
1156     if (!is_utf8_char(p))
1157         return FALSE;
1158     if (!PL_utf8_ascii)
1159         PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1160     return swash_fetch(PL_utf8_ascii, p, TRUE);
1161 }
1162
1163 bool
1164 Perl_is_utf8_space(pTHX_ U8 *p)
1165 {
1166     if (!is_utf8_char(p))
1167         return FALSE;
1168     if (!PL_utf8_space)
1169         PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1170     return swash_fetch(PL_utf8_space, p, TRUE);
1171 }
1172
1173 bool
1174 Perl_is_utf8_digit(pTHX_ U8 *p)
1175 {
1176     if (!is_utf8_char(p))
1177         return FALSE;
1178     if (!PL_utf8_digit)
1179         PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1180     return swash_fetch(PL_utf8_digit, p, TRUE);
1181 }
1182
1183 bool
1184 Perl_is_utf8_upper(pTHX_ U8 *p)
1185 {
1186     if (!is_utf8_char(p))
1187         return FALSE;
1188     if (!PL_utf8_upper)
1189         PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1190     return swash_fetch(PL_utf8_upper, p, TRUE);
1191 }
1192
1193 bool
1194 Perl_is_utf8_lower(pTHX_ U8 *p)
1195 {
1196     if (!is_utf8_char(p))
1197         return FALSE;
1198     if (!PL_utf8_lower)
1199         PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1200     return swash_fetch(PL_utf8_lower, p, TRUE);
1201 }
1202
1203 bool
1204 Perl_is_utf8_cntrl(pTHX_ U8 *p)
1205 {
1206     if (!is_utf8_char(p))
1207         return FALSE;
1208     if (!PL_utf8_cntrl)
1209         PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1210     return swash_fetch(PL_utf8_cntrl, p, TRUE);
1211 }
1212
1213 bool
1214 Perl_is_utf8_graph(pTHX_ U8 *p)
1215 {
1216     if (!is_utf8_char(p))
1217         return FALSE;
1218     if (!PL_utf8_graph)
1219         PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1220     return swash_fetch(PL_utf8_graph, p, TRUE);
1221 }
1222
1223 bool
1224 Perl_is_utf8_print(pTHX_ U8 *p)
1225 {
1226     if (!is_utf8_char(p))
1227         return FALSE;
1228     if (!PL_utf8_print)
1229         PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1230     return swash_fetch(PL_utf8_print, p, TRUE);
1231 }
1232
1233 bool
1234 Perl_is_utf8_punct(pTHX_ U8 *p)
1235 {
1236     if (!is_utf8_char(p))
1237         return FALSE;
1238     if (!PL_utf8_punct)
1239         PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1240     return swash_fetch(PL_utf8_punct, p, TRUE);
1241 }
1242
1243 bool
1244 Perl_is_utf8_xdigit(pTHX_ U8 *p)
1245 {
1246     if (!is_utf8_char(p))
1247         return FALSE;
1248     if (!PL_utf8_xdigit)
1249         PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1250     return swash_fetch(PL_utf8_xdigit, p, TRUE);
1251 }
1252
1253 bool
1254 Perl_is_utf8_mark(pTHX_ U8 *p)
1255 {
1256     if (!is_utf8_char(p))
1257         return FALSE;
1258     if (!PL_utf8_mark)
1259         PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1260     return swash_fetch(PL_utf8_mark, p, TRUE);
1261 }
1262
1263 /*
1264 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1265
1266 The "p" contains the pointer to the UTF-8 string encoding
1267 the character that is being converted.
1268
1269 The "ustrp" is a pointer to the character buffer to put the
1270 conversion result to.  The "lenp" is a pointer to the length
1271 of the result.
1272
1273 The "swash" is a pointer to the swash to use.
1274
1275 The "normal" is a string like "ToLower" which means the swash
1276 $utf8::ToLower, which is stored in lib/unicore/To/Lower.pl,
1277 and loaded by SWASHGET, using lib/utf8_heavy.pl.
1278
1279 The "special" is a string like "utf8::ToSpecLower", which means
1280 the hash %utf8::ToSpecLower, which is stored in the same file,
1281 lib/unicore/To/Lower.pl, and also loaded by SWASHGET.  The access
1282 to the hash is by Perl_to_utf8_case().
1283
1284 =cut
1285  */
1286
1287 UV
1288 Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
1289 {
1290     UV uv0, uv1, uv2;
1291     U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *d;
1292     char *s = NULL;
1293     STRLEN len;
1294     bool has_utf8 = FALSE;
1295
1296     if (!*swashp)
1297         *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1298     uv0 = utf8_to_uvchr(p, 0);
1299     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1300      * are necessary in EBCDIC, they are redundant no-ops
1301      * in ASCII-ish platforms, and hopefully optimized away. */
1302     uv1 = NATIVE_TO_UNI(uv0);
1303     uvuni_to_utf8(tmpbuf, uv1);
1304     uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1305     if (uv2) {
1306          /* It was "normal" (a single character mapping). */
1307          d = uvuni_to_utf8(ustrp, uv2);
1308          has_utf8 = !UNI_IS_INVARIANT(uv2);
1309     }
1310     else {
1311          /* It might be "special" (sometimes, but not always,
1312           * a multicharacter mapping) */
1313          HV *hv;
1314          SV *keysv;
1315          HE *he;
1316          SV *val;
1317
1318          if ((hv    = get_hv(special, FALSE)) &&
1319              (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv1))) &&
1320              (he    = hv_fetch_ent(hv, keysv, FALSE, 0)) &&
1321              (val   = HeVAL(he))) {
1322
1323               s = SvPV(val, len);
1324               if (len == 1)
1325                    d = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s));
1326               else {
1327                    Copy(s, ustrp, len, U8);
1328                    d = ustrp + len;
1329               }
1330               if (SvUTF8(val))
1331                    has_utf8 = TRUE;
1332          }
1333          else {
1334               /* It was not "special", either. */
1335               d = uvuni_to_utf8(ustrp, uv1);
1336               has_utf8 = !UNI_IS_INVARIANT(uv1);
1337          }
1338     }
1339
1340     len = d - ustrp;
1341
1342 #ifdef EBCDIC
1343     {
1344          /* If we have EBCDIC we need to remap the characters since
1345           * any characters in the low 256 are in Unicode code points,
1346           * not EBCDIC. */
1347          U8 *t, *tend;
1348
1349          d = tmpbuf;
1350          if (has_utf8) {
1351               STRLEN tlen = 0;
1352
1353               for (t = ustrp, tend = t + len;
1354                    t < tend; t += tlen) {
1355                    UV c = utf8_to_uvchr(t, &tlen);
1356
1357                    if (tlen > 0)
1358                         d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1359                    else
1360                         break;
1361               }
1362          } else {
1363               for (t = ustrp, tend = t + len;
1364                    t < tend; t++)
1365                    d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1366          }
1367          len = d - tmpbuf;
1368          Copy(tmpbuf, ustrp, len, U8);
1369     }
1370 #endif
1371
1372     if (lenp)
1373          *lenp = len;
1374
1375     return utf8_to_uvchr(ustrp, 0);
1376 }
1377
1378 /*
1379 =for apidoc A|UV|to_utf8_upper|U8 *p|U8 *ustrp|STRLEN *lenp
1380
1381 Convert the UTF-8 encoded character at p to its uppercase version and
1382 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1383 that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1384 uppercase version may be longer than the original character (up to two
1385 characters).
1386
1387 The first character of the uppercased version is returned
1388 (but note, as explained above, that there may be more.)
1389
1390 =cut */
1391
1392 UV
1393 Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1394 {
1395     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1396                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1397 }
1398
1399 /*
1400 =for apidoc A|UV|to_utf8_title|U8 *p|U8 *ustrp|STRLEN *lenp
1401
1402 Convert the UTF-8 encoded character at p to its titlecase version and
1403 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1404 that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1405 titlecase version may be longer than the original character (up to two
1406 characters).
1407
1408 The first character of the titlecased version is returned
1409 (but note, as explained above, that there may be more.)
1410
1411 =cut */
1412
1413 UV
1414 Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1415 {
1416     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1417                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1418 }
1419
1420 /*
1421 =for apidoc A|UV|to_utf8_lower|U8 *p|U8 *ustrp|STRLEN *lenp
1422
1423 Convert the UTF-8 encoded character at p to its lowercase version and
1424 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1425 that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1426 lowercase version may be longer than the original character (up to two
1427 characters).
1428
1429 The first character of the lowercased version is returned
1430 (but note, as explained above, that there may be more.)
1431
1432 =cut */
1433
1434 UV
1435 Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1436 {
1437     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1438                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1439 }
1440
1441 /*
1442 =for apidoc A|UV|to_utf8_fold|U8 *p|U8 *ustrp|STRLEN *lenp
1443
1444 Convert the UTF-8 encoded character at p to its foldcase version and
1445 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1446 that the ustrp needs to be at least UTF8_MAXLEN_FOLD+1 bytes since the
1447 foldcase version may be longer than the original character (up to
1448 three characters).
1449
1450 The first character of the foldcased version is returned
1451 (but note, as explained above, that there may be more.)
1452
1453 =cut */
1454
1455 UV
1456 Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1457 {
1458     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1459                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1460 }
1461
1462 /* a "swash" is a swatch hash */
1463
1464 SV*
1465 Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1466 {
1467     SV* retval;
1468     SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1469     dSP;
1470     HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
1471     SV* errsv_save;
1472
1473     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {      /* demand load utf8 */
1474         ENTER;
1475         errsv_save = newSVsv(ERRSV);
1476         Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1477         if (!SvTRUE(ERRSV))
1478             sv_setsv(ERRSV, errsv_save);
1479         SvREFCNT_dec(errsv_save);
1480         LEAVE;
1481     }
1482     SPAGAIN;
1483     PUSHSTACKi(PERLSI_MAGIC);
1484     PUSHMARK(SP);
1485     EXTEND(SP,5);
1486     PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1487     PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1488     PUSHs(listsv);
1489     PUSHs(sv_2mortal(newSViv(minbits)));
1490     PUSHs(sv_2mortal(newSViv(none)));
1491     PUTBACK;
1492     ENTER;
1493     SAVEI32(PL_hints);
1494     PL_hints = 0;
1495     save_re_context();
1496     if (PL_curcop == &PL_compiling)
1497         /* XXX ought to be handled by lex_start */
1498         sv_setpv(tokenbufsv, PL_tokenbuf);
1499     errsv_save = newSVsv(ERRSV);
1500     if (call_method("SWASHNEW", G_SCALAR))
1501         retval = newSVsv(*PL_stack_sp--);
1502     else
1503         retval = &PL_sv_undef;
1504     if (!SvTRUE(ERRSV))
1505         sv_setsv(ERRSV, errsv_save);
1506     SvREFCNT_dec(errsv_save);
1507     LEAVE;
1508     POPSTACK;
1509     if (PL_curcop == &PL_compiling) {
1510         STRLEN len;
1511         char* pv = SvPV(tokenbufsv, len);
1512
1513         Copy(pv, PL_tokenbuf, len+1, char);
1514         PL_curcop->op_private = PL_hints;
1515     }
1516     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
1517         Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1518     return retval;
1519 }
1520
1521
1522 /* This API is wrong for special case conversions since we may need to
1523  * return several Unicode characters for a single Unicode character
1524  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1525  * the lower-level routine, and it is similarly broken for returning
1526  * multiple values.  --jhi */
1527 UV
1528 Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
1529 {
1530     HV* hv = (HV*)SvRV(sv);
1531     U32 klen;
1532     U32 off;
1533     STRLEN slen;
1534     STRLEN needents;
1535     U8 *tmps = NULL;
1536     U32 bit;
1537     SV *retval;
1538     U8 tmputf8[2];
1539     UV c = NATIVE_TO_ASCII(*ptr);
1540
1541     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1542         tmputf8[0] = UTF8_EIGHT_BIT_HI(c);
1543         tmputf8[1] = UTF8_EIGHT_BIT_LO(c);
1544         ptr = tmputf8;
1545     }
1546     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1547      * then the "swatch" is a vec() for al the chars which start
1548      * with 0xAA..0xYY
1549      * So the key in the hash (klen) is length of encoded char -1
1550      */
1551     klen = UTF8SKIP(ptr) - 1;
1552     off  = ptr[klen];
1553
1554     if (klen == 0)
1555      {
1556       /* If char in invariant then swatch is for all the invariant chars
1557        * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1558        */
1559       needents = UTF_CONTINUATION_MARK;
1560       off      = NATIVE_TO_UTF(ptr[klen]);
1561      }
1562     else
1563      {
1564       /* If char is encoded then swatch is for the prefix */
1565       needents = (1 << UTF_ACCUMULATION_SHIFT);
1566       off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1567      }
1568
1569     /*
1570      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1571      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1572      * it's nothing to sniff at.)  Pity we usually come through at least
1573      * two function calls to get here...
1574      *
1575      * NB: this code assumes that swatches are never modified, once generated!
1576      */
1577
1578     if (hv   == PL_last_swash_hv &&
1579         klen == PL_last_swash_klen &&
1580         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1581     {
1582         tmps = PL_last_swash_tmps;
1583         slen = PL_last_swash_slen;
1584     }
1585     else {
1586         /* Try our second-level swatch cache, kept in a hash. */
1587         SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1588
1589         /* If not cached, generate it via utf8::SWASHGET */
1590         if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1591             dSP;
1592             /* We use utf8n_to_uvuni() as we want an index into
1593                Unicode tables, not a native character number.
1594              */
1595             UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
1596             SV *errsv_save;
1597             ENTER;
1598             SAVETMPS;
1599             save_re_context();
1600             PUSHSTACKi(PERLSI_MAGIC);
1601             PUSHMARK(SP);
1602             EXTEND(SP,3);
1603             PUSHs((SV*)sv);
1604             /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1605             PUSHs(sv_2mortal(newSViv((klen) ?
1606                                      (code_point & ~(needents - 1)) : 0)));
1607             PUSHs(sv_2mortal(newSViv(needents)));
1608             PUTBACK;
1609             errsv_save = newSVsv(ERRSV);
1610             if (call_method("SWASHGET", G_SCALAR))
1611                 retval = newSVsv(*PL_stack_sp--);
1612             else
1613                 retval = &PL_sv_undef;
1614             if (!SvTRUE(ERRSV))
1615                 sv_setsv(ERRSV, errsv_save);
1616             SvREFCNT_dec(errsv_save);
1617             POPSTACK;
1618             FREETMPS;
1619             LEAVE;
1620             if (PL_curcop == &PL_compiling)
1621                 PL_curcop->op_private = PL_hints;
1622
1623             svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1624
1625             if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1626                 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1627         }
1628
1629         PL_last_swash_hv = hv;
1630         PL_last_swash_klen = klen;
1631         PL_last_swash_tmps = tmps;
1632         PL_last_swash_slen = slen;
1633         if (klen)
1634             Copy(ptr, PL_last_swash_key, klen, U8);
1635     }
1636
1637     switch ((int)((slen << 3) / needents)) {
1638     case 1:
1639         bit = 1 << (off & 7);
1640         off >>= 3;
1641         return (tmps[off] & bit) != 0;
1642     case 8:
1643         return tmps[off];
1644     case 16:
1645         off <<= 1;
1646         return (tmps[off] << 8) + tmps[off + 1] ;
1647     case 32:
1648         off <<= 2;
1649         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1650     }
1651     Perl_croak(aTHX_ "panic: swash_fetch");
1652     return 0;
1653 }
1654
1655
1656 /*
1657 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1658
Adds the UTF8 representation of the Native codepoint C<uv> to the end
of the string C<d>; C<d> should have at least C<UTF8_MAXLEN+1> free
1661 bytes available. The return value is the pointer to the byte after the
1662 end of the new character. In other words,
1663
1664     d = uvchr_to_utf8(d, uv);
1665
1666 is the recommended wide native character-aware way of saying
1667
1668     *(d++) = uv;
1669
1670 =cut
1671 */
1672
1673 /* On ASCII machines this is normally a macro but we want a
1674    real function in case XS code wants it
1675 */
1676 #undef Perl_uvchr_to_utf8
1677 U8 *
1678 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1679 {
1680     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
1681 }
1682
1683 U8 *
1684 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1685 {
1686     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1687 }
1688
1689 /*
1690 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1691
1692 Returns the native character value of the first character in the string C<s>
1693 which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1694 length, in bytes, of that character.
1695
1696 Allows length and flags to be passed to low level routine.
1697
1698 =cut
1699 */
1700 /* On ASCII machines this is normally a macro but we want
1701    a real function in case XS code wants it
1702 */
1703 #undef Perl_utf8n_to_uvchr
1704 UV
1705 Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1706 {
1707     UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1708     return UNI_TO_NATIVE(uv);
1709 }
1710
1711 /*
1712 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1713
1714 Build to the scalar dsv a displayable version of the string spv,
1715 length len, the displayable version being at most pvlim bytes long
1716 (if longer, the rest is truncated and "..." will be appended).
1717
1718 The flags argument can have UNI_DISPLAY_ISPRINT set to display
1719 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
1720 to display the \\[nrfta\\] as the backslashed versions (like '\n')
1721 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1722 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1723 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1724
1725 The pointer to the PV of the dsv is returned.
1726
1727 =cut */
1728 char *
1729 Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1730 {
1731     int truncated = 0;
1732     char *s, *e;
1733
1734     sv_setpvn(dsv, "", 0);
1735     for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1736          UV u;
1737          bool ok = FALSE;
1738
1739          if (pvlim && SvCUR(dsv) >= pvlim) {
1740               truncated++;
1741               break;
1742          }
1743          u = utf8_to_uvchr((U8*)s, 0);
1744          if (u < 256) {
1745              if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1746                  switch (u & 0xFF) {
1747                  case '\n':
1748                      Perl_sv_catpvf(aTHX_ dsv, "\\n"); ok = TRUE; break;
1749                  case '\r':
1750                      Perl_sv_catpvf(aTHX_ dsv, "\\r"); ok = TRUE; break;
1751                  case '\t':
1752                      Perl_sv_catpvf(aTHX_ dsv, "\\t"); ok = TRUE; break;
1753                  case '\f':
1754                      Perl_sv_catpvf(aTHX_ dsv, "\\f"); ok = TRUE; break;
1755                  case '\a':
1756                      Perl_sv_catpvf(aTHX_ dsv, "\\a"); ok = TRUE; break;
1757                  case '\\':
1758                      Perl_sv_catpvf(aTHX_ dsv, "\\" ); ok = TRUE; break;
1759                  default: break;
1760                  }
1761              }
1762              /* isPRINT() is the locale-blind version. */
1763              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(u & 0xFF)) {
1764                  Perl_sv_catpvf(aTHX_ dsv, "%c", (char)(u & 0xFF));
1765                  ok = TRUE;
1766              }
1767          }
1768          if (!ok)
1769              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1770     }
1771     if (truncated)
1772          sv_catpvn(dsv, "...", 3);
1773
1774     return SvPVX(dsv);
1775 }
1776
1777 /*
1778 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1779
Build into the scalar dsv a displayable version of the scalar ssv,
1781 the displayable version being at most pvlim bytes long
1782 (if longer, the rest is truncated and "..." will be appended).
1783
1784 The flags argument is as in pv_uni_display().
1785
1786 The pointer to the PV of the dsv is returned.
1787
1788 =cut */
1789 char *
1790 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1791 {
1792      return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1793                                 pvlim, flags);
1794 }
1795
1796 /*
1797 =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
1798
1799 Return true if the strings s1 and s2 differ case-insensitively, false
1800 if not (if they are equal case-insensitively).  If u1 is true, the
1801 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
1802 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
1803 are false, the respective string is assumed to be in native 8-bit
1804 encoding.
1805
1806 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1807 in there (they will point at the beginning of the I<next> character).
1808 If the pointers behind pe1 or pe2 are non-NULL, they are the end
1809 pointers beyond which scanning will not continue under any
circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
s2+l2 will be used as goal end pointers that will also stop the scan,
and which qualify towards defining a successful match: all the scans
that define an explicit length must reach their goal pointers for
a match to succeed.
1815
1816 For case-insensitiveness, the "casefolding" of Unicode is used
1817 instead of upper/lowercasing both the characters, see
1818 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1819
1820 =cut */
1821 I32
1822 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
1823 {
1824      register U8 *p1  = (U8*)s1;
1825      register U8 *p2  = (U8*)s2;
1826      register U8 *e1 = 0, *f1 = 0, *q1 = 0;
1827      register U8 *e2 = 0, *f2 = 0, *q2 = 0;
1828      STRLEN n1 = 0, n2 = 0;
1829      U8 foldbuf1[UTF8_MAXLEN_FOLD+1];
1830      U8 foldbuf2[UTF8_MAXLEN_FOLD+1];
1831      U8 natbuf[1+1];
1832      STRLEN foldlen1, foldlen2;
1833      bool match;
1834
1835      if (pe1)
1836           e1 = *(U8**)pe1;
1837      if (e1 == 0 || (l1 && l1 < e1 - (U8*)s1))
1838           f1 = (U8*)s1 + l1;
1839      if (pe2)
1840           e2 = *(U8**)pe2;
1841      if (e2 == 0 || (l2 && l2 < e2 - (U8*)s2))
1842           f2 = (U8*)s2 + l2;
1843
1844      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1845           return 1; /* mismatch; possible infinite loop or false positive */
1846
1847      if (!u1 || !u2)
1848           natbuf[1] = 0; /* Need to terminate the buffer. */
1849
1850      while ((e1 == 0 || p1 < e1) &&
1851             (f1 == 0 || p1 < f1) &&
1852             (e2 == 0 || p2 < e2) &&
1853             (f2 == 0 || p2 < f2)) {
1854           if (n1 == 0) {
1855                if (u1)
1856                     to_utf8_fold(p1, foldbuf1, &foldlen1);
1857                else {
1858                     natbuf[0] = *p1;
1859                     to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1860                }
1861                q1 = foldbuf1;
1862                n1 = foldlen1;
1863           }
1864           if (n2 == 0) {
1865                if (u2)
1866                     to_utf8_fold(p2, foldbuf2, &foldlen2);
1867                else {
1868                     natbuf[0] = *p2;
1869                     to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1870                }
1871                q2 = foldbuf2;
1872                n2 = foldlen2;
1873           }
1874           while (n1 && n2) {
1875                if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
1876                    (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
1877                     memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
1878                    return 1; /* mismatch */
1879                n1 -= UTF8SKIP(q1);
1880                q1 += UTF8SKIP(q1);
1881                n2 -= UTF8SKIP(q2);
1882                q2 += UTF8SKIP(q2);
1883           }
1884           if (n1 == 0)
1885                p1 += u1 ? UTF8SKIP(p1) : 1;
1886           if (n2 == 0)
1887                p2 += u2 ? UTF8SKIP(p2) : 1;
1888
1889      }
1890
1891      /* A match is defined by all the scans that specified
1892       * an explicit length reaching their final goals. */
1893      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
1894
1895      if (match) {
1896           if (pe1)
1897                *pe1 = (char*)p1;
1898           if (pe2)
1899                *pe2 = (char*)p2;
1900      }
1901
1902      return match ? 0 : 1; /* 0 match, 1 mismatch */
1903 }
1904