utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
   4  *    by Larry Wall and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  * 'What a fix!' said Sam.  'That's the one place in all the lands we've ever
  13  *  heard of that we don't want to see any closer; and that's the one place
  14  *  we're trying to get to!  And that's just where we can't get, nohow.'
  15  *
  16  *     [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
  17  *
  18  * 'Well do I understand your speech,' he answered in the same language;
  19  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  20  *  as is the custom in the West, if you wish to be answered?'
  21  *                           --Gandalf, addressing Théoden's door wardens
  22  *
  23  *     [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
  24  *
  25  * ...the travellers perceived that the floor was paved with stones of many
  26  * hues; branching runes and strange devices intertwined beneath their feet.
  27  *
  28  *     [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
  29  */
  30
  31 #include "EXTERN.h"
  32 #define PERL_IN_UTF8_C
  33 #include "perl.h"
  34
  35 #ifndef EBCDIC
  36 /* Separate prototypes needed because in ASCII systems these are
  37  * usually macros but they still are compiled as code, too. */
  38 PERL_CALLCONV UV        Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
  39 PERL_CALLCONV U8*       Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
  40 #endif
  41
  42 static const char unees[] =
  43     "Malformed UTF-8 character (unexpected end of string)";
  44
  45 /*
  46 =head1 Unicode Support
  47
  48 This file contains various utility functions for manipulating UTF8-encoded
  49 strings. For the uninitiated, this is a method of representing arbitrary
  50 Unicode characters as a variable number of bytes, in such a way that
  51 characters in the ASCII range are unmodified, and a zero byte never appears
  52 within non-zero characters.
  53
  54 =cut
  55 */
  56
  57 /*
  58 =for apidoc is_ascii_string
  59
  60 Returns true if the first C<len> bytes of the string C<s> are the same whether
  61 or not the string is encoded in UTF-8 (or UTF-EBCDIC on EBCDIC machines).  That
  62 is, if they are invariant.  On ASCII-ish machines, only ASCII characters
  63 fit this definition, hence the function's name.
  64
  65 If C<len> is 0, it will be calculated using C<strlen(s)>.
  66
  67 See also L</is_utf8_string>(), L</is_utf8_string_loclen>(), and L</is_utf8_string_loc>().
  68
  69 =cut
  70 */
  71
  72 bool
  73 Perl_is_ascii_string(const U8 *s, STRLEN len)
  74 {
  75     const U8* const send = s + (len ? len : strlen((const char *)s));
  76     const U8* x = s;
  77
  78     PERL_ARGS_ASSERT_IS_ASCII_STRING;
  79
  80     for (; x < send; ++x) {
  81         if (!UTF8_IS_INVARIANT(*x))
  82             break;
  83     }
  84
  85     return x == send;
  86 }
  87
  88 /*
  89 =for apidoc uvuni_to_utf8_flags
  90
  91 Adds the UTF-8 representation of the code point C<uv> to the end
  92 of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
  93 bytes available. The return value is the pointer to the byte after the
  94 end of the new character. In other words,
  95
  96     d = uvuni_to_utf8_flags(d, uv, flags);
  97
  98 or, in most cases,
  99
 100     d = uvuni_to_utf8(d, uv);
 101
 102 (which is equivalent to)
 103
 104     d = uvuni_to_utf8_flags(d, uv, 0);
 105
 106 This is the recommended Unicode-aware way of saying
 107
 108     *(d++) = uv;
 109
 110 This function will convert to UTF-8 (and not warn) even code points that aren't
 111 legal Unicode or are problematic, unless C<flags> contains one or more of the
 112 following flags:
 113
 114 If C<uv> is a Unicode surrogate code point and UNICODE_WARN_SURROGATE is set,
 115 the function will raise a warning, provided UTF8 warnings are enabled.  If instead
 116 UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL.
 117 If both flags are set, the function will both warn and return NULL.
 118
 119 The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags correspondingly
 120 affect how the function handles a Unicode non-character.  And, likewise for the
 121 UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags, and code points that are
 122 above the Unicode maximum of 0x10FFFF.  Code points above 0x7FFF_FFFF (which are
 123 even less portable) can be warned and/or disallowed even if other above-Unicode
 124 code points are accepted by the UNICODE_WARN_FE_FF and UNICODE_DISALLOW_FE_FF
 125 flags.
 126
 127 And finally, the flag UNICODE_WARN_ILLEGAL_INTERCHANGE selects all four of the
 128 above WARN flags; and UNICODE_DISALLOW_ILLEGAL_INTERCHANGE selects all four
 129 DISALLOW flags.
 130
 131
 132 =cut
 133 */
 134
 135 U8 *
 136 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
 137 {
 138     PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
 139
 140     if (ckWARN4_d(WARN_UTF8, WARN_SURROGATE, WARN_NON_UNICODE, WARN_NONCHAR)) {
 141         if (UNICODE_IS_SURROGATE(uv)) {
 142             if (flags & UNICODE_WARN_SURROGATE) {
 143                 Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE),
 144                                             "UTF-16 surrogate U+%04"UVXf, uv);
 145             }
 146             if (flags & UNICODE_DISALLOW_SURROGATE) {
 147                 return NULL;
 148             }
 149         }
 150         else if (UNICODE_IS_SUPER(uv)) {
 151             if (flags & UNICODE_WARN_SUPER
 152                 || (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_WARN_FE_FF)))
 153             {
 154                 Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
 155                           "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
 156             }
 157             if (flags & UNICODE_DISALLOW_SUPER
 158                 || (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_DISALLOW_FE_FF)))
 159             {
 160                 return NULL;
 161             }
 162         }
 163         else if (UNICODE_IS_NONCHAR(uv)) {
 164             if (flags & UNICODE_WARN_NONCHAR) {
 165                 Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR),
 166                  "Unicode non-character U+%04"UVXf" is illegal for open interchange",
 167                  uv);
 168             }
 169             if (flags & UNICODE_DISALLOW_NONCHAR) {
 170                 return NULL;
 171             }
 172         }
 173     }
 174     if (UNI_IS_INVARIANT(uv)) {
 175         *d++ = (U8)UTF_TO_NATIVE(uv);
 176         return d;
 177     }
 178 #if defined(EBCDIC)
 179     else {
 180         STRLEN len  = UNISKIP(uv);
 181         U8 *p = d+len-1;
 182         while (p > d) {
 183             *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
 184             uv >>= UTF_ACCUMULATION_SHIFT;
 185         }
 186         *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
 187         return d+len;
 188     }
 189 #else /* Non loop style */
 190     if (uv < 0x800) {
 191         *d++ = (U8)(( uv >>  6)         | 0xc0);
 192         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 193         return d;
 194     }
 195     if (uv < 0x10000) {
 196         *d++ = (U8)(( uv >> 12)         | 0xe0);
 197         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 198         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 199         return d;
 200     }
 201     if (uv < 0x200000) {
 202         *d++ = (U8)(( uv >> 18)         | 0xf0);
 203         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 204         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 205         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 206         return d;
 207     }
 208     if (uv < 0x4000000) {
 209         *d++ = (U8)(( uv >> 24)         | 0xf8);
 210         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 211         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 212         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 213         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 214         return d;
 215     }
 216     if (uv < 0x80000000) {
 217         *d++ = (U8)(( uv >> 30)         | 0xfc);
 218         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 219         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 220         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 221         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 222         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 223         return d;
 224     }
 225 #ifdef HAS_QUAD
 226     if (uv < UTF8_QUAD_MAX)
 227 #endif
 228     {
 229         *d++ =                            0xfe; /* Can't match U+FEFF! */
 230         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 231         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 232         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 233         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 234         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 235         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 236         return d;
 237     }
 238 #ifdef HAS_QUAD
 239     {
 240         *d++ =                            0xff;         /* Can't match U+FFFE! */
 241         *d++ =                            0x80;         /* 6 Reserved bits */
 242         *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);        /* 2 Reserved bits */
 243         *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
 244         *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
 245         *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
 246         *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
 247         *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
 248         *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
 249         *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
 250         *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
 251         *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
 252         *d++ = (U8)(( uv        & 0x3f) | 0x80);
 253         return d;
 254     }
 255 #endif
 256 #endif /* Loop style */
 257 }
 258
 259 /*
 260
 261 Tests if the first C<len> bytes of string C<s> form a valid UTF-8
 262 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
 263 UTF-8 character.  The number of bytes in the UTF-8 character
 264 will be returned if it is valid, otherwise 0.
 265
 266 This is the "slow" version as opposed to the "fast" version which is
 267 the "unrolled" IS_UTF8_CHAR().  E.g. for t/uni/class.t the speed
 268 difference is a factor of 2 to 3.  For lengths (UTF8SKIP(s)) of four
 269 or less you should use the IS_UTF8_CHAR(), for lengths of five or more
 270 you should use the _slow().  In practice this means that the _slow()
 271 will be used very rarely, since the maximum Unicode code point (as of
 272 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes.  Only
 273 the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
 274 five bytes or more.
 275
 276 =cut */
 277 STATIC STRLEN
 278 S_is_utf8_char_slow(const U8 *s, const STRLEN len)
 279 {
 280     dTHX;   /* The function called below requires thread context */
 281
 282     STRLEN actual_len;
 283
 284     PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
 285
 286     utf8n_to_uvuni(s, len, &actual_len, UTF8_CHECK_ONLY);
 287
 288     return (actual_len == (STRLEN) -1) ? 0 : actual_len;
 289 }
 290
 291 /*
 292 =for apidoc is_utf8_char_buf
 293
 294 Returns the number of bytes that comprise the first UTF-8 encoded character in
 295 buffer C<buf>.  C<buf_end> should point to one position beyond the end of the
 296 buffer.  0 is returned if C<buf> does not point to a complete, valid UTF-8
 297 encoded character.
 298
 299 Note that an INVARIANT character (i.e. ASCII on non-EBCDIC
 300 machines) is a valid UTF-8 character.
 301
 302 =cut */
 303
 304 STRLEN
 305 Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end)
 306 {
 307
 308     STRLEN len;
 309
 310     PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF;
 311
 312     if (buf_end <= buf) {
 313         return 0;
 314     }
 315
 316     len = buf_end - buf;
 317     if (len > UTF8SKIP(buf)) {
 318         len = UTF8SKIP(buf);
 319     }
 320
 321 #ifdef IS_UTF8_CHAR
 322     if (IS_UTF8_CHAR_FAST(len))
 323         return IS_UTF8_CHAR(buf, len) ? len : 0;
 324 #endif /* #ifdef IS_UTF8_CHAR */
 325     return is_utf8_char_slow(buf, len);
 326 }
 327
 328 /*
 329 =for apidoc is_utf8_char
 330
 331 DEPRECATED!
 332
 333 Tests if some arbitrary number of bytes begins in a valid UTF-8
 334 character.  Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
 335 character is a valid UTF-8 character.  The actual number of bytes in the UTF-8
 336 character will be returned if it is valid, otherwise 0.
 337
 338 This function is deprecated due to the possibility that malformed input could
 339 cause reading beyond the end of the input buffer.  Use L</is_utf8_char_buf>
 340 instead.
 341
 342 =cut */
 343
 344 STRLEN
 345 Perl_is_utf8_char(const U8 *s)
 346 {
 347     PERL_ARGS_ASSERT_IS_UTF8_CHAR;
 348
 349     /* Assumes we have enough space, which is why this is deprecated */
 350     return is_utf8_char_buf(s, s + UTF8SKIP(s));
 351 }
 352
 353
 354 /*
 355 =for apidoc is_utf8_string
 356
 357 Returns true if the first C<len> bytes of string C<s> form a valid
 358 UTF-8 string, false otherwise.  If C<len> is 0, it will be calculated
 359 using C<strlen(s)> (which means if you use this option, that C<s> has to have a
 360 terminating NUL byte).  Note that all characters being ASCII constitute 'a
 361 valid UTF-8 string'.
 362
 363 See also L</is_ascii_string>(), L</is_utf8_string_loclen>(), and L</is_utf8_string_loc>().
 364
 365 =cut
 366 */
 367
 368 bool
 369 Perl_is_utf8_string(const U8 *s, STRLEN len)
 370 {
 371     const U8* const send = s + (len ? len : strlen((const char *)s));
 372     const U8* x = s;
 373
 374     PERL_ARGS_ASSERT_IS_UTF8_STRING;
 375
 376     while (x < send) {
 377          /* Inline the easy bits of is_utf8_char() here for speed... */
 378          if (UTF8_IS_INVARIANT(*x)) {
 379             x++;
 380          }
 381          else if (!UTF8_IS_START(*x))
 382              return FALSE;
 383          else {
 384               /* ... and call is_utf8_char() only if really needed. */
 385              const STRLEN c = UTF8SKIP(x);
 386              const U8* const next_char_ptr = x + c;
 387
 388              if (next_char_ptr > send) {
 389                  return FALSE;
 390              }
 391
 392              if (IS_UTF8_CHAR_FAST(c)) {
 393                  if (!IS_UTF8_CHAR(x, c))
 394                      return FALSE;
 395              }
 396              else if (! is_utf8_char_slow(x, c)) {
 397                  return FALSE;
 398              }
 399              x = next_char_ptr;
 400          }
 401     }
 402
 403     return TRUE;
 404 }
 405
 406 /*
 407 Implemented as a macro in utf8.h
 408
 409 =for apidoc is_utf8_string_loc
 410
 411 Like L</is_utf8_string> but stores the location of the failure (in the
 412 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
 413 "utf8ness success") in the C<ep>.
 414
 415 See also L</is_utf8_string_loclen>() and L</is_utf8_string>().
 416
 417 =for apidoc is_utf8_string_loclen
 418
 419 Like L</is_utf8_string>() but stores the location of the failure (in the
 420 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
 421 "utf8ness success") in the C<ep>, and the number of UTF-8
 422 encoded characters in the C<el>.
 423
 424 See also L</is_utf8_string_loc>() and L</is_utf8_string>().
 425
 426 =cut
 427 */
 428
 429 bool
 430 Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
 431 {
 432     const U8* const send = s + (len ? len : strlen((const char *)s));
 433     const U8* x = s;
 434     STRLEN c;
 435     STRLEN outlen = 0;
 436
 437     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
 438
 439     while (x < send) {
 440          const U8* next_char_ptr;
 441
 442          /* Inline the easy bits of is_utf8_char() here for speed... */
 443          if (UTF8_IS_INVARIANT(*x))
 444              next_char_ptr = x + 1;
 445          else if (!UTF8_IS_START(*x))
 446              goto out;
 447          else {
 448              /* ... and call is_utf8_char() only if really needed. */
 449              c = UTF8SKIP(x);
 450              next_char_ptr = c + x;
 451              if (next_char_ptr > send) {
 452                  goto out;
 453              }
 454              if (IS_UTF8_CHAR_FAST(c)) {
 455                  if (!IS_UTF8_CHAR(x, c))
 456                      c = 0;
 457              } else
 458                  c = is_utf8_char_slow(x, c);
 459              if (!c)
 460                  goto out;
 461          }
 462          x = next_char_ptr;
 463          outlen++;
 464     }
 465
 466  out:
 467     if (el)
 468         *el = outlen;
 469
 470     if (ep)
 471         *ep = x;
 472     return (x == send);
 473 }
 474
 475 /*
 476
 477 =for apidoc utf8n_to_uvuni
 478
 479 Bottom level UTF-8 decode routine.
 480 Returns the code point value of the first character in the string C<s>,
 481 which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
 482 C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
 483 the length, in bytes, of that character.
 484
 485 The value of C<flags> determines the behavior when C<s> does not point to a
 486 well-formed UTF-8 character.  If C<flags> is 0, when a malformation is found,
 487 zero is returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>) is the
 488 next possible position in C<s> that could begin a non-malformed character.
 489 Also, if UTF-8 warnings haven't been lexically disabled, a warning is raised.
 490
 491 Various ALLOW flags can be set in C<flags> to allow (and not warn on)
 492 individual types of malformations, such as the sequence being overlong (that
 493 is, when there is a shorter sequence that can express the same code point;
 494 overlong sequences are expressly forbidden in the UTF-8 standard due to
 495 potential security issues).  Another malformation example is the first byte of
 496 a character not being a legal first byte.  See F<utf8.h> for the list of such
 497 flags.  For allowed 0 length strings, this function returns 0; for allowed
 498 overlong sequences, the computed code point is returned; for all other allowed
 499 malformations, the Unicode REPLACEMENT CHARACTER is returned, as these have no
 500 determinable reasonable value.
 501
 502 The UTF8_CHECK_ONLY flag overrides the behavior when a non-allowed (by other
 503 flags) malformation is found.  If this flag is set, the routine assumes that
 504 the caller will raise a warning, and this function will silently just set
 505 C<retlen> to C<-1> and return zero.
 506
 507 Certain code points are considered problematic.  These are Unicode surrogates,
 508 Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
 509 By default these are considered regular code points, but certain situations
 510 warrant special handling for them.  If C<flags> contains
 511 UTF8_DISALLOW_ILLEGAL_INTERCHANGE, all three classes are treated as
 512 malformations and handled as such.  The flags UTF8_DISALLOW_SURROGATE,
 513 UTF8_DISALLOW_NONCHAR, and UTF8_DISALLOW_SUPER (meaning above the legal Unicode
 514 maximum) can be set to disallow these categories individually.
 515
 516 The flags UTF8_WARN_ILLEGAL_INTERCHANGE, UTF8_WARN_SURROGATE,
 517 UTF8_WARN_NONCHAR, and UTF8_WARN_SUPER will cause warning messages to be raised
 518 for their respective categories, but otherwise the code points are considered
 519 valid (not malformations).  To get a category to both be treated as a
 520 malformation and raise a warning, specify both the WARN and DISALLOW flags.
 521 (But note that warnings are not raised if lexically disabled nor if
 522 UTF8_CHECK_ONLY is also specified.)
 523
 524 Very large code points (above 0x7FFF_FFFF) are considered more problematic than
 525 the others that are above the Unicode legal maximum.  There are several
 526 reasons: they requre at least 32 bits to represent them on ASCII platforms, are
 527 not representable at all on EBCDIC platforms, and the original UTF-8
 528 specification never went above this number (the current 0x10FFFF limit was
 529 imposed later).  (The smaller ones, those that fit into 32 bits, are
 530 representable by a UV on ASCII platforms, but not by an IV, which means that
 531 the number of operations that can be performed on them is quite restricted.)
 532 The UTF-8 encoding on ASCII platforms for these large code points begins with a
 533 byte containing 0xFE or 0xFF.  The UTF8_DISALLOW_FE_FF flag will cause them to
 534 be treated as malformations, while allowing smaller above-Unicode code points.
 535 (Of course UTF8_DISALLOW_SUPER will treat all above-Unicode code points,
 536 including these, as malformations.) Similarly, UTF8_WARN_FE_FF acts just like
 537 the other WARN flags, but applies just to these code points.
 538
 539 All other code points corresponding to Unicode characters, including private
 540 use and those yet to be assigned, are never considered malformed and never
 541 warn.
 542
 543 Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
 544
 545 =cut
 546 */
 547
 548 UV
 549 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 550 {
 551     dVAR;
 552     const U8 * const s0 = s;
 553     U8 overflow_byte = '\0';    /* Save byte in case of overflow */
 554     U8 * send;
 555     UV uv = *s;
 556     STRLEN expectlen;
 557     SV* sv = NULL;
 558     UV outlier_ret = 0; /* return value when input is in error or problematic
 559                          */
 560     UV pack_warn = 0;   /* Save result of packWARN() for later */
 561     bool unexpected_non_continuation = FALSE;
 562     bool overflowed = FALSE;
 563     bool do_overlong_test = TRUE;   /* May have to skip this test */
 564
 565     const char* const malformed_text = "Malformed UTF-8 character";
 566
 567     PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
 568
 569     /* The order of malformation tests here is important.  We should consume as
 570      * few bytes as possible in order to not skip any valid character.  This is
 571      * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
 572      * http://unicode.org/reports/tr36 for more discussion as to why.  For
 573      * example, once we've done a UTF8SKIP, we can tell the expected number of
 574      * bytes, and could fail right off the bat if the input parameters indicate
 575      * that there are too few available.  But it could be that just that first
 576      * byte is garbled, and the intended character occupies fewer bytes.  If we
 577      * blindly assumed that the first byte is correct, and skipped based on
 578      * that number, we could skip over a valid input character.  So instead, we
 579      * always examine the sequence byte-by-byte.
 580      *
 581      * We also should not consume too few bytes, otherwise someone could inject
 582      * things.  For example, an input could be deliberately designed to
 583      * overflow, and if this code bailed out immediately upon discovering that,
 584      * returning to the caller *retlen pointing to the very next byte (one
 585      * which is actually part of of the overflowing sequence), that could look
 586      * legitimate to the caller, which could discard the initial partial
 587      * sequence and process the rest, inappropriately */
 588
 589     /* Zero length strings, if allowed, of necessity are zero */
 590     if (curlen == 0) {
 591         if (retlen) {
 592             *retlen = 0;
 593         }
 594
 595         if (flags & UTF8_ALLOW_EMPTY) {
 596             return 0;
 597         }
 598         if (! (flags & UTF8_CHECK_ONLY)) {
 599             sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (empty string)", malformed_text));
 600         }
 601         goto malformed;
 602     }
 603
 604     expectlen = UTF8SKIP(s);
 605
 606     /* A well-formed UTF-8 character, as the vast majority of calls to this
 607      * function will be for, has this expected length.  For efficiency, set
 608      * things up here to return it.  It will be overriden only in those rare
 609      * cases where a malformation is found */
 610     if (retlen) {
 611         *retlen = expectlen;
 612     }
 613
 614     /* An invariant is trivially well-formed */
 615     if (UTF8_IS_INVARIANT(uv)) {
 616         return (UV) (NATIVE_TO_UTF(*s));
 617     }
 618
 619     /* A continuation character can't start a valid sequence */
 620     if (UTF8_IS_CONTINUATION(uv)) {
 621         if (flags & UTF8_ALLOW_CONTINUATION) {
 622             if (retlen) {
 623                 *retlen = 1;
 624             }
 625             return UNICODE_REPLACEMENT;
 626         }
 627
 628         if (! (flags & UTF8_CHECK_ONLY)) {
 629             sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected continuation byte 0x%02x, with no preceding start byte)", malformed_text, *s0));
 630         }
 631         curlen = 1;
 632         goto malformed;
 633     }
 634
 635 #ifdef EBCDIC
 636     uv = NATIVE_TO_UTF(uv);
 637 #endif
 638
 639     /* Here is not a continuation byte, nor an invariant.  The only thing left
 640      * is a start byte (possibly for an overlong) */
 641
 642     /* Remove the leading bits that indicate the number of bytes in the
 643      * character's whole UTF-8 sequence, leaving just the bits that are part of
 644      * the value */
 645     uv &= UTF_START_MASK(expectlen);
 646
 647     /* Now, loop through the remaining bytes in the character's sequence,
 648      * accumulating each into the working value as we go.  Be sure to not look
 649      * past the end of the input string */
 650     send =  (U8*) s0 + ((expectlen <= curlen) ? expectlen : curlen);
 651
 652     for (s = s0 + 1; s < send; s++) {
 653         if (UTF8_IS_CONTINUATION(*s)) {
 654 #ifndef EBCDIC  /* Can't overflow in EBCDIC */
 655             if (uv & UTF_ACCUMULATION_OVERFLOW_MASK) {
 656
 657                 /* The original implementors viewed this malformation as more
 658                  * serious than the others (though I, khw, don't understand
 659                  * why, since other malformations also give very very wrong
 660                  * results), so there is no way to turn off checking for it.
 661                  * Set a flag, but keep going in the loop, so that we absorb
 662                  * the rest of the bytes that comprise the character. */
 663                 overflowed = TRUE;
 664                 overflow_byte = *s; /* Save for warning message's use */
 665             }
 666 #endif
 667             uv = UTF8_ACCUMULATE(uv, *s);
 668         }
 669         else {
 670             /* Here, found a non-continuation before processing all expected
 671              * bytes.  This byte begins a new character, so quit, even if
 672              * allowing this malformation. */
 673             unexpected_non_continuation = TRUE;
 674             break;
 675         }
 676     } /* End of loop through the character's bytes */
 677
 678     /* Save how many bytes were actually in the character */
 679     curlen = s - s0;
 680
 681     /* The loop above finds two types of malformations: non-continuation and/or
 682      * overflow.  The non-continuation malformation is really a too-short
 683      * malformation, as it means that the current character ended before it was
 684      * expected to (being terminated prematurely by the beginning of the next
 685      * character, whereas in the too-short malformation there just are too few
 686      * bytes available to hold the character.  In both cases, the check below
 687      * that we have found the expected number of bytes would fail if executed.)
 688      * Thus the non-continuation malformation is really unnecessary, being a
 689      * subset of the too-short malformation.  But there may be existing
 690      * applications that are expecting the non-continuation type, so we retain
 691      * it, and return it in preference to the too-short malformation.  (If this
 692      * code were being written from scratch, the two types might be collapsed
 693      * into one.)  I, khw, am also giving priority to returning the
 694      * non-continuation and too-short malformations over overflow when multiple
 695      * ones are present.  I don't know of any real reason to prefer one over
 696      * the other, except that it seems to me that multiple-byte errors trumps
 697      * errors from a single byte */
 698     if (unexpected_non_continuation) {
 699         if (!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
 700             if (! (flags & UTF8_CHECK_ONLY)) {
 701                 if (curlen == 1) {
 702                     sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, immediately after start byte 0x%02x)", malformed_text, *s, *s0));
 703                 }
 704                 else {
 705                     sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (unexpected non-continuation byte 0x%02x, %d bytes after start byte 0x%02x, expected %d bytes)", malformed_text, *s, (int) curlen, *s0, (int)expectlen));
 706                 }
 707             }
 708             goto malformed;
 709         }
 710         uv = UNICODE_REPLACEMENT;
 711
 712         /* Skip testing for overlongs, as the REPLACEMENT may not be the same
 713          * as what the original expectations were. */
 714         do_overlong_test = FALSE;
 715         if (retlen) {
 716             *retlen = curlen;
 717         }
 718     }
 719     else if (curlen < expectlen) {
 720         if (! (flags & UTF8_ALLOW_SHORT)) {
 721             if (! (flags & UTF8_CHECK_ONLY)) {
 722                 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, *s0));
 723             }
 724             goto malformed;
 725         }
 726         uv = UNICODE_REPLACEMENT;
 727         do_overlong_test = FALSE;
 728         if (retlen) {
 729             *retlen = curlen;
 730         }
 731     }
 732
 733 #ifndef EBCDIC  /* EBCDIC allows FE, FF, can't overflow */
 734     if ((*s0 & 0xFE) == 0xFE    /* matches both FE, FF */
 735         && (flags & (UTF8_WARN_FE_FF|UTF8_DISALLOW_FE_FF)))
 736     {
 737         /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
 738          * generation of the sv, since no warnings are raised under CHECK */
 739         if ((flags & (UTF8_WARN_FE_FF|UTF8_CHECK_ONLY)) == UTF8_WARN_FE_FF
 740             && ckWARN_d(WARN_UTF8))
 741         {
 742             sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s Code point beginning with byte 0x%02X is not Unicode, and not portable", malformed_text, *s0));
 743             pack_warn = packWARN(WARN_UTF8);
 744         }
 745         if (flags & UTF8_DISALLOW_FE_FF) {
 746             goto malformed;
 747         }
 748     }
 749     if (overflowed) {
 750
 751         /* If the first byte is FF, it will overflow a 32-bit word.  If the
 752          * first byte is FE, it will overflow a signed 32-bit word.  The
 753          * above preserves backward compatibility, since its message was used
 754          * in earlier versions of this code in preference to overflow */
 755         sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (overflow at byte 0x%02x, after start byte 0x%02x)", malformed_text, overflow_byte, *s0));
 756         goto malformed;
 757     }
 758 #endif
 759
 760     if (do_overlong_test
 761         && expectlen > (STRLEN)UNISKIP(uv)
 762         && ! (flags & UTF8_ALLOW_LONG))
 763     {
 764         /* The overlong malformation has lower precedence than the others.
 765          * Note that if this malformation is allowed, we return the actual
 766          * value, instead of the replacement character.  This is because this
 767          * value is actually well-defined. */
 768         if (! (flags & UTF8_CHECK_ONLY)) {
 769             sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (%d byte%s, need %d, after start byte 0x%02x)", malformed_text, (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), *s0));
 770         }
 771         goto malformed;
 772     }
 773
 774     /* Here, the input is considered to be well-formed , but could be a
 775      * problematic code point that is not allowed by the input parameters. */
 776     if (uv >= UNICODE_SURROGATE_FIRST /* isn't problematic if < this */
 777         && (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE
 778                      |UTF8_WARN_ILLEGAL_INTERCHANGE)))
 779     {
 780         if (UNICODE_IS_SURROGATE(uv)) {
 781             if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE
 782                 && ckWARN2_d(WARN_UTF8, WARN_SURROGATE))
 783             {
 784                 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv));
 785                 pack_warn = packWARN2(WARN_UTF8, WARN_SURROGATE);
 786             }
 787             if (flags & UTF8_DISALLOW_SURROGATE) {
 788                 goto disallowed;
 789             }
 790         }
 791         else if (UNICODE_IS_NONCHAR(uv)) {
 792             if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR
 793                 && ckWARN2_d(WARN_UTF8, WARN_NONCHAR))
 794             {
 795                 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv));
 796                 pack_warn = packWARN2(WARN_UTF8, WARN_NONCHAR);
 797             }
 798             if (flags & UTF8_DISALLOW_NONCHAR) {
 799                 goto disallowed;
 800             }
 801         }
 802         else if ((uv > PERL_UNICODE_MAX)) {
 803             if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER
 804                 && ckWARN2_d(WARN_UTF8, WARN_NON_UNICODE))
 805             {
 806                 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv));
 807                 pack_warn = packWARN2(WARN_UTF8, WARN_NON_UNICODE);
 808             }
 809             if (flags & UTF8_DISALLOW_SUPER) {
 810                 goto disallowed;
 811             }
 812         }
 813
 814         if (sv) {
 815             outlier_ret = uv;
 816             goto do_warn;
 817         }
 818
 819         /* Here, this is not considered a malformed character, so drop through
 820          * to return it */
 821     }
 822
 823     return uv;
 824
 825     /* There are three cases which get to beyond this point.  In all 3 cases:
 826      * <sv>         if not null points to a string to print as a warning.
 827      * <curlen>     is what <*retlen> should be set to if UTF8_CHECK_ONLY isn't
 828      *              set.
 829      * <outlier_ret> is what return value to use if UTF8_CHECK_ONLY isn't set.
 830      *              This is done by initializing it to 0, and changing it only
 831      *              for case 1).
 832      * The 3 cases are:
 833      * 1)   The input is valid but problematic, and to be warned about.  The
 834      *      return value is the resultant code point; <*retlen> is set to
 835      *      <curlen>, the number of bytes that comprise the code point.
 836      *      <pack_warn> contains the result of packWARN() for the warning
 837      *      types.  The entry point for this case is the label <do_warn>;
 838      * 2)   The input is a valid code point but disallowed by the parameters to
 839      *      this function.  The return value is 0.  If UTF8_CHECK_ONLY is set,
 840      *      <*relen> is -1; otherwise it is <curlen>, the number of bytes that
 841      *      comprise the code point.  <pack_warn> contains the result of
 842      *      packWARN() for the warning types.  The entry point for this case is
 843      *      the label <disallowed>.
 844      * 3)   The input is malformed.  The return value is 0.  If UTF8_CHECK_ONLY
 845      *      is set, <*relen> is -1; otherwise it is <curlen>, the number of
 846      *      bytes that comprise the malformation.  All such malformations are
 847      *      assumed to be warning type <utf8>.  The entry point for this case
 848      *      is the label <malformed>.
 849      */
 850
 851 malformed:
 852
 853     if (sv && ckWARN_d(WARN_UTF8)) {
 854         pack_warn = packWARN(WARN_UTF8);
 855     }
 856
 857 disallowed:
 858
 859     if (flags & UTF8_CHECK_ONLY) {
 860         if (retlen)
 861             *retlen = ((STRLEN) -1);
 862         return 0;
 863     }
 864
 865 do_warn:
 866
 867     if (pack_warn) {    /* <pack_warn> was initialized to 0, and changed only
 868                            if warnings are to be raised. */
 869         const char * const string = SvPVX_const(sv);
 870
 871         if (PL_op)
 872             Perl_warner(aTHX_ pack_warn, "%s in %s", string,  OP_DESC(PL_op));
 873         else
 874             Perl_warner(aTHX_ pack_warn, "%s", string);
 875     }
 876
 877     if (retlen) {
 878         *retlen = curlen;
 879     }
 880
 881     return outlier_ret;
 882 }
 883
 884 /*
 885 =for apidoc utf8_to_uvchr_buf
 886
 887 Returns the native code point of the first character in the string C<s> which
 888 is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
 889 C<*retlen> will be set to the length, in bytes, of that character.
 890
 891 If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
 892 enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
 893 NULL) to -1.  If those warnings are off, the computed value if well-defined (or
 894 the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
 895 is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
 896 next possible position in C<s> that could begin a non-malformed character.
 897 See L</utf8n_to_uvuni> for details on when the REPLACEMENT CHARACTER is returned.
 898
 899 =cut
 900 */
 901
 902
 903 UV
 904 Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
 905 {
 906     PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
 907
 908     assert(s < send);
 909
 910     return utf8n_to_uvchr(s, send - s, retlen,
 911                           ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 912 }
 913
 914 /* Like L</utf8_to_uvchr_buf>(), but should only be called when it is known that
 915  * there are no malformations in the input UTF-8 string C<s>.  Currently, some
 916  * malformations are checked for, but this checking likely will be removed in
 917  * the future */
 918
 919 UV
 920 Perl_valid_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
 921 {
 922     PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
 923
 924     return utf8_to_uvchr_buf(s, s + UTF8_MAXBYTES, retlen);
 925 }
 926
 927 /*
 928 =for apidoc utf8_to_uvchr
 929
 930 DEPRECATED!
 931
 932 Returns the native code point of the first character in the string C<s>
 933 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
 934 length, in bytes, of that character.
 935
 936 Some, but not all, UTF-8 malformations are detected, and in fact, some
 937 malformed input could cause reading beyond the end of the input buffer, which
 938 is why this function is deprecated.  Use L</utf8_to_uvchr_buf> instead.
 939
 940 If C<s> points to one of the detected malformations, and UTF8 warnings are
 941 enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
 942 NULL) to -1.  If those warnings are off, the computed value if well-defined (or
 943 the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
 944 is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
 945 next possible position in C<s> that could begin a non-malformed character.
 946 See L</utf8n_to_uvuni> for details on when the REPLACEMENT CHARACTER is returned.
 947
 948 =cut
 949 */
 950
 951 UV
 952 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
 953 {
 954     PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
 955
 956     return valid_utf8_to_uvchr(s, retlen);
 957 }
 958
 959 /*
 960 =for apidoc utf8_to_uvuni_buf
 961
 962 Returns the Unicode code point of the first character in the string C<s> which
 963 is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
 964 C<retlen> will be set to the length, in bytes, of that character.
 965
 966 This function should only be used when the returned UV is considered
 967 an index into the Unicode semantic tables (e.g. swashes).
 968
 969 If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
 970 enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
 971 NULL) to -1.  If those warnings are off, the computed value if well-defined (or
 972 the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
 973 is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
 974 next possible position in C<s> that could begin a non-malformed character.
 975 See L</utf8n_to_uvuni> for details on when the REPLACEMENT CHARACTER is returned.
 976
 977 =cut
 978 */
 979
 980 UV
 981 Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
 982 {
 983     PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
 984
 985     assert(send > s);
 986
 987     /* Call the low level routine asking for checks */
 988     return Perl_utf8n_to_uvuni(aTHX_ s, send -s, retlen,
 989                                ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 990 }
 991
 992 /* Like L</utf8_to_uvuni_buf>(), but should only be called when it is known that
 993  * there are no malformations in the input UTF-8 string C<s>.  Currently, some
 994  * malformations are checked for, but this checking likely will be removed in
 995  * the future */
 996
 997 UV
 998 Perl_valid_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
 999 {
1000     PERL_ARGS_ASSERT_VALID_UTF8_TO_UVUNI;
1001
1002     return utf8_to_uvuni_buf(s, s + UTF8_MAXBYTES, retlen);
1003 }
1004
1005 /*
1006 =for apidoc utf8_to_uvuni
1007
1008 DEPRECATED!
1009
1010 Returns the Unicode code point of the first character in the string C<s>
1011 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
1012 length, in bytes, of that character.
1013
1014 This function should only be used when the returned UV is considered
1015 an index into the Unicode semantic tables (e.g. swashes).
1016
1017 Some, but not all, UTF-8 malformations are detected, and in fact, some
1018 malformed input could cause reading beyond the end of the input buffer, which
1019 is why this function is deprecated.  Use L</utf8_to_uvuni_buf> instead.
1020
1021 If C<s> points to one of the detected malformations, and UTF8 warnings are
1022 enabled, zero is returned and C<*retlen> is set (if C<retlen> doesn't point to
1023 NULL) to -1.  If those warnings are off, the computed value if well-defined (or
1024 the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
1025 is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
1026 next possible position in C<s> that could begin a non-malformed character.
1027 See L</utf8n_to_uvuni> for details on when the REPLACEMENT CHARACTER is returned.
1028
1029 =cut
1030 */
1031
1032 UV
1033 Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
1034 {
1035     PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
1036
1037     return valid_utf8_to_uvuni(s, retlen);
1038 }
1039
1040 /*
1041 =for apidoc utf8_length
1042
1043 Return the length of the UTF-8 char encoded string C<s> in characters.
1044 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
1045 up past C<e>, croaks.
1046
1047 =cut
1048 */
1049
1050 STRLEN
1051 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
1052 {
1053     dVAR;
1054     STRLEN len = 0;
1055
1056     PERL_ARGS_ASSERT_UTF8_LENGTH;
1057
1058     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
1059      * the bitops (especially ~) can create illegal UTF-8.
1060      * In other words: in Perl UTF-8 is not just for Unicode. */
1061
1062     if (e < s)
1063         goto warn_and_return;
1064     while (s < e) {
1065         if (!UTF8_IS_INVARIANT(*s))
1066             s += UTF8SKIP(s);
1067         else
1068             s++;
1069         len++;
1070     }
1071
1072     if (e != s) {
1073         len--;
1074         warn_and_return:
1075         if (PL_op)
1076             Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1077                              "%s in %s", unees, OP_DESC(PL_op));
1078         else
1079             Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
1080     }
1081
1082     return len;
1083 }
1084
1085 /*
1086 =for apidoc utf8_distance
1087
1088 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
1089 and C<b>.
1090
1091 WARNING: use only if you *know* that the pointers point inside the
1092 same UTF-8 buffer.
1093
1094 =cut
1095 */
1096
1097 IV
1098 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
1099 {
1100     PERL_ARGS_ASSERT_UTF8_DISTANCE;
1101
1102     return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
1103 }
1104
1105 /*
1106 =for apidoc utf8_hop
1107
1108 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
1109 forward or backward.
1110
1111 WARNING: do not use the following unless you *know* C<off> is within
1112 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
1113 on the first byte of character or just after the last byte of a character.
1114
1115 =cut
1116 */
1117
1118 U8 *
1119 Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
1120 {
1121     PERL_ARGS_ASSERT_UTF8_HOP;
1122
1123     PERL_UNUSED_CONTEXT;
1124     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1125      * the bitops (especially ~) can create illegal UTF-8.
1126      * In other words: in Perl UTF-8 is not just for Unicode. */
1127
1128     if (off >= 0) {
1129         while (off--)
1130             s += UTF8SKIP(s);
1131     }
1132     else {
1133         while (off++) {
1134             s--;
1135             while (UTF8_IS_CONTINUATION(*s))
1136                 s--;
1137         }
1138     }
1139     return (U8 *)s;
1140 }
1141
1142 /*
1143 =for apidoc bytes_cmp_utf8
1144
1145 Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
1146 sequence of characters (stored as UTF-8) in C<u>, C<ulen>. Returns 0 if they are
1147 equal, -1 or -2 if the first string is less than the second string, +1 or +2
1148 if the first string is greater than the second string.
1149
1150 -1 or +1 is returned if the shorter string was identical to the start of the
1151 longer string. -2 or +2 is returned if the was a difference between characters
1152 within the strings.
1153
1154 =cut
1155 */
1156
1157 int
1158 Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
1159 {
1160     const U8 *const bend = b + blen;
1161     const U8 *const uend = u + ulen;
1162
1163     PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
1164
1165     PERL_UNUSED_CONTEXT;
1166
1167     while (b < bend && u < uend) {
1168         U8 c = *u++;
1169         if (!UTF8_IS_INVARIANT(c)) {
1170             if (UTF8_IS_DOWNGRADEABLE_START(c)) {
1171                 if (u < uend) {
1172                     U8 c1 = *u++;
1173                     if (UTF8_IS_CONTINUATION(c1)) {
1174                         c = UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(c, c1));
1175                     } else {
1176                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1177                                          "Malformed UTF-8 character "
1178                                          "(unexpected non-continuation byte 0x%02x"
1179                                          ", immediately after start byte 0x%02x)"
1180                                          /* Dear diag.t, it's in the pod.  */
1181                                          "%s%s", c1, c,
1182                                          PL_op ? " in " : "",
1183                                          PL_op ? OP_DESC(PL_op) : "");
1184                         return -2;
1185                     }
1186                 } else {
1187                     if (PL_op)
1188                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1189                                          "%s in %s", unees, OP_DESC(PL_op));
1190                     else
1191                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
1192                     return -2; /* Really want to return undef :-)  */
1193                 }
1194             } else {
1195                 return -2;
1196             }
1197         }
1198         if (*b != c) {
1199             return *b < c ? -2 : +2;
1200         }
1201         ++b;
1202     }
1203
1204     if (b == bend && u == uend)
1205         return 0;
1206
1207     return b < bend ? +1 : -1;
1208 }
1209
1210 /*
1211 =for apidoc utf8_to_bytes
1212
1213 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
1214 Unlike L</bytes_to_utf8>, this over-writes the original string, and
1215 updates C<len> to contain the new length.
1216 Returns zero on failure, setting C<len> to -1.
1217
1218 If you need a copy of the string, see L</bytes_from_utf8>.
1219
1220 =cut
1221 */
1222
1223 U8 *
1224 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
1225 {
1226     U8 * const save = s;
1227     U8 * const send = s + *len;
1228     U8 *d;
1229
1230     PERL_ARGS_ASSERT_UTF8_TO_BYTES;
1231
1232     /* ensure valid UTF-8 and chars < 256 before updating string */
1233     while (s < send) {
1234         U8 c = *s++;
1235
1236         if (!UTF8_IS_INVARIANT(c) &&
1237             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
1238              || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
1239             *len = ((STRLEN) -1);
1240             return 0;
1241         }
1242     }
1243
1244     d = s = save;
1245     while (s < send) {
1246         STRLEN ulen;
1247         *d++ = (U8)utf8_to_uvchr_buf(s, send, &ulen);
1248         s += ulen;
1249     }
1250     *d = '\0';
1251     *len = d - save;
1252     return save;
1253 }
1254
1255 /*
1256 =for apidoc bytes_from_utf8
1257
1258 Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
1259 Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, returns a pointer to
1260 the newly-created string, and updates C<len> to contain the new
1261 length.  Returns the original string if no conversion occurs, C<len>
1262 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
1263 0 if C<s> is converted or consisted entirely of characters that are invariant
1264 in utf8 (i.e., US-ASCII on non-EBCDIC machines).
1265
1266 =cut
1267 */
1268
1269 U8 *
1270 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
1271 {
1272     U8 *d;
1273     const U8 *start = s;
1274     const U8 *send;
1275     I32 count = 0;
1276
1277     PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
1278
1279     PERL_UNUSED_CONTEXT;
1280     if (!*is_utf8)
1281         return (U8 *)start;
1282
1283     /* ensure valid UTF-8 and chars < 256 before converting string */
1284     for (send = s + *len; s < send;) {
1285         U8 c = *s++;
1286         if (!UTF8_IS_INVARIANT(c)) {
1287             if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
1288                 (c = *s++) && UTF8_IS_CONTINUATION(c))
1289                 count++;
1290             else
1291                 return (U8 *)start;
1292         }
1293     }
1294
1295     *is_utf8 = FALSE;
1296
1297     Newx(d, (*len) - count + 1, U8);
1298     s = start; start = d;
1299     while (s < send) {
1300         U8 c = *s++;
1301         if (!UTF8_IS_INVARIANT(c)) {
1302             /* Then it is two-byte encoded */
1303             c = UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(c, *s++));
1304         }
1305         *d++ = c;
1306     }
1307     *d = '\0';
1308     *len = d - start;
1309     return (U8 *)start;
1310 }
1311
1312 /*
1313 =for apidoc bytes_to_utf8
1314
1315 Converts a string C<s> of length C<len> bytes from the native encoding into
1316 UTF-8.
1317 Returns a pointer to the newly-created string, and sets C<len> to
1318 reflect the new length in bytes.
1319
1320 A NUL character will be written after the end of the string.
1321
1322 If you want to convert to UTF-8 from encodings other than
1323 the native (Latin1 or EBCDIC),
1324 see L</sv_recode_to_utf8>().
1325
1326 =cut
1327 */
1328
1329 /* This logic is duplicated in sv_catpvn_flags, so any bug fixes will
1330    likewise need duplication. */
1331
1332 U8*
1333 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
1334 {
1335     const U8 * const send = s + (*len);
1336     U8 *d;
1337     U8 *dst;
1338
1339     PERL_ARGS_ASSERT_BYTES_TO_UTF8;
1340     PERL_UNUSED_CONTEXT;
1341
1342     Newx(d, (*len) * 2 + 1, U8);
1343     dst = d;
1344
1345     while (s < send) {
1346         const UV uv = NATIVE_TO_ASCII(*s++);
1347         if (UNI_IS_INVARIANT(uv))
1348             *d++ = (U8)UTF_TO_NATIVE(uv);
1349         else {
1350             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
1351             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
1352         }
1353     }
1354     *d = '\0';
1355     *len = d-dst;
1356     return dst;
1357 }
1358
1359 /*
1360  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
1361  *
1362  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
1363  * We optimize for native, for obvious reasons. */
1364
1365 U8*
1366 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
1367 {
1368     U8* pend;
1369     U8* dstart = d;
1370
1371     PERL_ARGS_ASSERT_UTF16_TO_UTF8;
1372
1373     if (bytelen & 1)
1374         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
1375
1376     pend = p + bytelen;
1377
1378     while (p < pend) {
1379         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
1380         p += 2;
1381         if (uv < 0x80) {
1382 #ifdef EBCDIC
1383             *d++ = UNI_TO_NATIVE(uv);
1384 #else
1385             *d++ = (U8)uv;
1386 #endif
1387             continue;
1388         }
1389         if (uv < 0x800) {
1390             *d++ = (U8)(( uv >>  6)         | 0xc0);
1391             *d++ = (U8)(( uv        & 0x3f) | 0x80);
1392             continue;
1393         }
1394         if (uv >= 0xd800 && uv <= 0xdbff) {     /* surrogates */
1395             if (p >= pend) {
1396                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
1397             } else {
1398                 UV low = (p[0] << 8) + p[1];
1399                 p += 2;
1400                 if (low < 0xdc00 || low > 0xdfff)
1401                     Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
1402                 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
1403             }
1404         } else if (uv >= 0xdc00 && uv <= 0xdfff) {
1405             Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
1406         }
1407         if (uv < 0x10000) {
1408             *d++ = (U8)(( uv >> 12)         | 0xe0);
1409             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
1410             *d++ = (U8)(( uv        & 0x3f) | 0x80);
1411             continue;
1412         }
1413         else {
1414             *d++ = (U8)(( uv >> 18)         | 0xf0);
1415             *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
1416             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
1417             *d++ = (U8)(( uv        & 0x3f) | 0x80);
1418             continue;
1419         }
1420     }
1421     *newlen = d - dstart;
1422     return d;
1423 }
1424
1425 /* Note: this one is slightly destructive of the source. */
1426
1427 U8*
1428 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
1429 {
1430     U8* s = (U8*)p;
1431     U8* const send = s + bytelen;
1432
1433     PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
1434
1435     if (bytelen & 1)
1436         Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf,
1437                    (UV)bytelen);
1438
1439     while (s < send) {
1440         const U8 tmp = s[0];
1441         s[0] = s[1];
1442         s[1] = tmp;
1443         s += 2;
1444     }
1445     return utf16_to_utf8(p, d, bytelen, newlen);
1446 }
1447
1448 /* for now these are all defined (inefficiently) in terms of the utf8 versions.
1449  * Note that the macros in handy.h that call these short-circuit calling them
1450  * for Latin-1 range inputs */
1451
1452 bool
1453 Perl_is_uni_alnum(pTHX_ UV c)
1454 {
1455     U8 tmpbuf[UTF8_MAXBYTES+1];
1456     uvchr_to_utf8(tmpbuf, c);
1457     return is_utf8_alnum(tmpbuf);
1458 }
1459
1460 bool
1461 Perl_is_uni_idfirst(pTHX_ UV c)
1462 {
1463     U8 tmpbuf[UTF8_MAXBYTES+1];
1464     uvchr_to_utf8(tmpbuf, c);
1465     return is_utf8_idfirst(tmpbuf);
1466 }
1467
1468 bool
1469 Perl_is_uni_alpha(pTHX_ UV c)
1470 {
1471     U8 tmpbuf[UTF8_MAXBYTES+1];
1472     uvchr_to_utf8(tmpbuf, c);
1473     return is_utf8_alpha(tmpbuf);
1474 }
1475
1476 bool
1477 Perl_is_uni_ascii(pTHX_ UV c)
1478 {
1479     return isASCII(c);
1480 }
1481
1482 bool
1483 Perl_is_uni_space(pTHX_ UV c)
1484 {
1485     U8 tmpbuf[UTF8_MAXBYTES+1];
1486     uvchr_to_utf8(tmpbuf, c);
1487     return is_utf8_space(tmpbuf);
1488 }
1489
1490 bool
1491 Perl_is_uni_digit(pTHX_ UV c)
1492 {
1493     U8 tmpbuf[UTF8_MAXBYTES+1];
1494     uvchr_to_utf8(tmpbuf, c);
1495     return is_utf8_digit(tmpbuf);
1496 }
1497
1498 bool
1499 Perl_is_uni_upper(pTHX_ UV c)
1500 {
1501     U8 tmpbuf[UTF8_MAXBYTES+1];
1502     uvchr_to_utf8(tmpbuf, c);
1503     return is_utf8_upper(tmpbuf);
1504 }
1505
1506 bool
1507 Perl_is_uni_lower(pTHX_ UV c)
1508 {
1509     U8 tmpbuf[UTF8_MAXBYTES+1];
1510     uvchr_to_utf8(tmpbuf, c);
1511     return is_utf8_lower(tmpbuf);
1512 }
1513
1514 bool
1515 Perl_is_uni_cntrl(pTHX_ UV c)
1516 {
1517     return isCNTRL_L1(c);
1518 }
1519
1520 bool
1521 Perl_is_uni_graph(pTHX_ UV c)
1522 {
1523     U8 tmpbuf[UTF8_MAXBYTES+1];
1524     uvchr_to_utf8(tmpbuf, c);
1525     return is_utf8_graph(tmpbuf);
1526 }
1527
1528 bool
1529 Perl_is_uni_print(pTHX_ UV c)
1530 {
1531     U8 tmpbuf[UTF8_MAXBYTES+1];
1532     uvchr_to_utf8(tmpbuf, c);
1533     return is_utf8_print(tmpbuf);
1534 }
1535
1536 bool
1537 Perl_is_uni_punct(pTHX_ UV c)
1538 {
1539     U8 tmpbuf[UTF8_MAXBYTES+1];
1540     uvchr_to_utf8(tmpbuf, c);
1541     return is_utf8_punct(tmpbuf);
1542 }
1543
1544 bool
1545 Perl_is_uni_xdigit(pTHX_ UV c)
1546 {
1547     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1548     uvchr_to_utf8(tmpbuf, c);
1549     return is_utf8_xdigit(tmpbuf);
1550 }
1551
1552 UV
1553 Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s)
1554 {
1555     /* We have the latin1-range values compiled into the core, so just use
1556      * those, converting the result to utf8.  The only difference between upper
1557      * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
1558      * either "SS" or "Ss".  Which one to use is passed into the routine in
1559      * 'S_or_s' to avoid a test */
1560
1561     UV converted = toUPPER_LATIN1_MOD(c);
1562
1563     PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
1564
1565     assert(S_or_s == 'S' || S_or_s == 's');
1566
1567     if (UNI_IS_INVARIANT(converted)) { /* No difference between the two for
1568                                           characters in this range */
1569         *p = (U8) converted;
1570         *lenp = 1;
1571         return converted;
1572     }
1573
1574     /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
1575      * which it maps to one of them, so as to only have to have one check for
1576      * it in the main case */
1577     if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
1578         switch (c) {
1579             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
1580                 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
1581                 break;
1582             case MICRO_SIGN:
1583                 converted = GREEK_CAPITAL_LETTER_MU;
1584                 break;
1585             case LATIN_SMALL_LETTER_SHARP_S:
1586                 *(p)++ = 'S';
1587                 *p = S_or_s;
1588                 *lenp = 2;
1589                 return 'S';
1590             default:
1591                 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
1592                 /* NOTREACHED */
1593         }
1594     }
1595
1596     *(p)++ = UTF8_TWO_BYTE_HI(converted);
1597     *p = UTF8_TWO_BYTE_LO(converted);
1598     *lenp = 2;
1599
1600     return converted;
1601 }
1602
1603 /* Call the function to convert a UTF-8 encoded character to the specified case.
1604  * Note that there may be more than one character in the result.
1605  * INP is a pointer to the first byte of the input character
1606  * OUTP will be set to the first byte of the string of changed characters.  It
1607  *      needs to have space for UTF8_MAXBYTES_CASE+1 bytes
1608  * LENP will be set to the length in bytes of the string of changed characters
1609  *
1610  * The functions return the ordinal of the first character in the string of OUTP */
1611 #define CALL_UPPER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_toupper, "ToUc", "utf8::ToSpecUc")
1612 #define CALL_TITLE_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_totitle, "ToTc", "utf8::ToSpecTc")
1613 #define CALL_LOWER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tolower, "ToLc", "utf8::ToSpecLc")
1614
1615 /* This additionally has the input parameter SPECIALS, which if non-zero will
1616  * cause this to use the SPECIALS hash for folding (meaning get full case
1617  * folding); otherwise, when zero, this implies a simple case fold */
1618 #define CALL_FOLD_CASE(INP, OUTP, LENP, SPECIALS) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tofold, "ToCf", (SPECIALS) ? "utf8::ToSpecCf" : NULL)
1619
1620 UV
1621 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1622 {
1623     dVAR;
1624
1625     /* Convert the Unicode character whose ordinal is <c> to its uppercase
1626      * version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
1627      * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1628      * the changed version may be longer than the original character.
1629      *
1630      * The ordinal of the first character of the changed version is returned
1631      * (but note, as explained above, that there may be more.) */
1632
1633     PERL_ARGS_ASSERT_TO_UNI_UPPER;
1634
1635     if (c < 256) {
1636         return _to_upper_title_latin1((U8) c, p, lenp, 'S');
1637     }
1638
1639     uvchr_to_utf8(p, c);
1640     return CALL_UPPER_CASE(p, p, lenp);
1641 }
1642
1643 UV
1644 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1645 {
1646     dVAR;
1647
1648     PERL_ARGS_ASSERT_TO_UNI_TITLE;
1649
1650     if (c < 256) {
1651         return _to_upper_title_latin1((U8) c, p, lenp, 's');
1652     }
1653
1654     uvchr_to_utf8(p, c);
1655     return CALL_TITLE_CASE(p, p, lenp);
1656 }
1657
1658 STATIC U8
1659 S_to_lower_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp)
1660 {
1661     /* We have the latin1-range values compiled into the core, so just use
1662      * those, converting the result to utf8.  Since the result is always just
1663      * one character, we allow <p> to be NULL */
1664
1665     U8 converted = toLOWER_LATIN1(c);
1666
1667     if (p != NULL) {
1668         if (UNI_IS_INVARIANT(converted)) {
1669             *p = converted;
1670             *lenp = 1;
1671         }
1672         else {
1673             *p = UTF8_TWO_BYTE_HI(converted);
1674             *(p+1) = UTF8_TWO_BYTE_LO(converted);
1675             *lenp = 2;
1676         }
1677     }
1678     return converted;
1679 }
1680
1681 UV
1682 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1683 {
1684     dVAR;
1685
1686     PERL_ARGS_ASSERT_TO_UNI_LOWER;
1687
1688     if (c < 256) {
1689         return to_lower_latin1((U8) c, p, lenp);
1690     }
1691
1692     uvchr_to_utf8(p, c);
1693     return CALL_LOWER_CASE(p, p, lenp);
1694 }
1695
1696 UV
1697 Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const bool flags)
1698 {
1699     /* Corresponds to to_lower_latin1(), <flags> is TRUE if to use full case
1700      * folding */
1701
1702     UV converted;
1703
1704     PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
1705
1706     if (c == MICRO_SIGN) {
1707         converted = GREEK_SMALL_LETTER_MU;
1708     }
1709     else if (flags && c == LATIN_SMALL_LETTER_SHARP_S) {
1710         *(p)++ = 's';
1711         *p = 's';
1712         *lenp = 2;
1713         return 's';
1714     }
1715     else { /* In this range the fold of all other characters is their lower
1716               case */
1717         converted = toLOWER_LATIN1(c);
1718     }
1719
1720     if (UNI_IS_INVARIANT(converted)) {
1721         *p = (U8) converted;
1722         *lenp = 1;
1723     }
1724     else {
1725         *(p)++ = UTF8_TWO_BYTE_HI(converted);
1726         *p = UTF8_TWO_BYTE_LO(converted);
1727         *lenp = 2;
1728     }
1729
1730     return converted;
1731 }
1732
1733 UV
1734 Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, const bool flags)
1735 {
1736
1737     /* Not currently externally documented, and subject to change, <flags> is
1738      * TRUE iff full folding is to be used */
1739
1740     PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
1741
1742     if (c < 256) {
1743         return _to_fold_latin1((U8) c, p, lenp, flags);
1744     }
1745
1746     uvchr_to_utf8(p, c);
1747     return CALL_FOLD_CASE(p, p, lenp, flags);
1748 }
1749
1750 /* for now these all assume no locale info available for Unicode > 255; and
1751  * the corresponding macros in handy.h (like isALNUM_LC_uvchr) should have been
1752  * called instead, so that these don't get called for < 255 */
1753
1754 bool
1755 Perl_is_uni_alnum_lc(pTHX_ UV c)
1756 {
1757     return is_uni_alnum(c);     /* XXX no locale support yet */
1758 }
1759
1760 bool
1761 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1762 {
1763     return is_uni_idfirst(c);   /* XXX no locale support yet */
1764 }
1765
1766 bool
1767 Perl_is_uni_alpha_lc(pTHX_ UV c)
1768 {
1769     return is_uni_alpha(c);     /* XXX no locale support yet */
1770 }
1771
1772 bool
1773 Perl_is_uni_ascii_lc(pTHX_ UV c)
1774 {
1775     return is_uni_ascii(c);     /* XXX no locale support yet */
1776 }
1777
1778 bool
1779 Perl_is_uni_space_lc(pTHX_ UV c)
1780 {
1781     return is_uni_space(c);     /* XXX no locale support yet */
1782 }
1783
1784 bool
1785 Perl_is_uni_digit_lc(pTHX_ UV c)
1786 {
1787     return is_uni_digit(c);     /* XXX no locale support yet */
1788 }
1789
1790 bool
1791 Perl_is_uni_upper_lc(pTHX_ UV c)
1792 {
1793     return is_uni_upper(c);     /* XXX no locale support yet */
1794 }
1795
1796 bool
1797 Perl_is_uni_lower_lc(pTHX_ UV c)
1798 {
1799     return is_uni_lower(c);     /* XXX no locale support yet */
1800 }
1801
1802 bool
1803 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1804 {
1805     return is_uni_cntrl(c);     /* XXX no locale support yet */
1806 }
1807
1808 bool
1809 Perl_is_uni_graph_lc(pTHX_ UV c)
1810 {
1811     return is_uni_graph(c);     /* XXX no locale support yet */
1812 }
1813
1814 bool
1815 Perl_is_uni_print_lc(pTHX_ UV c)
1816 {
1817     return is_uni_print(c);     /* XXX no locale support yet */
1818 }
1819
1820 bool
1821 Perl_is_uni_punct_lc(pTHX_ UV c)
1822 {
1823     return is_uni_punct(c);     /* XXX no locale support yet */
1824 }
1825
1826 bool
1827 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1828 {
1829     return is_uni_xdigit(c);    /* XXX no locale support yet */
1830 }
1831
1832 U32
1833 Perl_to_uni_upper_lc(pTHX_ U32 c)
1834 {
1835     /* XXX returns only the first character -- do not use XXX */
1836     /* XXX no locale support yet */
1837     STRLEN len;
1838     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1839     return (U32)to_uni_upper(c, tmpbuf, &len);
1840 }
1841
1842 U32
1843 Perl_to_uni_title_lc(pTHX_ U32 c)
1844 {
1845     /* XXX returns only the first character XXX -- do not use XXX */
1846     /* XXX no locale support yet */
1847     STRLEN len;
1848     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1849     return (U32)to_uni_title(c, tmpbuf, &len);
1850 }
1851
1852 U32
1853 Perl_to_uni_lower_lc(pTHX_ U32 c)
1854 {
1855     /* XXX returns only the first character -- do not use XXX */
1856     /* XXX no locale support yet */
1857     STRLEN len;
1858     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1859     return (U32)to_uni_lower(c, tmpbuf, &len);
1860 }
1861
1862 static bool
1863 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
1864                  const char *const swashname)
1865 {
1866     /* returns a boolean giving whether or not the UTF8-encoded character that
1867      * starts at <p> is in the swash indicated by <swashname>.  <swash>
1868      * contains a pointer to where the swash indicated by <swashname>
1869      * is to be stored; which this routine will do, so that future calls will
1870      * look at <*swash> and only generate a swash if it is not null
1871      *
1872      * Note that it is assumed that the buffer length of <p> is enough to
1873      * contain all the bytes that comprise the character.  Thus, <*p> should
1874      * have been checked before this call for mal-formedness enough to assure
1875      * that. */
1876
1877     dVAR;
1878
1879     PERL_ARGS_ASSERT_IS_UTF8_COMMON;
1880
1881     /* The API should have included a length for the UTF-8 character in <p>,
1882      * but it doesn't.  We therefor assume that p has been validated at least
1883      * as far as there being enough bytes available in it to accommodate the
1884      * character without reading beyond the end, and pass that number on to the
1885      * validating routine */
1886     if (!is_utf8_char_buf(p, p + UTF8SKIP(p)))
1887         return FALSE;
1888     if (!*swash)
1889         *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
1890     return swash_fetch(*swash, p, TRUE) != 0;
1891 }
1892
1893 bool
1894 Perl_is_utf8_alnum(pTHX_ const U8 *p)
1895 {
1896     dVAR;
1897
1898     PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
1899
1900     /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1901      * descendant of isalnum(3), in other words, it doesn't
1902      * contain the '_'. --jhi */
1903     return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
1904 }
1905
1906 bool
1907 Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1908 {
1909     dVAR;
1910
1911     PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
1912
1913     if (*p == '_')
1914         return TRUE;
1915     /* is_utf8_idstart would be more logical. */
1916     return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
1917 }
1918
1919 bool
1920 Perl_is_utf8_xidfirst(pTHX_ const U8 *p) /* The naming is historical. */
1921 {
1922     dVAR;
1923
1924     PERL_ARGS_ASSERT_IS_UTF8_XIDFIRST;
1925
1926     if (*p == '_')
1927         return TRUE;
1928     /* is_utf8_idstart would be more logical. */
1929     return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart");
1930 }
1931
1932 bool
1933 Perl__is_utf8__perl_idstart(pTHX_ const U8 *p)
1934 {
1935     dVAR;
1936
1937     PERL_ARGS_ASSERT__IS_UTF8__PERL_IDSTART;
1938
1939     return is_utf8_common(p, &PL_utf8_perl_idstart, "_Perl_IDStart");
1940 }
1941
1942 bool
1943 Perl_is_utf8_idcont(pTHX_ const U8 *p)
1944 {
1945     dVAR;
1946
1947     PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
1948
1949     return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
1950 }
1951
1952 bool
1953 Perl_is_utf8_xidcont(pTHX_ const U8 *p)
1954 {
1955     dVAR;
1956
1957     PERL_ARGS_ASSERT_IS_UTF8_XIDCONT;
1958
1959     return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue");
1960 }
1961
1962 bool
1963 Perl_is_utf8_alpha(pTHX_ const U8 *p)
1964 {
1965     dVAR;
1966
1967     PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
1968
1969     return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
1970 }
1971
1972 bool
1973 Perl_is_utf8_ascii(pTHX_ const U8 *p)
1974 {
1975     dVAR;
1976
1977     PERL_ARGS_ASSERT_IS_UTF8_ASCII;
1978
1979     /* ASCII characters are the same whether in utf8 or not.  So the macro
1980      * works on both utf8 and non-utf8 representations. */
1981     return isASCII(*p);
1982 }
1983
1984 bool
1985 Perl_is_utf8_space(pTHX_ const U8 *p)
1986 {
1987     dVAR;
1988
1989     PERL_ARGS_ASSERT_IS_UTF8_SPACE;
1990
1991     return is_utf8_common(p, &PL_utf8_space, "IsXPerlSpace");
1992 }
1993
1994 bool
1995 Perl_is_utf8_perl_space(pTHX_ const U8 *p)
1996 {
1997     dVAR;
1998
1999     PERL_ARGS_ASSERT_IS_UTF8_PERL_SPACE;
2000
2001     /* Only true if is an ASCII space-like character, and ASCII is invariant
2002      * under utf8, so can just use the macro */
2003     return isSPACE_A(*p);
2004 }
2005
2006 bool
2007 Perl_is_utf8_perl_word(pTHX_ const U8 *p)
2008 {
2009     dVAR;
2010
2011     PERL_ARGS_ASSERT_IS_UTF8_PERL_WORD;
2012
2013     /* Only true if is an ASCII word character, and ASCII is invariant
2014      * under utf8, so can just use the macro */
2015     return isWORDCHAR_A(*p);
2016 }
2017
2018 bool
2019 Perl_is_utf8_digit(pTHX_ const U8 *p)
2020 {
2021     dVAR;
2022
2023     PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
2024
2025     return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
2026 }
2027
2028 bool
2029 Perl_is_utf8_posix_digit(pTHX_ const U8 *p)
2030 {
2031     dVAR;
2032
2033     PERL_ARGS_ASSERT_IS_UTF8_POSIX_DIGIT;
2034
2035     /* Only true if is an ASCII digit character, and ASCII is invariant
2036      * under utf8, so can just use the macro */
2037     return isDIGIT_A(*p);
2038 }
2039
2040 bool
2041 Perl_is_utf8_upper(pTHX_ const U8 *p)
2042 {
2043     dVAR;
2044
2045     PERL_ARGS_ASSERT_IS_UTF8_UPPER;
2046
2047     return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
2048 }
2049
2050 bool
2051 Perl_is_utf8_lower(pTHX_ const U8 *p)
2052 {
2053     dVAR;
2054
2055     PERL_ARGS_ASSERT_IS_UTF8_LOWER;
2056
2057     return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
2058 }
2059
2060 bool
2061 Perl_is_utf8_cntrl(pTHX_ const U8 *p)
2062 {
2063     dVAR;
2064
2065     PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
2066
2067     if (isASCII(*p)) {
2068         return isCNTRL_A(*p);
2069     }
2070
2071     /* All controls are in Latin1 */
2072     if (! UTF8_IS_DOWNGRADEABLE_START(*p)) {
2073         return 0;
2074     }
2075     return isCNTRL_L1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)));
2076 }
2077
2078 bool
2079 Perl_is_utf8_graph(pTHX_ const U8 *p)
2080 {
2081     dVAR;
2082
2083     PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
2084
2085     return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
2086 }
2087
2088 bool
2089 Perl_is_utf8_print(pTHX_ const U8 *p)
2090 {
2091     dVAR;
2092
2093     PERL_ARGS_ASSERT_IS_UTF8_PRINT;
2094
2095     return is_utf8_common(p, &PL_utf8_print, "IsPrint");
2096 }
2097
2098 bool
2099 Perl_is_utf8_punct(pTHX_ const U8 *p)
2100 {
2101     dVAR;
2102
2103     PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
2104
2105     return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
2106 }
2107
2108 bool
2109 Perl_is_utf8_xdigit(pTHX_ const U8 *p)
2110 {
2111     dVAR;
2112
2113     PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
2114
2115     return is_utf8_common(p, &PL_utf8_xdigit, "IsXDigit");
2116 }
2117
2118 bool
2119 Perl_is_utf8_mark(pTHX_ const U8 *p)
2120 {
2121     dVAR;
2122
2123     PERL_ARGS_ASSERT_IS_UTF8_MARK;
2124
2125     return is_utf8_common(p, &PL_utf8_mark, "IsM");
2126 }
2127
2128 bool
2129 Perl_is_utf8_X_begin(pTHX_ const U8 *p)
2130 {
2131     dVAR;
2132
2133     PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN;
2134
2135     return is_utf8_common(p, &PL_utf8_X_begin, "_X_Begin");
2136 }
2137
2138 bool
2139 Perl_is_utf8_X_extend(pTHX_ const U8 *p)
2140 {
2141     dVAR;
2142
2143     PERL_ARGS_ASSERT_IS_UTF8_X_EXTEND;
2144
2145     return is_utf8_common(p, &PL_utf8_X_extend, "_X_Extend");
2146 }
2147
2148 bool
2149 Perl_is_utf8_X_prepend(pTHX_ const U8 *p)
2150 {
2151     dVAR;
2152
2153     PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND;
2154
2155     return is_utf8_common(p, &PL_utf8_X_prepend, "GCB=Prepend");
2156 }
2157
2158 bool
2159 Perl_is_utf8_X_non_hangul(pTHX_ const U8 *p)
2160 {
2161     dVAR;
2162
2163     PERL_ARGS_ASSERT_IS_UTF8_X_NON_HANGUL;
2164
2165     return is_utf8_common(p, &PL_utf8_X_non_hangul, "HST=Not_Applicable");
2166 }
2167
2168 bool
2169 Perl_is_utf8_X_L(pTHX_ const U8 *p)
2170 {
2171     dVAR;
2172
2173     PERL_ARGS_ASSERT_IS_UTF8_X_L;
2174
2175     return is_utf8_common(p, &PL_utf8_X_L, "GCB=L");
2176 }
2177
2178 bool
2179 Perl_is_utf8_X_LV(pTHX_ const U8 *p)
2180 {
2181     dVAR;
2182
2183     PERL_ARGS_ASSERT_IS_UTF8_X_LV;
2184
2185     return is_utf8_common(p, &PL_utf8_X_LV, "GCB=LV");
2186 }
2187
2188 bool
2189 Perl_is_utf8_X_LVT(pTHX_ const U8 *p)
2190 {
2191     dVAR;
2192
2193     PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
2194
2195     return is_utf8_common(p, &PL_utf8_X_LVT, "GCB=LVT");
2196 }
2197
2198 bool
2199 Perl_is_utf8_X_T(pTHX_ const U8 *p)
2200 {
2201     dVAR;
2202
2203     PERL_ARGS_ASSERT_IS_UTF8_X_T;
2204
2205     return is_utf8_common(p, &PL_utf8_X_T, "GCB=T");
2206 }
2207
2208 bool
2209 Perl_is_utf8_X_V(pTHX_ const U8 *p)
2210 {
2211     dVAR;
2212
2213     PERL_ARGS_ASSERT_IS_UTF8_X_V;
2214
2215     return is_utf8_common(p, &PL_utf8_X_V, "GCB=V");
2216 }
2217
2218 bool
2219 Perl_is_utf8_X_LV_LVT_V(pTHX_ const U8 *p)
2220 {
2221     dVAR;
2222
2223     PERL_ARGS_ASSERT_IS_UTF8_X_LV_LVT_V;
2224
2225     return is_utf8_common(p, &PL_utf8_X_LV_LVT_V, "_X_LV_LVT_V");
2226 }
2227
2228 bool
2229 Perl__is_utf8_quotemeta(pTHX_ const U8 *p)
2230 {
2231     /* For exclusive use of pp_quotemeta() */
2232
2233     dVAR;
2234
2235     PERL_ARGS_ASSERT__IS_UTF8_QUOTEMETA;
2236
2237     return is_utf8_common(p, &PL_utf8_quotemeta, "_Perl_Quotemeta");
2238 }
2239
2240 /*
2241 =for apidoc to_utf8_case
2242
2243 The C<p> contains the pointer to the UTF-8 string encoding
2244 the character that is being converted.  This routine assumes that the character
2245 at C<p> is well-formed.
2246
2247 The C<ustrp> is a pointer to the character buffer to put the
2248 conversion result to.  The C<lenp> is a pointer to the length
2249 of the result.
2250
2251 The C<swashp> is a pointer to the swash to use.
2252
2253 Both the special and normal mappings are stored in F<lib/unicore/To/Foo.pl>,
2254 and loaded by SWASHNEW, using F<lib/utf8_heavy.pl>.  The C<special> (usually,
2255 but not always, a multicharacter mapping), is tried first.
2256
2257 The C<special> is a string like "utf8::ToSpecLower", which means the
2258 hash %utf8::ToSpecLower.  The access to the hash is through
2259 Perl_to_utf8_case().
2260
2261 The C<normal> is a string like "ToLower" which means the swash
2262 %utf8::ToLower.
2263
2264 =cut */
2265
2266 UV
2267 Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
2268                         SV **swashp, const char *normal, const char *special)
2269 {
2270     dVAR;
2271     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
2272     STRLEN len = 0;
2273     const UV uv0 = valid_utf8_to_uvchr(p, NULL);
2274     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
2275      * are necessary in EBCDIC, they are redundant no-ops
2276      * in ASCII-ish platforms, and hopefully optimized away. */
2277     const UV uv1 = NATIVE_TO_UNI(uv0);
2278
2279     PERL_ARGS_ASSERT_TO_UTF8_CASE;
2280
2281     /* Note that swash_fetch() doesn't output warnings for these because it
2282      * assumes we will */
2283     if (uv1 >= UNICODE_SURROGATE_FIRST) {
2284         if (uv1 <= UNICODE_SURROGATE_LAST) {
2285             if (ckWARN_d(WARN_SURROGATE)) {
2286                 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
2287                 Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
2288                     "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1);
2289             }
2290         }
2291         else if (UNICODE_IS_SUPER(uv1)) {
2292             if (ckWARN_d(WARN_NON_UNICODE)) {
2293                 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
2294                 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
2295                     "Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1);
2296             }
2297         }
2298
2299         /* Note that non-characters are perfectly legal, so no warning should
2300          * be given */
2301     }
2302
2303     uvuni_to_utf8(tmpbuf, uv1);
2304
2305     if (!*swashp) /* load on-demand */
2306          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
2307
2308     if (special) {
2309          /* It might be "special" (sometimes, but not always,
2310           * a multicharacter mapping) */
2311          HV * const hv = get_hv(special, 0);
2312          SV **svp;
2313
2314          if (hv &&
2315              (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
2316              (*svp)) {
2317              const char *s;
2318
2319               s = SvPV_const(*svp, len);
2320               if (len == 1)
2321                    len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2322               else {
2323 #ifdef EBCDIC
2324                    /* If we have EBCDIC we need to remap the characters
2325                     * since any characters in the low 256 are Unicode
2326                     * code points, not EBCDIC. */
2327                    U8 *t = (U8*)s, *tend = t + len, *d;
2328
2329                    d = tmpbuf;
2330                    if (SvUTF8(*svp)) {
2331                         STRLEN tlen = 0;
2332
2333                         while (t < tend) {
2334                              const UV c = utf8_to_uvchr_buf(t, tend, &tlen);
2335                              if (tlen > 0) {
2336                                   d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
2337                                   t += tlen;
2338                              }
2339                              else
2340                                   break;
2341                         }
2342                    }
2343                    else {
2344                         while (t < tend) {
2345                              d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
2346                              t++;
2347                         }
2348                    }
2349                    len = d - tmpbuf;
2350                    Copy(tmpbuf, ustrp, len, U8);
2351 #else
2352                    Copy(s, ustrp, len, U8);
2353 #endif
2354               }
2355          }
2356     }
2357
2358     if (!len && *swashp) {
2359         const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
2360
2361          if (uv2) {
2362               /* It was "normal" (a single character mapping). */
2363               const UV uv3 = UNI_TO_NATIVE(uv2);
2364               len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2365          }
2366     }
2367
2368     if (!len) /* Neither: just copy.  In other words, there was no mapping
2369                  defined, which means that the code point maps to itself */
2370          len = uvchr_to_utf8(ustrp, uv0) - ustrp;
2371
2372     if (lenp)
2373          *lenp = len;
2374
2375     return len ? valid_utf8_to_uvchr(ustrp, 0) : 0;
2376 }
2377
2378 STATIC UV
2379 S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
2380 {
2381     /* This is called when changing the case of a utf8-encoded character above
2382      * the Latin1 range, and the operation is in locale.  If the result
2383      * contains a character that crosses the 255/256 boundary, disallow the
2384      * change, and return the original code point.  See L<perlfunc/lc> for why;
2385      *
2386      * p        points to the original string whose case was changed; assumed
2387      *          by this routine to be well-formed
2388      * result   the code point of the first character in the changed-case string
2389      * ustrp    points to the changed-case string (<result> represents its first char)
2390      * lenp     points to the length of <ustrp> */
2391
2392     UV original;    /* To store the first code point of <p> */
2393
2394     PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
2395
2396     assert(! UTF8_IS_INVARIANT(*p) && ! UTF8_IS_DOWNGRADEABLE_START(*p));
2397
2398     /* We know immediately if the first character in the string crosses the
2399      * boundary, so can skip */
2400     if (result > 255) {
2401
2402         /* Look at every character in the result; if any cross the
2403         * boundary, the whole thing is disallowed */
2404         U8* s = ustrp + UTF8SKIP(ustrp);
2405         U8* e = ustrp + *lenp;
2406         while (s < e) {
2407             if (UTF8_IS_INVARIANT(*s) || UTF8_IS_DOWNGRADEABLE_START(*s))
2408             {
2409                 goto bad_crossing;
2410             }
2411             s += UTF8SKIP(s);
2412         }
2413
2414         /* Here, no characters crossed, result is ok as-is */
2415         return result;
2416     }
2417
2418 bad_crossing:
2419
2420     /* Failed, have to return the original */
2421     original = valid_utf8_to_uvchr(p, lenp);
2422     Copy(p, ustrp, *lenp, char);
2423     return original;
2424 }
2425
2426 /*
2427 =for apidoc to_utf8_upper
2428
2429 Convert the UTF-8 encoded character at C<p> to its uppercase version and
2430 store that in UTF-8 in C<ustrp> and its length in bytes in C<lenp>.  Note
2431 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
2432 the uppercase version may be longer than the original character.
2433
2434 The first character of the uppercased version is returned
2435 (but note, as explained above, that there may be more.)
2436
2437 The character at C<p> is assumed by this routine to be well-formed.
2438
2439 =cut */
2440
2441 /* Not currently externally documented, and subject to change:
2442  * <flags> is set iff locale semantics are to be used for code points < 256
2443  * <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
2444  *               were used in the calculation; otherwise unchanged. */
2445
2446 UV
2447 Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
2448 {
2449     dVAR;
2450
2451     UV result;
2452
2453     PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
2454
2455     if (UTF8_IS_INVARIANT(*p)) {
2456         if (flags) {
2457             result = toUPPER_LC(*p);
2458         }
2459         else {
2460             return _to_upper_title_latin1(*p, ustrp, lenp, 'S');
2461         }
2462     }
2463     else if UTF8_IS_DOWNGRADEABLE_START(*p) {
2464         if (flags) {
2465             result = toUPPER_LC(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)));
2466         }
2467         else {
2468             return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)),
2469                                           ustrp, lenp, 'S');
2470         }
2471     }
2472     else {  /* utf8, ord above 255 */
2473         result = CALL_UPPER_CASE(p, ustrp, lenp);
2474
2475         if (flags) {
2476             result = check_locale_boundary_crossing(p, result, ustrp, lenp);
2477         }
2478         return result;
2479     }
2480
2481     /* Here, used locale rules.  Convert back to utf8 */
2482     if (UTF8_IS_INVARIANT(result)) {
2483         *ustrp = (U8) result;
2484         *lenp = 1;
2485     }
2486     else {
2487         *ustrp = UTF8_EIGHT_BIT_HI(result);
2488         *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
2489         *lenp = 2;
2490     }
2491
2492     if (tainted_ptr) {
2493         *tainted_ptr = TRUE;
2494     }
2495     return result;
2496 }
2497
2498 /*
2499 =for apidoc to_utf8_title
2500
2501 Convert the UTF-8 encoded character at C<p> to its titlecase version and
2502 store that in UTF-8 in C<ustrp> and its length in bytes in C<lenp>.  Note
2503 that the C<ustrp> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
2504 titlecase version may be longer than the original character.
2505
2506 The first character of the titlecased version is returned
2507 (but note, as explained above, that there may be more.)
2508
2509 The character at C<p> is assumed by this routine to be well-formed.
2510
2511 =cut */
2512
2513 /* Not currently externally documented, and subject to change:
2514  * <flags> is set iff locale semantics are to be used for code points < 256
2515  *         Since titlecase is not defined in POSIX, uppercase is used instead
2516  *         for these/
2517  * <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
2518  *               were used in the calculation; otherwise unchanged. */
2519
2520 UV
2521 Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
2522 {
2523     dVAR;
2524
2525     UV result;
2526
2527     PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
2528
2529     if (UTF8_IS_INVARIANT(*p)) {
2530         if (flags) {
2531             result = toUPPER_LC(*p);
2532         }
2533         else {
2534             return _to_upper_title_latin1(*p, ustrp, lenp, 's');
2535         }
2536     }
2537     else if UTF8_IS_DOWNGRADEABLE_START(*p) {
2538         if (flags) {
2539             result = toUPPER_LC(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)));
2540         }
2541         else {
2542             return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)),
2543                                           ustrp, lenp, 's');
2544         }
2545     }
2546     else {  /* utf8, ord above 255 */
2547         result = CALL_TITLE_CASE(p, ustrp, lenp);
2548
2549         if (flags) {
2550             result = check_locale_boundary_crossing(p, result, ustrp, lenp);
2551         }
2552         return result;
2553     }
2554
2555     /* Here, used locale rules.  Convert back to utf8 */
2556     if (UTF8_IS_INVARIANT(result)) {
2557         *ustrp = (U8) result;
2558         *lenp = 1;
2559     }
2560     else {
2561         *ustrp = UTF8_EIGHT_BIT_HI(result);
2562         *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
2563         *lenp = 2;
2564     }
2565
2566     if (tainted_ptr) {
2567         *tainted_ptr = TRUE;
2568     }
2569     return result;
2570 }
2571
2572 /*
2573 =for apidoc to_utf8_lower
2574
2575 Convert the UTF-8 encoded character at C<p> to its lowercase version and
2576 store that in UTF-8 in ustrp and its length in bytes in C<lenp>.  Note
2577 that the C<ustrp> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
2578 lowercase version may be longer than the original character.
2579
2580 The first character of the lowercased version is returned
2581 (but note, as explained above, that there may be more.)
2582
2583 The character at C<p> is assumed by this routine to be well-formed.
2584
2585 =cut */
2586
2587 /* Not currently externally documented, and subject to change:
2588  * <flags> is set iff locale semantics are to be used for code points < 256
2589  * <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
2590  *               were used in the calculation; otherwise unchanged. */
2591
2592 UV
2593 Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, const bool flags, bool* tainted_ptr)
2594 {
2595     UV result;
2596
2597     dVAR;
2598
2599     PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
2600
2601     if (UTF8_IS_INVARIANT(*p)) {
2602         if (flags) {
2603             result = toLOWER_LC(*p);
2604         }
2605         else {
2606             return to_lower_latin1(*p, ustrp, lenp);
2607         }
2608     }
2609     else if UTF8_IS_DOWNGRADEABLE_START(*p) {
2610         if (flags) {
2611             result = toLOWER_LC(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)));
2612         }
2613         else {
2614             return to_lower_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)),
2615                                    ustrp, lenp);
2616         }
2617     }
2618     else {  /* utf8, ord above 255 */
2619         result = CALL_LOWER_CASE(p, ustrp, lenp);
2620
2621         if (flags) {
2622             result = check_locale_boundary_crossing(p, result, ustrp, lenp);
2623         }
2624
2625         return result;
2626     }
2627
2628     /* Here, used locale rules.  Convert back to utf8 */
2629     if (UTF8_IS_INVARIANT(result)) {
2630         *ustrp = (U8) result;
2631         *lenp = 1;
2632     }
2633     else {
2634         *ustrp = UTF8_EIGHT_BIT_HI(result);
2635         *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
2636         *lenp = 2;
2637     }
2638
2639     if (tainted_ptr) {
2640         *tainted_ptr = TRUE;
2641     }
2642     return result;
2643 }
2644
2645 /*
2646 =for apidoc to_utf8_fold
2647
2648 Convert the UTF-8 encoded character at C<p> to its foldcase version and
2649 store that in UTF-8 in C<ustrp> and its length in bytes in C<lenp>.  Note
2650 that the C<ustrp> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
2651 foldcase version may be longer than the original character (up to
2652 three characters).
2653
2654 The first character of the foldcased version is returned
2655 (but note, as explained above, that there may be more.)
2656
2657 The character at C<p> is assumed by this routine to be well-formed.
2658
2659 =cut */
2660
2661 /* Not currently externally documented, and subject to change,
2662  * in <flags>
2663  *      bit FOLD_FLAGS_LOCALE is set iff locale semantics are to be used for code
2664  *                            points < 256.  Since foldcase is not defined in
2665  *                            POSIX, lowercase is used instead
2666  *      bit FOLD_FLAGS_FULL   is set iff full case folds are to be used;
2667  *                            otherwise simple folds
2668  * <tainted_ptr> if non-null, *tainted_ptr will be set TRUE iff locale rules
2669  *               were used in the calculation; otherwise unchanged. */
2670
2671 UV
2672 Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags, bool* tainted_ptr)
2673 {
2674     dVAR;
2675
2676     UV result;
2677
2678     PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
2679
2680     if (UTF8_IS_INVARIANT(*p)) {
2681         if (flags & FOLD_FLAGS_LOCALE) {
2682             result = toLOWER_LC(*p);
2683         }
2684         else {
2685             return _to_fold_latin1(*p, ustrp, lenp,
2686                                    cBOOL(flags & FOLD_FLAGS_FULL));
2687         }
2688     }
2689     else if UTF8_IS_DOWNGRADEABLE_START(*p) {
2690         if (flags & FOLD_FLAGS_LOCALE) {
2691             result = toLOWER_LC(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)));
2692         }
2693         else {
2694             return _to_fold_latin1(TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)),
2695                                    ustrp, lenp, cBOOL(flags & FOLD_FLAGS_FULL));
2696         }
2697     }
2698     else {  /* utf8, ord above 255 */
2699         result = CALL_FOLD_CASE(p, ustrp, lenp, flags);
2700
2701         if ((flags & FOLD_FLAGS_LOCALE)) {
2702             result = check_locale_boundary_crossing(p, result, ustrp, lenp);
2703         }
2704
2705         return result;
2706     }
2707
2708     /* Here, used locale rules.  Convert back to utf8 */
2709     if (UTF8_IS_INVARIANT(result)) {
2710         *ustrp = (U8) result;
2711         *lenp = 1;
2712     }
2713     else {
2714         *ustrp = UTF8_EIGHT_BIT_HI(result);
2715         *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
2716         *lenp = 2;
2717     }
2718
2719     if (tainted_ptr) {
2720         *tainted_ptr = TRUE;
2721     }
2722     return result;
2723 }
2724
2725 /* Note:
2726  * Returns a "swash" which is a hash described in utf8.c:Perl_swash_fetch().
2727  * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
2728  * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
2729  */
2730
2731 SV*
2732 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
2733 {
2734     PERL_ARGS_ASSERT_SWASH_INIT;
2735
2736     /* Returns a copy of a swash initiated by the called function.  This is the
2737      * public interface, and returning a copy prevents others from doing
2738      * mischief on the original */
2739
2740     return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none, FALSE, NULL, FALSE));
2741 }
2742
2743 SV*
2744 Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none, bool return_if_undef, SV* invlist, bool passed_in_invlist_has_user_defined_property)
2745 {
2746     /* Initialize and return a swash, creating it if necessary.  It does this
2747      * by calling utf8_heavy.pl in the general case.
2748      *
2749      * This interface should only be used by functions that won't destroy or
2750      * adversely change the swash, as doing so affects all other uses of the
2751      * swash in the program; the general public should use 'Perl_swash_init'
2752      * instead.
2753      *
2754      * pkg  is the name of the package that <name> should be in.
2755      * name is the name of the swash to find.  Typically it is a Unicode
2756      *      property name, including user-defined ones
2757      * listsv is a string to initialize the swash with.  It must be of the form
2758      *      documented as the subroutine return value in
2759      *      L<perlunicode/User-Defined Character Properties>
2760      * minbits is the number of bits required to represent each data element.
2761      *      It is '1' for binary properties.
2762      * none I (khw) do not understand this one, but it is used only in tr///.
2763      * return_if_undef is TRUE if the routine shouldn't croak if it can't find
2764      *      the requested property
2765      * invlist is an inversion list to initialize the swash with (or NULL)
2766      * has_user_defined_property is TRUE if <invlist> has some component that
2767      *      came from a user-defined property
2768      *
2769      * Thus there are three possible inputs to find the swash: <name>,
2770      * <listsv>, and <invlist>.  At least one must be specified.  The result
2771      * will be the union of the specified ones, although <listsv>'s various
2772      * actions can intersect, etc. what <name> gives.
2773      *
2774      * <invlist> is only valid for binary properties */
2775
2776     dVAR;
2777     SV* retval = &PL_sv_undef;
2778
2779     assert(listsv != &PL_sv_undef || strNE(name, "") || invlist);
2780     assert(! invlist || minbits == 1);
2781
2782     /* If data was passed in to go out to utf8_heavy to find the swash of, do
2783      * so */
2784     if (listsv != &PL_sv_undef || strNE(name, "")) {
2785         dSP;
2786         const size_t pkg_len = strlen(pkg);
2787         const size_t name_len = strlen(name);
2788         HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
2789         SV* errsv_save;
2790         GV *method;
2791
2792         PERL_ARGS_ASSERT__CORE_SWASH_INIT;
2793
2794         PUSHSTACKi(PERLSI_MAGIC);
2795         ENTER;
2796         SAVEHINTS();
2797         save_re_context();
2798         if (PL_parser && PL_parser->error_count)
2799             SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
2800         method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
2801         if (!method) {  /* demand load utf8 */
2802             ENTER;
2803             errsv_save = newSVsv(ERRSV);
2804             /* It is assumed that callers of this routine are not passing in
2805              * any user derived data.  */
2806             /* Need to do this after save_re_context() as it will set
2807              * PL_tainted to 1 while saving $1 etc (see the code after getrx:
2808              * in Perl_magic_get).  Even line to create errsv_save can turn on
2809              * PL_tainted.  */
2810             SAVEBOOL(PL_tainted);
2811             PL_tainted = 0;
2812             Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
2813                              NULL);
2814             if (!SvTRUE(ERRSV))
2815                 sv_setsv(ERRSV, errsv_save);
2816             SvREFCNT_dec(errsv_save);
2817             LEAVE;
2818         }
2819         SPAGAIN;
2820         PUSHMARK(SP);
2821         EXTEND(SP,5);
2822         mPUSHp(pkg, pkg_len);
2823         mPUSHp(name, name_len);
2824         PUSHs(listsv);
2825         mPUSHi(minbits);
2826         mPUSHi(none);
2827         PUTBACK;
2828         errsv_save = newSVsv(ERRSV);
2829         /* If we already have a pointer to the method, no need to use
2830          * call_method() to repeat the lookup.  */
2831         if (method ? call_sv(MUTABLE_SV(method), G_SCALAR)
2832             : call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR | G_METHOD))
2833         {
2834             retval = *PL_stack_sp--;
2835             SvREFCNT_inc(retval);
2836         }
2837         if (!SvTRUE(ERRSV))
2838             sv_setsv(ERRSV, errsv_save);
2839         SvREFCNT_dec(errsv_save);
2840         LEAVE;
2841         POPSTACK;
2842         if (IN_PERL_COMPILETIME) {
2843             CopHINTS_set(PL_curcop, PL_hints);
2844         }
2845         if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
2846             if (SvPOK(retval))
2847
2848                 /* If caller wants to handle missing properties, let them */
2849                 if (return_if_undef) {
2850                     return NULL;
2851                 }
2852                 Perl_croak(aTHX_
2853                            "Can't find Unicode property definition \"%"SVf"\"",
2854                            SVfARG(retval));
2855             Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
2856         }
2857     } /* End of calling the module to find the swash */
2858
2859     /* Make sure there is an inversion list for binary properties */
2860     if (minbits == 1) {
2861         SV** swash_invlistsvp = NULL;
2862         SV* swash_invlist = NULL;
2863         bool invlist_in_swash_is_valid = FALSE;
2864         HV* swash_hv = NULL;
2865
2866         /* If this operation fetched a swash, get its already existing
2867          * inversion list or create one for it */
2868         if (retval != &PL_sv_undef) {
2869             swash_hv = MUTABLE_HV(SvRV(retval));
2870
2871             swash_invlistsvp = hv_fetchs(swash_hv, "INVLIST", FALSE);
2872             if (swash_invlistsvp) {
2873                 swash_invlist = *swash_invlistsvp;
2874                 invlist_in_swash_is_valid = TRUE;
2875             }
2876             else {
2877                 swash_invlist = _swash_to_invlist(retval);
2878             }
2879         }
2880
2881         /* If an inversion list was passed in, have to include it */
2882         if (invlist) {
2883
2884             /* Any fetched swash will by now have an inversion list in it;
2885              * otherwise <swash_invlist>  will be NULL, indicating that we
2886              * didn't fetch a swash */
2887             if (swash_invlist) {
2888
2889                 /* Add the passed-in inversion list, which invalidates the one
2890                  * already stored in the swash */
2891                 invlist_in_swash_is_valid = FALSE;
2892                 _invlist_union(invlist, swash_invlist, &swash_invlist);
2893             }
2894             else {
2895
2896                 /* Here, there is no swash already.  Set up a minimal one */
2897                 swash_hv = newHV();
2898                 retval = newRV_inc(MUTABLE_SV(swash_hv));
2899                 swash_invlist = invlist;
2900             }
2901
2902             if (passed_in_invlist_has_user_defined_property) {
2903                 if (! hv_stores(swash_hv, "USER_DEFINED", newSVuv(1))) {
2904                     Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
2905                 }
2906             }
2907         }
2908
2909         /* Here, we have computed the union of all the passed-in data.  It may
2910          * be that there was an inversion list in the swash which didn't get
2911          * touched; otherwise save the one computed one */
2912         if (! invlist_in_swash_is_valid) {
2913             if (! hv_stores(MUTABLE_HV(SvRV(retval)), "INVLIST", swash_invlist))
2914             {
2915                 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
2916             }
2917         }
2918     }
2919
2920     return retval;
2921 }
2922
2923
2924 /* This API is wrong for special case conversions since we may need to
2925  * return several Unicode characters for a single Unicode character
2926  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
2927  * the lower-level routine, and it is similarly broken for returning
2928  * multiple values.  --jhi
2929  * For those, you should use to_utf8_case() instead */
2930 /* Now SWASHGET is recasted into S_swatch_get in this file. */
2931
2932 /* Note:
2933  * Returns the value of property/mapping C<swash> for the first character
2934  * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
2935  * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
2936  * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
2937  *
2938  * A "swash" is a hash which contains initially the keys/values set up by
2939  * SWASHNEW.  The purpose is to be able to completely represent a Unicode
2940  * property for all possible code points.  Things are stored in a compact form
2941  * (see utf8_heavy.pl) so that calculation is required to find the actual
2942  * property value for a given code point.  As code points are looked up, new
2943  * key/value pairs are added to the hash, so that the calculation doesn't have
2944  * to ever be re-done.  Further, each calculation is done, not just for the
2945  * desired one, but for a whole block of code points adjacent to that one.
2946  * For binary properties on ASCII machines, the block is usually for 64 code
2947  * points, starting with a code point evenly divisible by 64.  Thus if the
2948  * property value for code point 257 is requested, the code goes out and
2949  * calculates the property values for all 64 code points between 256 and 319,
2950  * and stores these as a single 64-bit long bit vector, called a "swatch",
2951  * under the key for code point 256.  The key is the UTF-8 encoding for code
2952  * point 256, minus the final byte.  Thus, if the length of the UTF-8 encoding
2953  * for a code point is 13 bytes, the key will be 12 bytes long.  If the value
2954  * for code point 258 is then requested, this code realizes that it would be
2955  * stored under the key for 256, and would find that value and extract the
2956  * relevant bit, offset from 256.
2957  *
2958  * Non-binary properties are stored in as many bits as necessary to represent
2959  * their values (32 currently, though the code is more general than that), not
2960  * as single bits, but the principal is the same: the value for each key is a
2961  * vector that encompasses the property values for all code points whose UTF-8
2962  * representations are represented by the key.  That is, for all code points
2963  * whose UTF-8 representations are length N bytes, and the key is the first N-1
2964  * bytes of that.
2965  */
2966 UV
2967 Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
2968 {
2969     dVAR;
2970     HV *const hv = MUTABLE_HV(SvRV(swash));
2971     U32 klen;
2972     U32 off;
2973     STRLEN slen;
2974     STRLEN needents;
2975     const U8 *tmps = NULL;
2976     U32 bit;
2977     SV *swatch;
2978     U8 tmputf8[2];
2979     const UV c = NATIVE_TO_ASCII(*ptr);
2980
2981     PERL_ARGS_ASSERT_SWASH_FETCH;
2982
2983     /* Convert to utf8 if not already */
2984     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
2985         tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
2986         tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
2987         ptr = tmputf8;
2988     }
2989     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
2990      * then the "swatch" is a vec() for all the chars which start
2991      * with 0xAA..0xYY
2992      * So the key in the hash (klen) is length of encoded char -1
2993      */
2994     klen = UTF8SKIP(ptr) - 1;
2995     off  = ptr[klen];
2996
2997     if (klen == 0) {
2998       /* If char is invariant then swatch is for all the invariant chars
2999        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
3000        */
3001         needents = UTF_CONTINUATION_MARK;
3002         off      = NATIVE_TO_UTF(ptr[klen]);
3003     }
3004     else {
3005       /* If char is encoded then swatch is for the prefix */
3006         needents = (1 << UTF_ACCUMULATION_SHIFT);
3007         off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
3008     }
3009
3010     /*
3011      * This single-entry cache saves about 1/3 of the utf8 overhead in test
3012      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
3013      * it's nothing to sniff at.)  Pity we usually come through at least
3014      * two function calls to get here...
3015      *
3016      * NB: this code assumes that swatches are never modified, once generated!
3017      */
3018
3019     if (hv   == PL_last_swash_hv &&
3020         klen == PL_last_swash_klen &&
3021         (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
3022     {
3023         tmps = PL_last_swash_tmps;
3024         slen = PL_last_swash_slen;
3025     }
3026     else {
3027         /* Try our second-level swatch cache, kept in a hash. */
3028         SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
3029
3030         /* If not cached, generate it via swatch_get */
3031         if (!svp || !SvPOK(*svp)
3032                  || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
3033             /* We use utf8n_to_uvuni() as we want an index into
3034                Unicode tables, not a native character number.
3035              */
3036             const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
3037                                            ckWARN(WARN_UTF8) ?
3038                                            0 : UTF8_ALLOW_ANY);
3039             swatch = swatch_get(swash,
3040                     /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
3041                                 (klen) ? (code_point & ~((UV)needents - 1)) : 0,
3042                                 needents);
3043
3044             if (IN_PERL_COMPILETIME)
3045                 CopHINTS_set(PL_curcop, PL_hints);
3046
3047             svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
3048
3049             if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
3050                      || (slen << 3) < needents)
3051                 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch, "
3052                            "svp=%p, tmps=%p, slen=%"UVuf", needents=%"UVuf,
3053                            svp, tmps, (UV)slen, (UV)needents);
3054         }
3055
3056         PL_last_swash_hv = hv;
3057         assert(klen <= sizeof(PL_last_swash_key));
3058         PL_last_swash_klen = (U8)klen;
3059         /* FIXME change interpvar.h?  */
3060         PL_last_swash_tmps = (U8 *) tmps;
3061         PL_last_swash_slen = slen;
3062         if (klen)
3063             Copy(ptr, PL_last_swash_key, klen, U8);
3064     }
3065
3066     if (UTF8_IS_SUPER(ptr) && ckWARN_d(WARN_NON_UNICODE)) {
3067         SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
3068
3069         /* This outputs warnings for binary properties only, assuming that
3070          * to_utf8_case() will output any for non-binary.  Also, surrogates
3071          * aren't checked for, as that would warn on things like /\p{Gc=Cs}/ */
3072
3073         if (! bitssvp || SvUV(*bitssvp) == 1) {
3074             /* User-defined properties can silently match above-Unicode */
3075             SV** const user_defined_svp = hv_fetchs(hv, "USER_DEFINED", FALSE);
3076             if (! user_defined_svp || ! SvUV(*user_defined_svp)) {
3077                 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0, 0);
3078                 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
3079                     "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", code_point);
3080             }
3081         }
3082     }
3083
3084     switch ((int)((slen << 3) / needents)) {
3085     case 1:
3086         bit = 1 << (off & 7);
3087         off >>= 3;
3088         return (tmps[off] & bit) != 0;
3089     case 8:
3090         return tmps[off];
3091     case 16:
3092         off <<= 1;
3093         return (tmps[off] << 8) + tmps[off + 1] ;
3094     case 32:
3095         off <<= 2;
3096         return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
3097     }
3098     Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, "
3099                "slen=%"UVuf", needents=%"UVuf, (UV)slen, (UV)needents);
3100     NORETURN_FUNCTION_END;
3101 }
3102
3103 /* Read a single line of the main body of the swash input text.  These are of
3104  * the form:
3105  * 0053 0056    0073
3106  * where each number is hex.  The first two numbers form the minimum and
3107  * maximum of a range, and the third is the value associated with the range.
3108  * Not all swashes should have a third number
3109  *
3110  * On input: l    points to the beginning of the line to be examined; it points
3111  *                to somewhere in the string of the whole input text, and is
3112  *                terminated by a \n or the null string terminator.
3113  *           lend   points to the null terminator of that string
3114  *           wants_value    is non-zero if the swash expects a third number
3115  *           typestr is the name of the swash's mapping, like 'ToLower'
3116  * On output: *min, *max, and *val are set to the values read from the line.
3117  *            returns a pointer just beyond the line examined.  If there was no
3118  *            valid min number on the line, returns lend+1
3119  */
3120
3121 STATIC U8*
3122 S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
3123                              const bool wants_value, const U8* const typestr)
3124 {
3125     const int  typeto  = typestr[0] == 'T' && typestr[1] == 'o';
3126     STRLEN numlen;          /* Length of the number */
3127     I32 flags = PERL_SCAN_SILENT_ILLDIGIT
3128                 | PERL_SCAN_DISALLOW_PREFIX
3129                 | PERL_SCAN_SILENT_NON_PORTABLE;
3130
3131     /* nl points to the next \n in the scan */
3132     U8* const nl = (U8*)memchr(l, '\n', lend - l);
3133
3134     /* Get the first number on the line: the range minimum */
3135     numlen = lend - l;
3136     *min = grok_hex((char *)l, &numlen, &flags, NULL);
3137     if (numlen)     /* If found a hex number, position past it */
3138         l += numlen;
3139     else if (nl) {          /* Else, go handle next line, if any */
3140         return nl + 1;  /* 1 is length of "\n" */
3141     }
3142     else {              /* Else, no next line */
3143         return lend + 1;        /* to LIST's end at which \n is not found */
3144     }
3145
3146     /* The max range value follows, separated by a BLANK */
3147     if (isBLANK(*l)) {
3148         ++l;
3149         flags = PERL_SCAN_SILENT_ILLDIGIT
3150                 | PERL_SCAN_DISALLOW_PREFIX
3151                 | PERL_SCAN_SILENT_NON_PORTABLE;
3152         numlen = lend - l;
3153         *max = grok_hex((char *)l, &numlen, &flags, NULL);
3154         if (numlen)
3155             l += numlen;
3156         else    /* If no value here, it is a single element range */
3157             *max = *min;
3158
3159         /* Non-binary tables have a third entry: what the first element of the
3160          * range maps to */
3161         if (wants_value) {
3162             if (isBLANK(*l)) {
3163                 ++l;
3164
3165                 /* The ToLc, etc table mappings are not in hex, and must be
3166                  * corrected by adding the code point to them */
3167                 if (typeto) {
3168                     char *after_strtol = (char *) lend;
3169                     *val = Strtol((char *)l, &after_strtol, 10);
3170                     l = (U8 *) after_strtol;
3171                 }
3172                 else { /* Other tables are in hex, and are the correct result
3173                           without tweaking */
3174                     flags = PERL_SCAN_SILENT_ILLDIGIT
3175                         | PERL_SCAN_DISALLOW_PREFIX
3176                         | PERL_SCAN_SILENT_NON_PORTABLE;
3177                     numlen = lend - l;
3178                     *val = grok_hex((char *)l, &numlen, &flags, NULL);
3179                     if (numlen)
3180                         l += numlen;
3181                     else
3182                         *val = 0;
3183                 }
3184             }
3185             else {
3186                 *val = 0;
3187                 if (typeto) {
3188                     /* diag_listed_as: To%s: illegal mapping '%s' */
3189                     Perl_croak(aTHX_ "%s: illegal mapping '%s'",
3190                                      typestr, l);
3191                 }
3192             }
3193         }
3194         else
3195             *val = 0; /* bits == 1, then any val should be ignored */
3196     }
3197     else { /* Nothing following range min, should be single element with no
3198               mapping expected */
3199         *max = *min;
3200         if (wants_value) {
3201             *val = 0;
3202             if (typeto) {
3203                 /* diag_listed_as: To%s: illegal mapping '%s' */
3204                 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
3205             }
3206         }
3207         else
3208             *val = 0; /* bits == 1, then val should be ignored */
3209     }
3210
3211     /* Position to next line if any, or EOF */
3212     if (nl)
3213         l = nl + 1;
3214     else
3215         l = lend;
3216
3217     return l;
3218 }
3219
3220 /* Note:
3221  * Returns a swatch (a bit vector string) for a code point sequence
3222  * that starts from the value C<start> and comprises the number C<span>.
3223  * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
3224  * Should be used via swash_fetch, which will cache the swatch in C<swash>.
3225  */
3226 STATIC SV*
3227 S_swatch_get(pTHX_ SV* swash, UV start, UV span)
3228 {
3229     SV *swatch;
3230     U8 *l, *lend, *x, *xend, *s, *send;
3231     STRLEN lcur, xcur, scur;
3232     HV *const hv = MUTABLE_HV(SvRV(swash));
3233     SV** const invlistsvp = hv_fetchs(hv, "INVLIST", FALSE);
3234
3235     SV** listsvp = NULL; /* The string containing the main body of the table */
3236     SV** extssvp = NULL;
3237     SV** invert_it_svp = NULL;
3238     U8* typestr = NULL;
3239     STRLEN bits;
3240     STRLEN octets; /* if bits == 1, then octets == 0 */
3241     UV  none;
3242     UV  end = start + span;
3243
3244     if (invlistsvp == NULL) {
3245         SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
3246         SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
3247         SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
3248         extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
3249         listsvp = hv_fetchs(hv, "LIST", FALSE);
3250         invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
3251
3252         bits  = SvUV(*bitssvp);
3253         none  = SvUV(*nonesvp);
3254         typestr = (U8*)SvPV_nolen(*typesvp);
3255     }
3256     else {
3257         bits = 1;
3258         none = 0;
3259     }
3260     octets = bits >> 3; /* if bits == 1, then octets == 0 */
3261
3262     PERL_ARGS_ASSERT_SWATCH_GET;
3263
3264     if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
3265         Perl_croak(aTHX_ "panic: swatch_get doesn't expect bits %"UVuf,
3266                                                  (UV)bits);
3267     }
3268
3269     /* If overflowed, use the max possible */
3270     if (end < start) {
3271         end = UV_MAX;
3272         span = end - start;
3273     }
3274
3275     /* create and initialize $swatch */
3276     scur   = octets ? (span * octets) : (span + 7) / 8;
3277     swatch = newSV(scur);
3278     SvPOK_on(swatch);
3279     s = (U8*)SvPVX(swatch);
3280     if (octets && none) {
3281         const U8* const e = s + scur;
3282         while (s < e) {
3283             if (bits == 8)
3284                 *s++ = (U8)(none & 0xff);
3285             else if (bits == 16) {
3286                 *s++ = (U8)((none >>  8) & 0xff);
3287                 *s++ = (U8)( none        & 0xff);
3288             }
3289             else if (bits == 32) {
3290                 *s++ = (U8)((none >> 24) & 0xff);
3291                 *s++ = (U8)((none >> 16) & 0xff);
3292                 *s++ = (U8)((none >>  8) & 0xff);
3293                 *s++ = (U8)( none        & 0xff);
3294             }
3295         }
3296         *s = '\0';
3297     }
3298     else {
3299         (void)memzero((U8*)s, scur + 1);
3300     }
3301     SvCUR_set(swatch, scur);
3302     s = (U8*)SvPVX(swatch);
3303
3304     if (invlistsvp) {   /* If has an inversion list set up use that */
3305         _invlist_populate_swatch(*invlistsvp, start, end, s);
3306         return swatch;
3307     }
3308
3309     /* read $swash->{LIST} */
3310     l = (U8*)SvPV(*listsvp, lcur);
3311     lend = l + lcur;
3312     while (l < lend) {
3313         UV min, max, val, upper;
3314         l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
3315                                          cBOOL(octets), typestr);
3316         if (l > lend) {
3317             break;
3318         }
3319
3320         /* If looking for something beyond this range, go try the next one */
3321         if (max < start)
3322             continue;
3323
3324         /* <end> is generally 1 beyond where we want to set things, but at the
3325          * platform's infinity, where we can't go any higher, we want to
3326          * include the code point at <end> */
3327         upper = (max < end)
3328                 ? max
3329                 : (max != UV_MAX || end != UV_MAX)
3330                   ? end - 1
3331                   : end;
3332
3333         if (octets) {
3334             UV key;
3335             if (min < start) {
3336                 if (!none || val < none) {
3337                     val += start - min;
3338                 }
3339                 min = start;
3340             }
3341             for (key = min; key <= upper; key++) {
3342                 STRLEN offset;
3343                 /* offset must be non-negative (start <= min <= key < end) */
3344                 offset = octets * (key - start);
3345                 if (bits == 8)
3346                     s[offset] = (U8)(val & 0xff);
3347                 else if (bits == 16) {
3348                     s[offset    ] = (U8)((val >>  8) & 0xff);
3349                     s[offset + 1] = (U8)( val        & 0xff);
3350                 }
3351                 else if (bits == 32) {
3352                     s[offset    ] = (U8)((val >> 24) & 0xff);
3353                     s[offset + 1] = (U8)((val >> 16) & 0xff);
3354                     s[offset + 2] = (U8)((val >>  8) & 0xff);
3355                     s[offset + 3] = (U8)( val        & 0xff);
3356                 }
3357
3358                 if (!none || val < none)
3359                     ++val;
3360             }
3361         }
3362         else { /* bits == 1, then val should be ignored */
3363             UV key;
3364             if (min < start)
3365                 min = start;
3366
3367             for (key = min; key <= upper; key++) {
3368                 const STRLEN offset = (STRLEN)(key - start);
3369                 s[offset >> 3] |= 1 << (offset & 7);
3370             }
3371         }
3372     } /* while */
3373
3374     /* Invert if the data says it should be.  Assumes that bits == 1 */
3375     if (invert_it_svp && SvUV(*invert_it_svp)) {
3376
3377         /* Unicode properties should come with all bits above PERL_UNICODE_MAX
3378          * be 0, and their inversion should also be 0, as we don't succeed any
3379          * Unicode property matches for non-Unicode code points */
3380         if (start <= PERL_UNICODE_MAX) {
3381
3382             /* The code below assumes that we never cross the
3383              * Unicode/above-Unicode boundary in a range, as otherwise we would
3384              * have to figure out where to stop flipping the bits.  Since this
3385              * boundary is divisible by a large power of 2, and swatches comes
3386              * in small powers of 2, this should be a valid assumption */
3387             assert(start + span - 1 <= PERL_UNICODE_MAX);
3388
3389             send = s + scur;
3390             while (s < send) {
3391                 *s = ~(*s);
3392                 s++;
3393             }
3394         }
3395     }
3396
3397     /* read $swash->{EXTRAS}
3398      * This code also copied to swash_to_invlist() below */
3399     x = (U8*)SvPV(*extssvp, xcur);
3400     xend = x + xcur;
3401     while (x < xend) {
3402         STRLEN namelen;
3403         U8 *namestr;
3404         SV** othersvp;
3405         HV* otherhv;
3406         STRLEN otherbits;
3407         SV **otherbitssvp, *other;
3408         U8 *s, *o, *nl;
3409         STRLEN slen, olen;
3410
3411         const U8 opc = *x++;
3412         if (opc == '\n')
3413             continue;
3414
3415         nl = (U8*)memchr(x, '\n', xend - x);
3416
3417         if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
3418             if (nl) {
3419                 x = nl + 1; /* 1 is length of "\n" */
3420                 continue;
3421             }
3422             else {
3423                 x = xend; /* to EXTRAS' end at which \n is not found */
3424                 break;
3425             }
3426         }
3427
3428         namestr = x;
3429         if (nl) {
3430             namelen = nl - namestr;
3431             x = nl + 1;
3432         }
3433         else {
3434             namelen = xend - namestr;
3435             x = xend;
3436         }
3437
3438         othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
3439         otherhv = MUTABLE_HV(SvRV(*othersvp));
3440         otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
3441         otherbits = (STRLEN)SvUV(*otherbitssvp);
3442         if (bits < otherbits)
3443             Perl_croak(aTHX_ "panic: swatch_get found swatch size mismatch, "
3444                        "bits=%"UVuf", otherbits=%"UVuf, (UV)bits, (UV)otherbits);
3445
3446         /* The "other" swatch must be destroyed after. */
3447         other = swatch_get(*othersvp, start, span);
3448         o = (U8*)SvPV(other, olen);
3449
3450         if (!olen)
3451             Perl_croak(aTHX_ "panic: swatch_get got improper swatch");
3452
3453         s = (U8*)SvPV(swatch, slen);
3454         if (bits == 1 && otherbits == 1) {
3455             if (slen != olen)
3456                 Perl_croak(aTHX_ "panic: swatch_get found swatch length "
3457                            "mismatch, slen=%"UVuf", olen=%"UVuf,
3458                            (UV)slen, (UV)olen);
3459
3460             switch (opc) {
3461             case '+':
3462                 while (slen--)
3463                     *s++ |= *o++;
3464                 break;
3465             case '!':
3466                 while (slen--)
3467                     *s++ |= ~*o++;
3468                 break;
3469             case '-':
3470                 while (slen--)
3471                     *s++ &= ~*o++;
3472                 break;
3473             case '&':
3474                 while (slen--)
3475                     *s++ &= *o++;
3476                 break;
3477             default:
3478                 break;
3479             }
3480         }
3481         else {
3482             STRLEN otheroctets = otherbits >> 3;
3483             STRLEN offset = 0;
3484             U8* const send = s + slen;
3485
3486             while (s < send) {
3487                 UV otherval = 0;
3488
3489                 if (otherbits == 1) {
3490                     otherval = (o[offset >> 3] >> (offset & 7)) & 1;
3491                     ++offset;
3492                 }
3493                 else {
3494                     STRLEN vlen = otheroctets;
3495                     otherval = *o++;
3496                     while (--vlen) {
3497                         otherval <<= 8;
3498                         otherval |= *o++;
3499                     }
3500                 }
3501
3502                 if (opc == '+' && otherval)
3503                     NOOP;   /* replace with otherval */
3504                 else if (opc == '!' && !otherval)
3505                     otherval = 1;
3506                 else if (opc == '-' && otherval)
3507                     otherval = 0;
3508                 else if (opc == '&' && !otherval)
3509                     otherval = 0;
3510                 else {
3511                     s += octets; /* no replacement */
3512                     continue;
3513                 }
3514
3515                 if (bits == 8)
3516                     *s++ = (U8)( otherval & 0xff);
3517                 else if (bits == 16) {
3518                     *s++ = (U8)((otherval >>  8) & 0xff);
3519                     *s++ = (U8)( otherval        & 0xff);
3520                 }
3521                 else if (bits == 32) {
3522                     *s++ = (U8)((otherval >> 24) & 0xff);
3523                     *s++ = (U8)((otherval >> 16) & 0xff);
3524                     *s++ = (U8)((otherval >>  8) & 0xff);
3525                     *s++ = (U8)( otherval        & 0xff);
3526                 }
3527             }
3528         }
3529         sv_free(other); /* through with it! */
3530     } /* while */
3531     return swatch;
3532 }
3533
3534 HV*
3535 Perl__swash_inversion_hash(pTHX_ SV* const swash)
3536 {
3537
3538    /* Subject to change or removal.  For use only in one place in regcomp.c.
3539     * Can't be used on a property that is subject to user override, as it
3540     * relies on the value of SPECIALS in the swash which would be set by
3541     * utf8_heavy.pl to the hash in the non-overriden file, and hence is not set
3542     * for overridden properties
3543     *
3544     * Returns a hash which is the inversion and closure of a swash mapping.
3545     * For example, consider the input lines:
3546     * 004B              006B
3547     * 004C              006C
3548     * 212A              006B
3549     *
3550     * The returned hash would have two keys, the utf8 for 006B and the utf8 for
3551     * 006C.  The value for each key is an array.  For 006C, the array would
3552     * have a two elements, the utf8 for itself, and for 004C.  For 006B, there
3553     * would be three elements in its array, the utf8 for 006B, 004B and 212A.
3554     *
3555     * Essentially, for any code point, it gives all the code points that map to
3556     * it, or the list of 'froms' for that point.
3557     *
3558     * Currently it ignores any additions or deletions from other swashes,
3559     * looking at just the main body of the swash, and if there are SPECIALS
3560     * in the swash, at that hash
3561     *
3562     * The specials hash can be extra code points, and most likely consists of
3563     * maps from single code points to multiple ones (each expressed as a string
3564     * of utf8 characters).   This function currently returns only 1-1 mappings.
3565     * However consider this possible input in the specials hash:
3566     * "\xEF\xAC\x85" => "\x{0073}\x{0074}",         # U+FB05 => 0073 0074
3567     * "\xEF\xAC\x86" => "\x{0073}\x{0074}",         # U+FB06 => 0073 0074
3568     *
3569     * Both FB05 and FB06 map to the same multi-char sequence, which we don't
3570     * currently handle.  But it also means that FB05 and FB06 are equivalent in
3571     * a 1-1 mapping which we should handle, and this relationship may not be in
3572     * the main table.  Therefore this function examines all the multi-char
3573     * sequences and adds the 1-1 mappings that come out of that.  */
3574
3575     U8 *l, *lend;
3576     STRLEN lcur;
3577     HV *const hv = MUTABLE_HV(SvRV(swash));
3578
3579     /* The string containing the main body of the table */
3580     SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
3581
3582     SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
3583     SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
3584     SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
3585     /*SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);*/
3586     const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
3587     const STRLEN bits  = SvUV(*bitssvp);
3588     const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
3589     const UV     none  = SvUV(*nonesvp);
3590     SV **specials_p = hv_fetchs(hv, "SPECIALS", 0);
3591
3592     HV* ret = newHV();
3593
3594     PERL_ARGS_ASSERT__SWASH_INVERSION_HASH;
3595
3596     /* Must have at least 8 bits to get the mappings */
3597     if (bits != 8 && bits != 16 && bits != 32) {
3598         Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %"UVuf,
3599                                                  (UV)bits);
3600     }
3601
3602     if (specials_p) { /* It might be "special" (sometimes, but not always, a
3603                         mapping to more than one character */
3604
3605         /* Construct an inverse mapping hash for the specials */
3606         HV * const specials_hv = MUTABLE_HV(SvRV(*specials_p));
3607         HV * specials_inverse = newHV();
3608         char *char_from; /* the lhs of the map */
3609         I32 from_len;   /* its byte length */
3610         char *char_to;  /* the rhs of the map */
3611         I32 to_len;     /* its byte length */
3612         SV *sv_to;      /* and in a sv */
3613         AV* from_list;  /* list of things that map to each 'to' */
3614
3615         hv_iterinit(specials_hv);
3616
3617         /* The keys are the characters (in utf8) that map to the corresponding
3618          * utf8 string value.  Iterate through the list creating the inverse
3619          * list. */
3620         while ((sv_to = hv_iternextsv(specials_hv, &char_from, &from_len))) {
3621             SV** listp;
3622             if (! SvPOK(sv_to)) {
3623                 Perl_croak(aTHX_ "panic: value returned from hv_iternextsv() "
3624                            "unexpectedly is not a string, flags=%lu",
3625                            (unsigned long)SvFLAGS(sv_to));
3626             }
3627             /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %"UVXf", First char of to is %"UVXf"\n", valid_utf8_to_uvchr((U8*) char_from, 0), valid_utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/
3628
3629             /* Each key in the inverse list is a mapped-to value, and the key's
3630              * hash value is a list of the strings (each in utf8) that map to
3631              * it.  Those strings are all one character long */
3632             if ((listp = hv_fetch(specials_inverse,
3633                                     SvPVX(sv_to),
3634                                     SvCUR(sv_to), 0)))
3635             {
3636                 from_list = (AV*) *listp;
3637             }
3638             else { /* No entry yet for it: create one */
3639                 from_list = newAV();
3640                 if (! hv_store(specials_inverse,
3641                                 SvPVX(sv_to),
3642                                 SvCUR(sv_to),
3643                                 (SV*) from_list, 0))
3644                 {
3645                     Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3646                 }
3647             }
3648
3649             /* Here have the list associated with this 'to' (perhaps newly
3650              * created and empty).  Just add to it.  Note that we ASSUME that
3651              * the input is guaranteed to not have duplications, so we don't
3652              * check for that.  Duplications just slow down execution time. */
3653             av_push(from_list, newSVpvn_utf8(char_from, from_len, TRUE));
3654         }
3655
3656         /* Here, 'specials_inverse' contains the inverse mapping.  Go through
3657          * it looking for cases like the FB05/FB06 examples above.  There would
3658          * be an entry in the hash like
3659         *       'st' => [ FB05, FB06 ]
3660         * In this example we will create two lists that get stored in the
3661         * returned hash, 'ret':
3662         *       FB05 => [ FB05, FB06 ]
3663         *       FB06 => [ FB05, FB06 ]
3664         *
3665         * Note that there is nothing to do if the array only has one element.
3666         * (In the normal 1-1 case handled below, we don't have to worry about
3667         * two lists, as everything gets tied to the single list that is
3668         * generated for the single character 'to'.  But here, we are omitting
3669         * that list, ('st' in the example), so must have multiple lists.) */
3670         while ((from_list = (AV *) hv_iternextsv(specials_inverse,
3671                                                  &char_to, &to_len)))
3672         {
3673             if (av_len(from_list) > 0) {
3674                 int i;
3675
3676                 /* We iterate over all combinations of i,j to place each code
3677                  * point on each list */
3678                 for (i = 0; i <= av_len(from_list); i++) {
3679                     int j;
3680                     AV* i_list = newAV();
3681                     SV** entryp = av_fetch(from_list, i, FALSE);
3682                     if (entryp == NULL) {
3683                         Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
3684                     }
3685                     if (hv_fetch(ret, SvPVX(*entryp), SvCUR(*entryp), FALSE)) {
3686                         Perl_croak(aTHX_ "panic: unexpected entry for %s", SvPVX(*entryp));
3687                     }
3688                     if (! hv_store(ret, SvPVX(*entryp), SvCUR(*entryp),
3689                                    (SV*) i_list, FALSE))
3690                     {
3691                         Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3692                     }
3693
3694                     /* For debugging: UV u = valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/
3695                     for (j = 0; j <= av_len(from_list); j++) {
3696                         entryp = av_fetch(from_list, j, FALSE);
3697                         if (entryp == NULL) {
3698                             Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
3699                         }
3700
3701                         /* When i==j this adds itself to the list */
3702                         av_push(i_list, newSVuv(utf8_to_uvchr_buf(
3703                                         (U8*) SvPVX(*entryp),
3704                                         (U8*) SvPVX(*entryp) + SvCUR(*entryp),
3705                                         0)));
3706                         /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Adding %"UVXf" to list for %"UVXf"\n", valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0), u));*/
3707                     }
3708                 }
3709             }
3710         }
3711         SvREFCNT_dec(specials_inverse); /* done with it */
3712     } /* End of specials */
3713
3714     /* read $swash->{LIST} */
3715     l = (U8*)SvPV(*listsvp, lcur);
3716     lend = l + lcur;
3717
3718     /* Go through each input line */
3719     while (l < lend) {
3720         UV min, max, val;
3721         UV inverse;
3722         l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
3723                                          cBOOL(octets), typestr);
3724         if (l > lend) {
3725             break;
3726         }
3727
3728         /* Each element in the range is to be inverted */
3729         for (inverse = min; inverse <= max; inverse++) {
3730             AV* list;
3731             SV** listp;
3732             IV i;
3733             bool found_key = FALSE;
3734             bool found_inverse = FALSE;
3735
3736             /* The key is the inverse mapping */
3737             char key[UTF8_MAXBYTES+1];
3738             char* key_end = (char *) uvuni_to_utf8((U8*) key, val);
3739             STRLEN key_len = key_end - key;
3740
3741             /* Get the list for the map */
3742             if ((listp = hv_fetch(ret, key, key_len, FALSE))) {
3743                 list = (AV*) *listp;
3744             }
3745             else { /* No entry yet for it: create one */
3746                 list = newAV();
3747                 if (! hv_store(ret, key, key_len, (SV*) list, FALSE)) {
3748                     Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3749                 }
3750             }
3751
3752             /* Look through list to see if this inverse mapping already is
3753              * listed, or if there is a mapping to itself already */
3754             for (i = 0; i <= av_len(list); i++) {
3755                 SV** entryp = av_fetch(list, i, FALSE);
3756                 SV* entry;
3757                 if (entryp == NULL) {
3758                     Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
3759                 }
3760                 entry = *entryp;
3761                 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "list for %"UVXf" contains %"UVXf"\n", val, SvUV(entry)));*/
3762                 if (SvUV(entry) == val) {
3763                     found_key = TRUE;
3764                 }
3765                 if (SvUV(entry) == inverse) {
3766                     found_inverse = TRUE;
3767                 }
3768
3769                 /* No need to continue searching if found everything we are
3770                  * looking for */
3771                 if (found_key && found_inverse) {
3772                     break;
3773                 }
3774             }
3775
3776             /* Make sure there is a mapping to itself on the list */
3777             if (! found_key) {
3778                 av_push(list, newSVuv(val));
3779                 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Adding %"UVXf" to list for %"UVXf"\n", val, val));*/
3780             }
3781
3782
3783             /* Simply add the value to the list */
3784             if (! found_inverse) {
3785                 av_push(list, newSVuv(inverse));
3786                 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Adding %"UVXf" to list for %"UVXf"\n", inverse, val));*/
3787             }
3788
3789             /* swatch_get() increments the value of val for each element in the
3790              * range.  That makes more compact tables possible.  You can
3791              * express the capitalization, for example, of all consecutive
3792              * letters with a single line: 0061\t007A\t0041 This maps 0061 to
3793              * 0041, 0062 to 0042, etc.  I (khw) have never understood 'none',
3794              * and it's not documented; it appears to be used only in
3795              * implementing tr//; I copied the semantics from swatch_get(), just
3796              * in case */
3797             if (!none || val < none) {
3798                 ++val;
3799             }
3800         }
3801     }
3802
3803     return ret;
3804 }
3805
3806 SV*
3807 Perl__swash_to_invlist(pTHX_ SV* const swash)
3808 {
3809
3810    /* Subject to change or removal.  For use only in one place in regcomp.c */
3811
3812     U8 *l, *lend;
3813     char *loc;
3814     STRLEN lcur;
3815     HV *const hv = MUTABLE_HV(SvRV(swash));
3816     UV elements = 0;    /* Number of elements in the inversion list */
3817     U8 empty[] = "";
3818
3819     /* The string containing the main body of the table */
3820     SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
3821     SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
3822     SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
3823     SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
3824     SV** const invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
3825
3826     const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
3827     const STRLEN bits  = SvUV(*bitssvp);
3828     const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
3829     U8 *x, *xend;
3830     STRLEN xcur;
3831
3832     SV* invlist;
3833
3834     PERL_ARGS_ASSERT__SWASH_TO_INVLIST;
3835
3836     /* read $swash->{LIST} */
3837     if (SvPOK(*listsvp)) {
3838         l = (U8*)SvPV(*listsvp, lcur);
3839     }
3840     else {
3841         /* LIST legitimately doesn't contain a string during compilation phases
3842          * of Perl itself, before the Unicode tables are generated.  In this
3843          * case, just fake things up by creating an empty list */
3844         l = empty;
3845         lcur = 0;
3846     }
3847     loc = (char *) l;
3848     lend = l + lcur;
3849
3850     /* Scan the input to count the number of lines to preallocate array size
3851      * based on worst possible case, which is each line in the input creates 2
3852      * elements in the inversion list: 1) the beginning of a range in the list;
3853      * 2) the beginning of a range not in the list.  */
3854     while ((loc = (strchr(loc, '\n'))) != NULL) {
3855         elements += 2;
3856         loc++;
3857     }
3858
3859     /* If the ending is somehow corrupt and isn't a new line, add another
3860      * element for the final range that isn't in the inversion list */
3861     if (! (*lend == '\n'
3862         || (*lend == '\0' && (lcur == 0 || *(lend - 1) == '\n'))))
3863     {
3864         elements++;
3865     }
3866
3867     invlist = _new_invlist(elements);
3868
3869     /* Now go through the input again, adding each range to the list */
3870     while (l < lend) {
3871         UV start, end;
3872         UV val;         /* Not used by this function */
3873
3874         l = S_swash_scan_list_line(aTHX_ l, lend, &start, &end, &val,
3875                                          cBOOL(octets), typestr);
3876
3877         if (l > lend) {
3878             break;
3879         }
3880
3881         invlist = _add_range_to_invlist(invlist, start, end);
3882     }
3883
3884     /* Invert if the data says it should be */
3885     if (invert_it_svp && SvUV(*invert_it_svp)) {
3886         _invlist_invert_prop(invlist);
3887     }
3888
3889     /* This code is copied from swatch_get()
3890      * read $swash->{EXTRAS} */
3891     x = (U8*)SvPV(*extssvp, xcur);
3892     xend = x + xcur;
3893     while (x < xend) {
3894         STRLEN namelen;
3895         U8 *namestr;
3896         SV** othersvp;
3897         HV* otherhv;
3898         STRLEN otherbits;
3899         SV **otherbitssvp, *other;
3900         U8 *nl;
3901
3902         const U8 opc = *x++;
3903         if (opc == '\n')
3904             continue;
3905
3906         nl = (U8*)memchr(x, '\n', xend - x);
3907
3908         if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
3909             if (nl) {
3910                 x = nl + 1; /* 1 is length of "\n" */
3911                 continue;
3912             }
3913             else {
3914                 x = xend; /* to EXTRAS' end at which \n is not found */
3915                 break;
3916             }
3917         }
3918
3919         namestr = x;
3920         if (nl) {
3921             namelen = nl - namestr;
3922             x = nl + 1;
3923         }
3924         else {
3925             namelen = xend - namestr;
3926             x = xend;
3927         }
3928
3929         othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
3930         otherhv = MUTABLE_HV(SvRV(*othersvp));
3931         otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
3932         otherbits = (STRLEN)SvUV(*otherbitssvp);
3933
3934         if (bits != otherbits || bits != 1) {
3935             Perl_croak(aTHX_ "panic: _swash_to_invlist only operates on boolean "
3936                        "properties, bits=%"UVuf", otherbits=%"UVuf,
3937                        (UV)bits, (UV)otherbits);
3938         }
3939
3940         /* The "other" swatch must be destroyed after. */
3941         other = _swash_to_invlist((SV *)*othersvp);
3942
3943         /* End of code copied from swatch_get() */
3944         switch (opc) {
3945         case '+':
3946             _invlist_union(invlist, other, &invlist);
3947             break;
3948         case '!':
3949             _invlist_invert(other);
3950             _invlist_union(invlist, other, &invlist);
3951             break;
3952         case '-':
3953             _invlist_subtract(invlist, other, &invlist);
3954             break;
3955         case '&':
3956             _invlist_intersection(invlist, other, &invlist);
3957             break;
3958         default:
3959             break;
3960         }
3961         sv_free(other); /* through with it! */
3962     }
3963
3964     return invlist;
3965 }
3966
3967 /*
3968 =for apidoc uvchr_to_utf8
3969
3970 Adds the UTF-8 representation of the Native code point C<uv> to the end
3971 of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
3972 bytes available. The return value is the pointer to the byte after the
3973 end of the new character. In other words,
3974
3975     d = uvchr_to_utf8(d, uv);
3976
3977 is the recommended wide native character-aware way of saying
3978
3979     *(d++) = uv;
3980
3981 =cut
3982 */
3983
3984 /* On ASCII machines this is normally a macro but we want a
3985    real function in case XS code wants it
3986 */
3987 U8 *
3988 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
3989 {
3990     PERL_ARGS_ASSERT_UVCHR_TO_UTF8;
3991
3992     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
3993 }
3994
3995 U8 *
3996 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
3997 {
3998     PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS;
3999
4000     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
4001 }
4002
4003 /*
4004 =for apidoc utf8n_to_uvchr
4005
4006 Returns the native character value of the first character in the string
4007 C<s>
4008 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
4009 length, in bytes, of that character.
4010
4011 C<length> and C<flags> are the same as L</utf8n_to_uvuni>().
4012
4013 =cut
4014 */
4015 /* On ASCII machines this is normally a macro but we want
4016    a real function in case XS code wants it
4017 */
4018 UV
4019 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
4020 U32 flags)
4021 {
4022     const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
4023
4024     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
4025
4026     return UNI_TO_NATIVE(uv);
4027 }
4028
4029 bool
4030 Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len)
4031 {
4032     /* May change: warns if surrogates, non-character code points, or
4033      * non-Unicode code points are in s which has length len bytes.  Returns
4034      * TRUE if none found; FALSE otherwise.  The only other validity check is
4035      * to make sure that this won't exceed the string's length */
4036
4037     const U8* const e = s + len;
4038     bool ok = TRUE;
4039
4040     PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
4041
4042     while (s < e) {
4043         if (UTF8SKIP(s) > len) {
4044             Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
4045                            "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
4046             return FALSE;
4047         }
4048         if (UNLIKELY(*s >= UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE)) {
4049             STRLEN char_len;
4050             if (UTF8_IS_SUPER(s)) {
4051                 if (ckWARN_d(WARN_NON_UNICODE)) {
4052                     UV uv = utf8_to_uvchr_buf(s, e, &char_len);
4053                     Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
4054                         "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
4055                     ok = FALSE;
4056                 }
4057             }
4058             else if (UTF8_IS_SURROGATE(s)) {
4059                 if (ckWARN_d(WARN_SURROGATE)) {
4060                     UV uv = utf8_to_uvchr_buf(s, e, &char_len);
4061                     Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
4062                         "Unicode surrogate U+%04"UVXf" is illegal in UTF-8", uv);
4063                     ok = FALSE;
4064                 }
4065             }
4066             else if
4067                 ((UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s))
4068                  && (ckWARN_d(WARN_NONCHAR)))
4069             {
4070                 UV uv = utf8_to_uvchr_buf(s, e, &char_len);
4071                 Perl_warner(aTHX_ packWARN(WARN_NONCHAR),
4072                     "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv);
4073                 ok = FALSE;
4074             }
4075         }
4076         s += UTF8SKIP(s);
4077     }
4078
4079     return ok;
4080 }
4081
4082 /*
4083 =for apidoc pv_uni_display
4084
4085 Build to the scalar C<dsv> a displayable version of the string C<spv>,
4086 length C<len>, the displayable version being at most C<pvlim> bytes long
4087 (if longer, the rest is truncated and "..." will be appended).
4088
4089 The C<flags> argument can have UNI_DISPLAY_ISPRINT set to display
4090 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
4091 to display the \\[nrfta\\] as the backslashed versions (like '\n')
4092 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
4093 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
4094 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
4095
4096 The pointer to the PV of the C<dsv> is returned.
4097
4098 =cut */
4099 char *
4100 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
4101 {
4102     int truncated = 0;
4103     const char *s, *e;
4104
4105     PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
4106
4107     sv_setpvs(dsv, "");
4108     SvUTF8_off(dsv);
4109     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
4110          UV u;
4111           /* This serves double duty as a flag and a character to print after
4112              a \ when flags & UNI_DISPLAY_BACKSLASH is true.
4113           */
4114          char ok = 0;
4115
4116          if (pvlim && SvCUR(dsv) >= pvlim) {
4117               truncated++;
4118               break;
4119          }
4120          u = utf8_to_uvchr_buf((U8*)s, (U8*)e, 0);
4121          if (u < 256) {
4122              const unsigned char c = (unsigned char)u & 0xFF;
4123              if (flags & UNI_DISPLAY_BACKSLASH) {
4124                  switch (c) {
4125                  case '\n':
4126                      ok = 'n'; break;
4127                  case '\r':
4128                      ok = 'r'; break;
4129                  case '\t':
4130                      ok = 't'; break;
4131                  case '\f':
4132                      ok = 'f'; break;
4133                  case '\a':
4134                      ok = 'a'; break;
4135                  case '\\':
4136                      ok = '\\'; break;
4137                  default: break;
4138                  }
4139                  if (ok) {
4140                      const char string = ok;
4141                      sv_catpvs(dsv, "\\");
4142                      sv_catpvn(dsv, &string, 1);
4143                  }
4144              }
4145              /* isPRINT() is the locale-blind version. */
4146              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
4147                  const char string = c;
4148                  sv_catpvn(dsv, &string, 1);
4149                  ok = 1;
4150              }
4151          }
4152          if (!ok)
4153              Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
4154     }
4155     if (truncated)
4156          sv_catpvs(dsv, "...");
4157
4158     return SvPVX(dsv);
4159 }
4160
4161 /*
4162 =for apidoc sv_uni_display
4163
4164 Build to the scalar C<dsv> a displayable version of the scalar C<sv>,
4165 the displayable version being at most C<pvlim> bytes long
4166 (if longer, the rest is truncated and "..." will be appended).
4167
4168 The C<flags> argument is as in L</pv_uni_display>().
4169
4170 The pointer to the PV of the C<dsv> is returned.
4171
4172 =cut
4173 */
4174 char *
4175 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
4176 {
4177     PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
4178
4179      return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
4180                                 SvCUR(ssv), pvlim, flags);
4181 }
4182
4183 /*
4184 =for apidoc foldEQ_utf8
4185
4186 Returns true if the leading portions of the strings C<s1> and C<s2> (either or both
4187 of which may be in UTF-8) are the same case-insensitively; false otherwise.
4188 How far into the strings to compare is determined by other input parameters.
4189
4190 If C<u1> is true, the string C<s1> is assumed to be in UTF-8-encoded Unicode;
4191 otherwise it is assumed to be in native 8-bit encoding.  Correspondingly for C<u2>
4192 with respect to C<s2>.
4193
4194 If the byte length C<l1> is non-zero, it says how far into C<s1> to check for fold
4195 equality.  In other words, C<s1>+C<l1> will be used as a goal to reach.  The
4196 scan will not be considered to be a match unless the goal is reached, and
4197 scanning won't continue past that goal.  Correspondingly for C<l2> with respect to
4198 C<s2>.
4199
4200 If C<pe1> is non-NULL and the pointer it points to is not NULL, that pointer is
4201 considered an end pointer beyond which scanning of C<s1> will not continue under
4202 any circumstances.  This means that if both C<l1> and C<pe1> are specified, and
4203 C<pe1>
4204 is less than C<s1>+C<l1>, the match will never be successful because it can
4205 never
4206 get as far as its goal (and in fact is asserted against).  Correspondingly for
4207 C<pe2> with respect to C<s2>.
4208
4209 At least one of C<s1> and C<s2> must have a goal (at least one of C<l1> and
4210 C<l2> must be non-zero), and if both do, both have to be
4211 reached for a successful match.   Also, if the fold of a character is multiple
4212 characters, all of them must be matched (see tr21 reference below for
4213 'folding').
4214
4215 Upon a successful match, if C<pe1> is non-NULL,
4216 it will be set to point to the beginning of the I<next> character of C<s1>
4217 beyond what was matched.  Correspondingly for C<pe2> and C<s2>.
4218
4219 For case-insensitiveness, the "casefolding" of Unicode is used
4220 instead of upper/lowercasing both the characters, see
4221 L<http://www.unicode.org/unicode/reports/tr21/> (Case Mappings).
4222
4223 =cut */
4224
4225 /* A flags parameter has been added which may change, and hence isn't
4226  * externally documented.  Currently it is:
4227  *  0 for as-documented above
4228  *  FOLDEQ_UTF8_NOMIX_ASCII meaning that if a non-ASCII character folds to an
4229                             ASCII one, to not match
4230  *  FOLDEQ_UTF8_LOCALE      meaning that locale rules are to be used for code
4231  *                          points below 256; unicode rules for above 255; and
4232  *                          folds that cross those boundaries are disallowed,
4233  *                          like the NOMIX_ASCII option
4234  *  FOLDEQ_S1_ALREADY_FOLDED s1 has already been folded before calling this
4235  *                           routine.  This allows that step to be skipped.
4236  *  FOLDEQ_S2_ALREADY_FOLDED   Similarly.
4237  */
4238 I32
4239 Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2, U32 flags)
4240 {
4241     dVAR;
4242     register const U8 *p1  = (const U8*)s1; /* Point to current char */
4243     register const U8 *p2  = (const U8*)s2;
4244     register const U8 *g1 = NULL;       /* goal for s1 */
4245     register const U8 *g2 = NULL;
4246     register const U8 *e1 = NULL;       /* Don't scan s1 past this */
4247     register U8 *f1 = NULL;             /* Point to current folded */
4248     register const U8 *e2 = NULL;
4249     register U8 *f2 = NULL;
4250     STRLEN n1 = 0, n2 = 0;              /* Number of bytes in current char */
4251     U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
4252     U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
4253
4254     PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
4255
4256     /* The algorithm requires that input with the flags on the first line of
4257      * the assert not be pre-folded. */
4258     assert( ! ((flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_UTF8_LOCALE))
4259         && (flags & (FOLDEQ_S1_ALREADY_FOLDED | FOLDEQ_S2_ALREADY_FOLDED))));
4260
4261     if (pe1) {
4262         e1 = *(U8**)pe1;
4263     }
4264
4265     if (l1) {
4266         g1 = (const U8*)s1 + l1;
4267     }
4268
4269     if (pe2) {
4270         e2 = *(U8**)pe2;
4271     }
4272
4273     if (l2) {
4274         g2 = (const U8*)s2 + l2;
4275     }
4276
4277     /* Must have at least one goal */
4278     assert(g1 || g2);
4279
4280     if (g1) {
4281
4282         /* Will never match if goal is out-of-bounds */
4283         assert(! e1  || e1 >= g1);
4284
4285         /* Here, there isn't an end pointer, or it is beyond the goal.  We
4286         * only go as far as the goal */
4287         e1 = g1;
4288     }
4289     else {
4290         assert(e1);    /* Must have an end for looking at s1 */
4291     }
4292
4293     /* Same for goal for s2 */
4294     if (g2) {
4295         assert(! e2  || e2 >= g2);
4296         e2 = g2;
4297     }
4298     else {
4299         assert(e2);
4300     }
4301
4302     /* If both operands are already folded, we could just do a memEQ on the
4303      * whole strings at once, but it would be better if the caller realized
4304      * this and didn't even call us */
4305
4306     /* Look through both strings, a character at a time */
4307     while (p1 < e1 && p2 < e2) {
4308
4309         /* If at the beginning of a new character in s1, get its fold to use
4310          * and the length of the fold.  (exception: locale rules just get the
4311          * character to a single byte) */
4312         if (n1 == 0) {
4313             if (flags & FOLDEQ_S1_ALREADY_FOLDED) {
4314                 f1 = (U8 *) p1;
4315                 n1 = UTF8SKIP(f1);
4316             }
4317
4318             else {
4319                 /* If in locale matching, we use two sets of rules, depending
4320                  * on if the code point is above or below 255.  Here, we test
4321                  * for and handle locale rules */
4322                 if ((flags & FOLDEQ_UTF8_LOCALE)
4323                     && (! u1 || UTF8_IS_INVARIANT(*p1)
4324                         || UTF8_IS_DOWNGRADEABLE_START(*p1)))
4325                 {
4326                     /* There is no mixing of code points above and below 255. */
4327                     if (u2 && (! UTF8_IS_INVARIANT(*p2)
4328                         && ! UTF8_IS_DOWNGRADEABLE_START(*p2)))
4329                     {
4330                         return 0;
4331                     }
4332
4333                     /* We handle locale rules by converting, if necessary, the
4334                      * code point to a single byte. */
4335                     if (! u1 || UTF8_IS_INVARIANT(*p1)) {
4336                         *foldbuf1 = *p1;
4337                     }
4338                     else {
4339                         *foldbuf1 = TWO_BYTE_UTF8_TO_UNI(*p1, *(p1 + 1));
4340                     }
4341                     n1 = 1;
4342                 }
4343                 else if (isASCII(*p1)) {    /* Note, that here won't be both
4344                                                ASCII and using locale rules */
4345
4346                     /* If trying to mix non- with ASCII, and not supposed to,
4347                      * fail */
4348                     if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) {
4349                         return 0;
4350                     }
4351                     n1 = 1;
4352                     *foldbuf1 = toLOWER(*p1);   /* Folds in the ASCII range are
4353                                                    just lowercased */
4354                 }
4355                 else if (u1) {
4356                     to_utf8_fold(p1, foldbuf1, &n1);
4357                 }
4358                 else {  /* Not utf8, get utf8 fold */
4359                     to_uni_fold(NATIVE_TO_UNI(*p1), foldbuf1, &n1);
4360                 }
4361                 f1 = foldbuf1;
4362             }
4363         }
4364
4365         if (n2 == 0) {    /* Same for s2 */
4366             if (flags & FOLDEQ_S2_ALREADY_FOLDED) {
4367                 f2 = (U8 *) p2;
4368                 n2 = UTF8SKIP(f2);
4369             }
4370             else {
4371                 if ((flags & FOLDEQ_UTF8_LOCALE)
4372                     && (! u2 || UTF8_IS_INVARIANT(*p2) || UTF8_IS_DOWNGRADEABLE_START(*p2)))
4373                 {
4374                     /* Here, the next char in s2 is < 256.  We've already
4375                      * worked on s1, and if it isn't also < 256, can't match */
4376                     if (u1 && (! UTF8_IS_INVARIANT(*p1)
4377                         && ! UTF8_IS_DOWNGRADEABLE_START(*p1)))
4378                     {
4379                         return 0;
4380                     }
4381                     if (! u2 || UTF8_IS_INVARIANT(*p2)) {
4382                         *foldbuf2 = *p2;
4383                     }
4384                     else {
4385                         *foldbuf2 = TWO_BYTE_UTF8_TO_UNI(*p2, *(p2 + 1));
4386                     }
4387
4388                     /* Use another function to handle locale rules.  We've made
4389                      * sure that both characters to compare are single bytes */
4390                     if (! foldEQ_locale((char *) f1, (char *) foldbuf2, 1)) {
4391                         return 0;
4392                     }
4393                     n1 = n2 = 0;
4394                 }
4395                 else if (isASCII(*p2)) {
4396                     if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p1)) {
4397                         return 0;
4398                     }
4399                     n2 = 1;
4400                     *foldbuf2 = toLOWER(*p2);
4401                 }
4402                 else if (u2) {
4403                     to_utf8_fold(p2, foldbuf2, &n2);
4404                 }
4405                 else {
4406                     to_uni_fold(NATIVE_TO_UNI(*p2), foldbuf2, &n2);
4407                 }
4408                 f2 = foldbuf2;
4409             }
4410         }
4411
4412         /* Here f1 and f2 point to the beginning of the strings to compare.
4413          * These strings are the folds of the next character from each input
4414          * string, stored in utf8. */
4415
4416         /* While there is more to look for in both folds, see if they
4417         * continue to match */
4418         while (n1 && n2) {
4419             U8 fold_length = UTF8SKIP(f1);
4420             if (fold_length != UTF8SKIP(f2)
4421                 || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE
4422                                                        function call for single
4423                                                        byte */
4424                 || memNE((char*)f1, (char*)f2, fold_length))
4425             {
4426                 return 0; /* mismatch */
4427             }
4428
4429             /* Here, they matched, advance past them */
4430             n1 -= fold_length;
4431             f1 += fold_length;
4432             n2 -= fold_length;
4433             f2 += fold_length;
4434         }
4435
4436         /* When reach the end of any fold, advance the input past it */
4437         if (n1 == 0) {
4438             p1 += u1 ? UTF8SKIP(p1) : 1;
4439         }
4440         if (n2 == 0) {
4441             p2 += u2 ? UTF8SKIP(p2) : 1;
4442         }
4443     } /* End of loop through both strings */
4444
4445     /* A match is defined by each scan that specified an explicit length
4446     * reaching its final goal, and the other not having matched a partial
4447     * character (which can happen when the fold of a character is more than one
4448     * character). */
4449     if (! ((g1 == 0 || p1 == g1) && (g2 == 0 || p2 == g2)) || n1 || n2) {
4450         return 0;
4451     }
4452
4453     /* Successful match.  Set output pointers */
4454     if (pe1) {
4455         *pe1 = (char*)p1;
4456     }
4457     if (pe2) {
4458         *pe2 = (char*)p2;
4459     }
4460     return 1;
4461 }
4462
4463 /*
4464  * Local variables:
4465  * c-indentation-style: bsd
4466  * c-basic-offset: 4
4467  * indent-tabs-mode: t
4468  * End:
4469  *
4470  * ex: set ts=8 sts=4 sw=4 noet:
4471  */