utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
   4  *    by Larry Wall and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  * 'What a fix!' said Sam.  'That's the one place in all the lands we've ever
  13  *  heard of that we don't want to see any closer; and that's the one place
  14  *  we're trying to get to!  And that's just where we can't get, nohow.'
  15  *
  16  *     [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
  17  *
  18  * 'Well do I understand your speech,' he answered in the same language;
  19  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  20  *  as is the custom in the West, if you wish to be answered?'
  21  *                           --Gandalf, addressing Théoden's door wardens
  22  *
  23  *     [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
  24  *
  25  * ...the travellers perceived that the floor was paved with stones of many
  26  * hues; branching runes and strange devices intertwined beneath their feet.
  27  *
  28  *     [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
  29  */
  30
  31 #include "EXTERN.h"
  32 #define PERL_IN_UTF8_C
  33 #include "perl.h"
  34 #include "invlist_inline.h"
  35
  36 static const char malformed_text[] = "Malformed UTF-8 character";
  37 static const char unees[] =
  38                         "Malformed UTF-8 character (unexpected end of string)";
  39
  40 /* strlen() of a literal string constant.  We might want this more general,
  41  * but using it in just this file for now.  A problem with more generality is
  42  * the compiler warnings about comparing unlike signs */
  43 #define STRLENs(s)  (sizeof("" s "") - 1)
  44
  45 /*
  46 These are various utility functions for manipulating UTF8-encoded
  47 strings.  For the uninitiated, this is a method of representing arbitrary
  48 Unicode characters as a variable number of bytes, in such a way that
  49 characters in the ASCII range are unmodified, and a zero byte never appears
  50 within non-zero characters.
  51 */
  52
  53 /* helper for Perl__force_out_malformed_utf8_message(). Like
  54  * SAVECOMPILEWARNINGS(), but works with PL_curcop rather than
  55  * PL_compiling */
  56
  57 static void
  58 S_restore_cop_warnings(pTHX_ void *p)
  59 {
  60     free_and_set_cop_warnings(PL_curcop, (STRLEN*) p);
  61 }
  62
  63
  64 void
  65 Perl__force_out_malformed_utf8_message(pTHX_
  66             const U8 *const p,      /* First byte in UTF-8 sequence */
  67             const U8 * const e,     /* Final byte in sequence (may include
  68                                        multiple chars */
  69             const U32 flags,        /* Flags to pass to utf8n_to_uvchr(),
  70                                        usually 0, or some DISALLOW flags */
  71             const bool die_here)    /* If TRUE, this function does not return */
  72 {
  73     /* This core-only function is to be called when a malformed UTF-8 character
  74      * is found, in order to output the detailed information about the
  75      * malformation before dieing.  The reason it exists is for the occasions
  76      * when such a malformation is fatal, but warnings might be turned off, so
  77      * that normally they would not be actually output.  This ensures that they
  78      * do get output.  Because a sequence may be malformed in more than one
  79      * way, multiple messages may be generated, so we can't make them fatal, as
  80      * that would cause the first one to die.
  81      *
  82      * Instead we pretend -W was passed to perl, then die afterwards.  The
  83      * flexibility is here to return to the caller so they can finish up and
  84      * die themselves */
  85     U32 errors;
  86
  87     PERL_ARGS_ASSERT__FORCE_OUT_MALFORMED_UTF8_MESSAGE;
  88
  89     ENTER;
  90     SAVEI8(PL_dowarn);
  91     SAVESPTR(PL_curcop);
  92
  93     PL_dowarn = G_WARN_ALL_ON|G_WARN_ON;
  94     if (PL_curcop) {
  95         /* this is like SAVECOMPILEWARNINGS() except with PL_curcop rather
  96          * than PL_compiling */
  97         SAVEDESTRUCTOR_X(S_restore_cop_warnings,
  98                 (void*)PL_curcop->cop_warnings);
  99         PL_curcop->cop_warnings = pWARN_ALL;
 100     }
 101
 102     (void) utf8n_to_uvchr_error(p, e - p, NULL, flags & ~UTF8_CHECK_ONLY, &errors);
 103
 104     LEAVE;
 105
 106     if (! errors) {
 107         Perl_croak(aTHX_ "panic: _force_out_malformed_utf8_message should"
 108                          " be called only when there are errors found");
 109     }
 110
 111     if (die_here) {
 112         Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
 113     }
 114 }
 115
 116 STATIC HV *
 117 S_new_msg_hv(pTHX_ const char * const message, /* The message text */
 118                    U32 categories,  /* Packed warning categories */
 119                    U32 flag)        /* Flag associated with this message */
 120 {
 121     /* Creates, populates, and returns an HV* that describes an error message
 122      * for the translators between UTF8 and code point */
 123
 124     SV* msg_sv = newSVpv(message, 0);
 125     SV* category_sv = newSVuv(categories);
 126     SV* flag_bit_sv = newSVuv(flag);
 127
 128     HV* msg_hv = newHV();
 129
 130     PERL_ARGS_ASSERT_NEW_MSG_HV;
 131
 132     (void) hv_stores(msg_hv, "text", msg_sv);
 133     (void) hv_stores(msg_hv, "warn_categories",  category_sv);
 134     (void) hv_stores(msg_hv, "flag_bit", flag_bit_sv);
 135
 136     return msg_hv;
 137 }
 138
 139 /*
 140 =for apidoc uvoffuni_to_utf8_flags
 141
 142 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
 143 Instead, B<Almost all code should use L<perlapi/uvchr_to_utf8> or
 144 L<perlapi/uvchr_to_utf8_flags>>.
 145
 146 This function is like them, but the input is a strict Unicode
 147 (as opposed to native) code point.  Only in very rare circumstances should code
 148 not be using the native code point.
 149
 150 For details, see the description for L<perlapi/uvchr_to_utf8_flags>.
 151
 152 =cut
 153 */
 154
 155 U8 *
 156 Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
 157 {
 158     PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
 159
 160     return uvoffuni_to_utf8_flags_msgs(d, uv, flags, NULL);
 161 }
 162
 163 /* All these formats take a single UV code point argument */
 164 const char surrogate_cp_format[] = "UTF-16 surrogate U+%04" UVXf;
 165 const char nonchar_cp_format[]   = "Unicode non-character U+%04" UVXf
 166                                    " is not recommended for open interchange";
 167 const char super_cp_format[]     = "Code point 0x%" UVXf " is not Unicode,"
 168                                    " may not be portable";
 169
 170 /*  Use shorter names internally in this file */
 171 #define SHIFT   UTF_ACCUMULATION_SHIFT
 172 #undef  MARK
 173 #define MARK    UTF_CONTINUATION_MARK
 174 #define MASK    UTF_CONTINUATION_MASK
 175
 176 /*
 177 =for apidoc uvchr_to_utf8_flags_msgs
 178
 179 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
 180
 181 Most code should use C<L</uvchr_to_utf8_flags>()> rather than call this directly.
 182
 183 This function is for code that wants any warning and/or error messages to be
 184 returned to the caller rather than be displayed.  All messages that would have
 185 been displayed if all lexical warnings are enabled will be returned.
 186
 187 It is just like C<L</uvchr_to_utf8_flags>> but it takes an extra parameter
 188 placed after all the others, C<msgs>.  If this parameter is 0, this function
 189 behaves identically to C<L</uvchr_to_utf8_flags>>.  Otherwise, C<msgs> should
 190 be a pointer to an C<HV *> variable, in which this function creates a new HV to
 191 contain any appropriate messages.  The hash has three key-value pairs, as
 192 follows:
 193
 194 =over 4
 195
 196 =item C<text>
 197
 198 The text of the message as a C<SVpv>.
 199
 200 =item C<warn_categories>
 201
 202 The warning category (or categories) packed into a C<SVuv>.
 203
 204 =item C<flag>
 205
 206 A single flag bit associated with this message, in a C<SVuv>.
 207 The bit corresponds to some bit in the C<*errors> return value,
 208 such as C<UNICODE_GOT_SURROGATE>.
 209
 210 =back
 211
 212 It's important to note that specifying this parameter as non-null will cause
 213 any warnings this function would otherwise generate to be suppressed, and
 214 instead be placed in C<*msgs>.  The caller can check the lexical warnings state
 215 (or not) when choosing what to do with the returned messages.
 216
 217 The caller, of course, is responsible for freeing any returned HV.
 218
 219 =cut
 220 */
 221
 222 /* Undocumented; we don't want people using this.  Instead they should use
 223  * uvchr_to_utf8_flags_msgs() */
 224 U8 *
 225 Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV input_uv, UV flags, HV** msgs)
 226 {
 227     U8 *p;
 228     UV shifted_uv = input_uv;
 229     STRLEN utf8_skip = OFFUNISKIP(input_uv);
 230
 231     PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS_MSGS;
 232
 233     if (msgs) {
 234         *msgs = NULL;
 235     }
 236
 237     switch (utf8_skip) {
 238       case 1:
 239         *d++ = LATIN1_TO_NATIVE(input_uv);
 240         return d;
 241
 242       default:
 243         if (   UNLIKELY(input_uv > MAX_LEGAL_CP
 244             && UNLIKELY(! (flags & UNICODE_ALLOW_ABOVE_IV_MAX))))
 245         {
 246             Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, /* Hex output */
 247                                                          NULL, 0, input_uv));
 248         }
 249
 250         if ((flags & (UNICODE_WARN_PERL_EXTENDED|UNICODE_WARN_SUPER))) {
 251             U32 category = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE);
 252             const char * format = PL_extended_cp_format;
 253             if (msgs) {
 254                 *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
 255                                    category,
 256                                    UNICODE_GOT_PERL_EXTENDED);
 257             }
 258             else {
 259                 Perl_ck_warner_d(aTHX_ category, format, input_uv);
 260             }
 261
 262             /* Don't output a 2nd msg */
 263             flags &= ~UNICODE_WARN_SUPER;
 264         }
 265
 266         if (flags & UNICODE_DISALLOW_PERL_EXTENDED) {
 267             return NULL;
 268         }
 269
 270         p = d + utf8_skip - 1;
 271         while (p >= d + 6 + ONE_IF_EBCDIC_ZERO_IF_NOT) {
 272             *p-- = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
 273             shifted_uv >>= SHIFT;
 274         }
 275
 276         /* FALLTHROUGH */
 277
 278       case 6 + ONE_IF_EBCDIC_ZERO_IF_NOT:
 279         d[5 + ONE_IF_EBCDIC_ZERO_IF_NOT]
 280                                 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
 281         shifted_uv >>= SHIFT;
 282         /* FALLTHROUGH */
 283
 284       case 5 + ONE_IF_EBCDIC_ZERO_IF_NOT:
 285         d[4 + ONE_IF_EBCDIC_ZERO_IF_NOT]
 286                                 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
 287         shifted_uv >>= SHIFT;
 288         /* FALLTHROUGH */
 289
 290       case 4 + ONE_IF_EBCDIC_ZERO_IF_NOT:
 291         if (UNLIKELY(UNICODE_IS_SUPER(input_uv))) {
 292             if (flags & UNICODE_WARN_SUPER) {
 293                 U32 category = packWARN(WARN_NON_UNICODE);
 294                 const char * format = super_cp_format;
 295
 296                 if (msgs) {
 297                     *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
 298                                        category,
 299                                        UNICODE_GOT_SUPER);
 300                 }
 301                 else {
 302                     Perl_ck_warner_d(aTHX_ category, format, input_uv);
 303                 }
 304
 305                 if (flags & UNICODE_DISALLOW_SUPER) {
 306                     return NULL;
 307                 }
 308             }
 309             if (       (flags & UNICODE_DISALLOW_SUPER)
 310                 || (   (flags & UNICODE_DISALLOW_PERL_EXTENDED)
 311                     &&  UNICODE_IS_PERL_EXTENDED(input_uv)))
 312             {
 313                 return NULL;
 314             }
 315         }
 316
 317         d[3 + ONE_IF_EBCDIC_ZERO_IF_NOT]
 318                                 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
 319         shifted_uv >>= SHIFT;
 320         /* FALLTHROUGH */
 321
 322       case 3 + ONE_IF_EBCDIC_ZERO_IF_NOT:
 323         if (isUNICODE_POSSIBLY_PROBLEMATIC(input_uv)) {
 324             if (UNLIKELY(UNICODE_IS_NONCHAR(input_uv))) {
 325                 if (flags & UNICODE_WARN_NONCHAR) {
 326                     U32 category = packWARN(WARN_NONCHAR);
 327                     const char * format = nonchar_cp_format;
 328                     if (msgs) {
 329                         *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
 330                                            category,
 331                                            UNICODE_GOT_NONCHAR);
 332                     }
 333                     else {
 334                         Perl_ck_warner_d(aTHX_ category, format, input_uv);
 335                     }
 336                 }
 337                 if (flags & UNICODE_DISALLOW_NONCHAR) {
 338                     return NULL;
 339                 }
 340             }
 341             else if (UNLIKELY(UNICODE_IS_SURROGATE(input_uv))) {
 342                 if (flags & UNICODE_WARN_SURROGATE) {
 343                     U32 category = packWARN(WARN_SURROGATE);
 344                     const char * format = surrogate_cp_format;
 345                     if (msgs) {
 346                         *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
 347                                            category,
 348                                            UNICODE_GOT_SURROGATE);
 349                     }
 350                     else {
 351                         Perl_ck_warner_d(aTHX_ category, format, input_uv);
 352                     }
 353                 }
 354                 if (flags & UNICODE_DISALLOW_SURROGATE) {
 355                     return NULL;
 356                 }
 357             }
 358         }
 359
 360         d[2 + ONE_IF_EBCDIC_ZERO_IF_NOT]
 361                                 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
 362         shifted_uv >>= SHIFT;
 363         /* FALLTHROUGH */
 364
 365 #ifdef EBCDIC
 366
 367       case 3:
 368         d[2] = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
 369         shifted_uv >>= SHIFT;
 370         /* FALLTHROUGH */
 371
 372 #endif
 373
 374         /* FALLTHROUGH */
 375       case 2:
 376         d[1] = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
 377         shifted_uv >>= SHIFT;
 378         d[0] = I8_TO_NATIVE_UTF8((shifted_uv & UTF_START_MASK(utf8_skip))
 379                                              | UTF_START_MARK(utf8_skip));
 380         break;
 381     }
 382
 383     return d + utf8_skip;
 384 }
 385
 386 /*
 387 =for apidoc uvchr_to_utf8
 388
 389 Adds the UTF-8 representation of the native code point C<uv> to the end
 390 of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
 391 C<UTF8_MAXBYTES+1>) free bytes available.  The return value is the pointer to
 392 the byte after the end of the new character.  In other words,
 393
 394     d = uvchr_to_utf8(d, uv);
 395
 396 is the recommended wide native character-aware way of saying
 397
 398     *(d++) = uv;
 399
 400 This function accepts any code point from 0..C<IV_MAX> as input.
 401 C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
 402
 403 It is possible to forbid or warn on non-Unicode code points, or those that may
 404 be problematic by using L</uvchr_to_utf8_flags>.
 405
 406 =cut
 407 */
 408
 409 /* This is also a macro */
 410 PERL_CALLCONV U8*       Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
 411
 412 U8 *
 413 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
 414 {
 415     return uvchr_to_utf8(d, uv);
 416 }
 417
 418 /*
 419 =for apidoc uvchr_to_utf8_flags
 420
 421 Adds the UTF-8 representation of the native code point C<uv> to the end
 422 of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
 423 C<UTF8_MAXBYTES+1>) free bytes available.  The return value is the pointer to
 424 the byte after the end of the new character.  In other words,
 425
 426     d = uvchr_to_utf8_flags(d, uv, flags);
 427
 428 or, in most cases,
 429
 430     d = uvchr_to_utf8_flags(d, uv, 0);
 431
 432 This is the Unicode-aware way of saying
 433
 434     *(d++) = uv;
 435
 436 If C<flags> is 0, this function accepts any code point from 0..C<IV_MAX> as
 437 input.  C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
 438
 439 Specifying C<flags> can further restrict what is allowed and not warned on, as
 440 follows:
 441
 442 If C<uv> is a Unicode surrogate code point and C<UNICODE_WARN_SURROGATE> is set,
 443 the function will raise a warning, provided UTF8 warnings are enabled.  If
 444 instead C<UNICODE_DISALLOW_SURROGATE> is set, the function will fail and return
 445 NULL.  If both flags are set, the function will both warn and return NULL.
 446
 447 Similarly, the C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags
 448 affect how the function handles a Unicode non-character.
 449
 450 And likewise, the C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags
 451 affect the handling of code points that are above the Unicode maximum of
 452 0x10FFFF.  Languages other than Perl may not be able to accept files that
 453 contain these.
 454
 455 The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of
 456 the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all
 457 three DISALLOW flags.  C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> restricts the
 458 allowed inputs to the strict UTF-8 traditionally defined by Unicode.
 459 Similarly, C<UNICODE_WARN_ILLEGAL_C9_INTERCHANGE> and
 460 C<UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE> are shortcuts to select the
 461 above-Unicode and surrogate flags, but not the non-character ones, as
 462 defined in
 463 L<Unicode Corrigendum #9|https://www.unicode.org/versions/corrigendum9.html>.
 464 See L<perlunicode/Noncharacter code points>.
 465
 466 Extremely high code points were never specified in any standard, and require an
 467 extension to UTF-8 to express, which Perl does.  It is likely that programs
 468 written in something other than Perl would not be able to read files that
 469 contain these; nor would Perl understand files written by something that uses a
 470 different extension.  For these reasons, there is a separate set of flags that
 471 can warn and/or disallow these extremely high code points, even if other
 472 above-Unicode ones are accepted.  They are the C<UNICODE_WARN_PERL_EXTENDED>
 473 and C<UNICODE_DISALLOW_PERL_EXTENDED> flags.  For more information see
 474 C<L</UTF8_GOT_PERL_EXTENDED>>.  Of course C<UNICODE_DISALLOW_SUPER> will
 475 treat all above-Unicode code points, including these, as malformations.  (Note
 476 that the Unicode standard considers anything above 0x10FFFF to be illegal, but
 477 there are standards predating it that allow up to 0x7FFF_FFFF (2**31 -1))
 478
 479 A somewhat misleadingly named synonym for C<UNICODE_WARN_PERL_EXTENDED> is
 480 retained for backward compatibility: C<UNICODE_WARN_ABOVE_31_BIT>.  Similarly,
 481 C<UNICODE_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
 482 C<UNICODE_DISALLOW_PERL_EXTENDED>.  The names are misleading because on EBCDIC
 483 platforms,these flags can apply to code points that actually do fit in 31 bits.
 484 The new names accurately describe the situation in all cases.
 485
 486 =for apidoc Amnh||UNICODE_DISALLOW_ABOVE_31_BIT
 487 =for apidoc Amnh||UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE
 488 =for apidoc Amnh||UNICODE_DISALLOW_ILLEGAL_INTERCHANGE
 489 =for apidoc Amnh||UNICODE_DISALLOW_NONCHAR
 490 =for apidoc Amnh||UNICODE_DISALLOW_PERL_EXTENDED
 491 =for apidoc Amnh||UNICODE_DISALLOW_SUPER
 492 =for apidoc Amnh||UNICODE_DISALLOW_SURROGATE
 493 =for apidoc Amnh||UNICODE_WARN_ABOVE_31_BIT
 494 =for apidoc Amnh||UNICODE_WARN_ILLEGAL_C9_INTERCHANGE
 495 =for apidoc Amnh||UNICODE_WARN_ILLEGAL_INTERCHANGE
 496 =for apidoc Amnh||UNICODE_WARN_NONCHAR
 497 =for apidoc Amnh||UNICODE_WARN_PERL_EXTENDED
 498 =for apidoc Amnh||UNICODE_WARN_SUPER
 499 =for apidoc Amnh||UNICODE_WARN_SURROGATE
 500
 501 =cut
 502 */
 503
 504 /* This is also a macro */
 505 PERL_CALLCONV U8*       Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
 506
 507 U8 *
 508 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
 509 {
 510     return uvchr_to_utf8_flags(d, uv, flags);
 511 }
 512
 513 PERL_STATIC_INLINE int
 514 S_is_utf8_overlong(const U8 * const s, const STRLEN len)
 515 {
 516     /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
 517      * 's' + 'len' - 1 is an overlong.  It returns 1 if it is an overlong; 0 if
 518      * it isn't, and -1 if there isn't enough information to tell.  This last
 519      * return value can happen if the sequence is incomplete, missing some
 520      * trailing bytes that would form a complete character.  If there are
 521      * enough bytes to make a definitive decision, this function does so.
 522      * Usually 2 bytes are sufficient.
 523      *
 524      * Overlongs can occur whenever the number of continuation bytes changes.
 525      * That means whenever the number of leading 1 bits in a start byte
 526      * increases from the next lower start byte.  That happens for start bytes
 527      * C0, E0, F0, F8, FC, FE, and FF.
 528      */
 529
 530     PERL_ARGS_ASSERT_IS_UTF8_OVERLONG;
 531
 532     /* Each platform has overlongs after the start bytes given above (expressed
 533      * in I8 for EBCDIC).  The values below were found by manually inspecting
 534      * the UTF-8 patterns.  See the tables in utf8.h and utfebcdic.h. */
 535
 536     switch (NATIVE_UTF8_TO_I8(s[0])) {
 537       default:
 538         assert(UTF8_IS_START(s[0]));
 539         return 0;
 540
 541       case 0xC0:
 542       case 0xC1:
 543         return 1;
 544
 545 #ifdef EBCDIC
 546
 547       case 0xC2:
 548       case 0xC3:
 549       case 0xC4:
 550       case 0xE0:
 551         return 1;
 552 #else
 553       case 0xE0:
 554         return (len < 2) ? -1 : s[1] < 0xA0;
 555 #endif
 556
 557       case 0xF0:
 558         return (len < 2)
 559                ? -1
 560                : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x10;
 561       case 0xF8:
 562         return (len < 2)
 563                ? -1
 564                : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x08;
 565       case 0xFC:
 566         return (len < 2)
 567                ? -1
 568                : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x04;
 569       case 0xFE:
 570         return (len < 2)
 571                ? -1
 572                : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x02;
 573       case 0xFF:
 574         return isFF_overlong(s, len);
 575     }
 576 }
 577
 578 PERL_STATIC_INLINE int
 579 S_isFF_overlong(const U8 * const s, const STRLEN len)
 580 {
 581     /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
 582      * 'e' - 1 is an overlong beginning with \xFF.  It returns 1 if it is; 0 if
 583      * it isn't, and -1 if there isn't enough information to tell.  This last
 584      * return value can happen if the sequence is incomplete, missing some
 585      * trailing bytes that would form a complete character.  If there are
 586      * enough bytes to make a definitive decision, this function does so. */
 587
 588     PERL_ARGS_ASSERT_ISFF_OVERLONG;
 589
 590 #ifdef EBCDIC
 591     /* This works on all three EBCDIC code pages traditionally supported by
 592      * perl */
 593 #  define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
 594 #else
 595 #  define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
 596 #endif
 597
 598     /* To be an FF overlong, all the available bytes must match */
 599     if (LIKELY(memNE(s, FF_OVERLONG_PREFIX,
 600                      MIN(len, STRLENs(FF_OVERLONG_PREFIX)))))
 601     {
 602         return 0;
 603     }
 604
 605     /* To be an FF overlong sequence, all the bytes in FF_OVERLONG_PREFIX must
 606      * be there; what comes after them doesn't matter.  See tables in utf8.h,
 607      * utfebcdic.h. */
 608     if (len >= STRLENs(FF_OVERLONG_PREFIX)) {
 609         return 1;
 610     }
 611
 612     /* The missing bytes could cause the result to go one way or the other, so
 613      * the result is indeterminate */
 614     return -1;
 615 }
 616
 617 /* At some point we may want to allow core to use up to UV_MAX */
 618
 619 #ifdef EBCDIC     /* Actually is I8 */
 620 #  if defined(UV_IS_QUAD) /* These assume IV_MAX is 2**63-1, UV_MAX 2**64-1 */
 621 #    define HIGHEST_REPRESENTABLE_UTF  "\xFF\xA7"
 622                               /* UV_MAX "\xFF\xAF" */
 623 #  else      /* These assume IV_MAX is 2**31-1, UV_MAX 2**32-1 */
 624 #    define HIGHEST_REPRESENTABLE_UTF  "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1"
 625                               /* UV_MAX "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3" */
 626 #  endif
 627 #else
 628 #  if defined(UV_IS_QUAD)
 629 #    define HIGHEST_REPRESENTABLE_UTF  "\xFF\x80\x87"
 630                               /* UV_MAX "\xFF\x80" */
 631 #  else
 632 #    define HIGHEST_REPRESENTABLE_UTF  "\xFD"
 633                               /* UV_MAX "\xFE\x83" */
 634 #  endif
 635 #endif
 636
 637 PERL_STATIC_INLINE int
 638 S_does_utf8_overflow(const U8 * const s,
 639                      const U8 * e,
 640                      const bool consider_overlongs)
 641 {
 642     /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
 643      * 'e' - 1 would overflow an IV on this platform; that is if it represents
 644      * a code point larger than the highest representable code point.  It
 645      * returns 1 if it does overflow; 0 if it doesn't, and -1 if there isn't
 646      * enough information to tell.  This last return value can happen if the
 647      * sequence is incomplete, missing some trailing bytes that would form a
 648      * complete character.  If there are enough bytes to make a definitive
 649      * decision, this function does so.
 650      *
 651      * If 'consider_overlongs' is TRUE, the function checks for the possibility
 652      * that the sequence is an overlong that doesn't overflow.  Otherwise, it
 653      * assumes the sequence is not an overlong.  This can give different
 654      * results only on ASCII 32-bit platforms.
 655      *
 656      * (For ASCII platforms, we could use memcmp() because we don't have to
 657      * convert each byte to I8, but it's very rare input indeed that would
 658      * approach overflow, so the loop below will likely only get executed once.)
 659      *
 660      */
 661     const STRLEN len = e - s;
 662     const U8 *x;
 663     const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF;
 664     int is_overlong = 0;
 665
 666     PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
 667
 668     for (x = s; x < e; x++, y++) {
 669
 670         /* 'y' is set up to not include the trailing bytes that are all the
 671          * maximum possible continuation byte.  So when we reach the end of 'y'
 672          * (known to be NUL terminated), it is impossible for 'x' to contain
 673          * bytes larger than those omitted bytes, and therefore 'x' can't
 674          * overflow */
 675         if (*y == '\0') {
 676             return 0;
 677         }
 678
 679         /* If this byte is less than the corresponding highest non-overflowing
 680          * UTF-8, the sequence doesn't overflow */
 681         if (NATIVE_UTF8_TO_I8(*x) < *y) {
 682             return 0;
 683         }
 684
 685         if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) {
 686             goto overflows_if_not_overlong;
 687         }
 688     }
 689
 690     /* Got to the end, and all bytes are the same.  If the input is a whole
 691      * character, it doesn't overflow.  And if it is a partial character,
 692      * there's not enough information to tell */
 693     return (len >= STRLENs(HIGHEST_REPRESENTABLE_UTF)) ? 0 : -1;
 694
 695   overflows_if_not_overlong:
 696
 697     /* Here, a well-formed sequence overflows.  If we are assuming
 698      * well-formedness, return that it overflows. */
 699     if (! consider_overlongs) {
 700         return 1;
 701     }
 702
 703     /* Here, it could be the overlong malformation, and might not actuallly
 704      * overflow if you were to calculate it out.
 705      *
 706      * See if it actually is overlong */
 707     is_overlong = is_utf8_overlong(s, len);
 708
 709     /* If it isn't overlong, is well-formed, so overflows */
 710     if (is_overlong == 0) {
 711         return 1;
 712     }
 713
 714     /* Not long enough to determine */
 715     if (is_overlong < 0) {
 716         return -1;
 717     }
 718
 719     /* Here, it appears to overflow, but it is also overlong */
 720
 721 #if 6 * UTF_CONTINUATION_BYTE_INFO_BITS <= IVSIZE * CHARBITS
 722
 723     /* On many platforms, it is impossible for an overlong to overflow.  For
 724      * these, no further work is necessary: we can return immediately that this
 725      * overlong that is an apparent overflow actually isn't
 726      *
 727      * To see why, note that a length_N sequence can represent as overlongs all
 728      * the code points representable by shorter length sequences, but no
 729      * higher.  If it could represent a higher code point without being an
 730      * overlong, we wouldn't have had to increase the sequence length!
 731      *
 732      * The highest possible start byte is FF; the next highest is FE.  The
 733      * highest code point representable as an overlong on the platform is thus
 734      * the highest code point representable by a non-overlong sequence whose
 735      * start byte is FE.  If that value doesn't overflow the platform's word
 736      * size, overlongs can't overflow.
 737      *
 738      * FE consists of 7 bytes total; the FE start byte contributes 0 bits of
 739      * information (the high 7 bits, all ones, say that the sequence is 7 bytes
 740      * long, and the bottom, zero, bit is s placeholder. That leaves the 6
 741      * continuation bytes to contribute UTF_CONTINUATION_BYTE_INFO_BITS each.
 742       If that number of bits doesn't exceed the word size, it can't overflow. */
 743
 744     return 0;
 745
 746 #else
 747
 748     /* In practice, only a 32-bit ASCII box gets here.  The FE start byte can
 749      * represent, as an overlong, the highest code point representable by an FD
 750      * start byte, which is 5*6 continuation bytes of info plus one bit from
 751      * the start byte, or 31 bits.  That doesn't overflow.  More explicitly:
 752      * \xFD\xBF\xBF\xBF\xBF\xBF evaluates to 0x7FFFFFFF = 2*31 - 1.
 753      *
 754      * That means only the FF start byte can have an overflowing overlong. */
 755     if (*s < 0xFF) {
 756         return 0;
 757     }
 758
 759     /* The sequence \xff\x80\x80\x80\x80\x80\x80\x82 is an overlong that
 760      * evaluates to 2**31, so overflows an IV.  For a UV it's
 761      *              \xff\x80\x80\x80\x80\x80\x80\x83 = 2**32 */
 762 #  define OVERFLOWS  "\xff\x80\x80\x80\x80\x80\x80\x82"
 763
 764     if (e - s < (Ptrdiff_t) STRLENs(OVERFLOWS)) {   /* Not enough info */
 765          return -1;
 766     }
 767
 768 #  define strnGE(s1,s2,l) (strncmp(s1,s2,l) >= 0)
 769
 770     return strnGE((const char *) s, OVERFLOWS, STRLENs(OVERFLOWS));
 771
 772 #endif
 773
 774 }
 775
 776 STRLEN
 777 Perl_is_utf8_char_helper_(const U8 * const s, const U8 * e, const U32 flags)
 778 {
 779     SSize_t len, full_len;
 780
 781     /* An internal helper function.
 782      *
 783      * On input:
 784      *  's' is a string, which is known to be syntactically valid UTF-8 as far
 785      *      as (e - 1); e > s must hold.
 786      *  'e' This function is allowed to look at any byte from 's'...'e-1', but
 787      *      nowhere else.  The function has to cope as best it can if that
 788      *      sequence does not form a full character.
 789      * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
 790      *      accepted by L</utf8n_to_uvchr>.  If non-zero, this function returns
 791      *      0 if it determines the input will match something disallowed.
 792      * On output:
 793      *  The return is the number of bytes required to represent the code point
 794      *  if it isn't disallowed by 'flags'; 0 otherwise.  Be aware that if the
 795      *  input is for a partial character, a successful return will be larger
 796      *  than 'e - s'.
 797      *
 798      *  If *s..*(e-1) is only for a partial character, the function will return
 799      *  non-zero if there is any sequence of well-formed UTF-8 that, when
 800      *  appended to the input sequence, could result in an allowed code point;
 801      *  otherwise it returns 0.  Non characters cannot be determined based on
 802      *  partial character input.  But many  of the other excluded types can be
 803      *  determined with just the first one or two bytes.
 804      *
 805      */
 806
 807     PERL_ARGS_ASSERT_IS_UTF8_CHAR_HELPER_;
 808
 809     assert(e > s);
 810     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
 811                           |UTF8_DISALLOW_PERL_EXTENDED)));
 812
 813     full_len = UTF8SKIP(s);
 814
 815     len = e - s;
 816     if (len > full_len) {
 817         e = s + full_len;
 818         len = full_len;
 819     }
 820
 821     switch (full_len) {
 822         bool is_super;
 823
 824       default: /* Extended */
 825         if (flags & UTF8_DISALLOW_PERL_EXTENDED) {
 826             return 0;
 827         }
 828
 829         /* FALLTHROUGH */
 830
 831       case 6 + ONE_IF_EBCDIC_ZERO_IF_NOT:   /* above Unicode */
 832       case 5 + ONE_IF_EBCDIC_ZERO_IF_NOT:   /* above Unicode */
 833
 834         if (flags & UTF8_DISALLOW_SUPER) {
 835             return 0;                       /* Above Unicode */
 836         }
 837
 838         return full_len;
 839
 840       case 4 + ONE_IF_EBCDIC_ZERO_IF_NOT:
 841         is_super = (   UNLIKELY(NATIVE_UTF8_TO_I8(s[0]) > UTF_START_BYTE_110000_)
 842                     || (   len > 1
 843                         && NATIVE_UTF8_TO_I8(s[0]) == UTF_START_BYTE_110000_
 844                         && NATIVE_UTF8_TO_I8(s[1])
 845                                                 >= UTF_FIRST_CONT_BYTE_110000_));
 846         if (is_super) {
 847             if (flags & UTF8_DISALLOW_SUPER) {
 848                 return 0;
 849             }
 850         }
 851         else if (   (flags & UTF8_DISALLOW_NONCHAR)
 852                  && len == full_len
 853                  && UNLIKELY(is_LARGER_NON_CHARS_utf8(s)))
 854         {
 855             return 0;
 856         }
 857
 858         return full_len;
 859
 860       case 3 + ONE_IF_EBCDIC_ZERO_IF_NOT:
 861
 862         if (! isUTF8_POSSIBLY_PROBLEMATIC(s[0]) || len < 2) {
 863             return full_len;
 864         }
 865
 866         if (   (flags & UTF8_DISALLOW_SURROGATE)
 867             &&  UNLIKELY(is_SURROGATE_utf8(s)))
 868         {
 869             return 0;       /* Surrogate */
 870         }
 871
 872         if (  (flags & UTF8_DISALLOW_NONCHAR)
 873             && len == full_len
 874             && UNLIKELY(is_SHORTER_NON_CHARS_utf8(s)))
 875         {
 876             return 0;
 877         }
 878
 879         return full_len;
 880
 881       /* The lower code points don't have any disallowable characters */
 882 #ifdef EBCDIC
 883       case 3:
 884         return full_len;
 885 #endif
 886
 887       case 2:
 888       case 1:
 889         return full_len;
 890     }
 891 }
 892
 893 Size_t
 894 Perl_is_utf8_FF_helper_(const U8 * const s0, const U8 * const e,
 895                         const bool require_partial)
 896 {
 897     /* This is called to determine if the UTF-8 sequence starting at s0 and
 898      * continuing for up to one full character of bytes, but looking no further
 899      * than 'e - 1', is legal.  *s0 must be 0xFF (or whatever the native
 900      * equivalent of FF in I8 on EBCDIC platforms is).  This marks it as being
 901      * for the largest code points recognized by Perl, the ones that require
 902      * the most UTF-8 bytes per character to represent (somewhat less than
 903      * twice the size of the next longest kind).  This sequence will only ever
 904      * be Perl extended UTF-8.
 905      *
 906      * The routine returns 0 if the sequence is not fully valid, syntactically
 907      * or semantically.  That means it checks that everything following the
 908      * start byte is a continuation byte, and that it doesn't overflow, nor is
 909      * an overlong representation.
 910      *
 911      * If 'require_partial' is FALSE, the routine returns non-zero only if the
 912      * input (as far as 'e-1') is a full character.  The return is the count of
 913      * the bytes in the character.
 914      *
 915      * If 'require_partial' is TRUE, the routine returns non-zero only if the
 916      * input as far as 'e-1' is a partial, not full character, with no
 917      * malformations found before position 'e'.  The return is either just
 918      * FALSE, or TRUE.  */
 919
 920     const U8 *s = s0 + 1;
 921     const U8 *send = e;
 922
 923     PERL_ARGS_ASSERT_IS_UTF8_FF_HELPER_;
 924
 925     assert(s0 < e);
 926     assert(*s0 == I8_TO_NATIVE_UTF8(0xFF));
 927
 928     send = s + MIN(UTF8_MAXBYTES - 1, e - s);
 929     while (s < send) {
 930         if (! UTF8_IS_CONTINUATION(*s)) {
 931             return 0;
 932         }
 933
 934         s++;
 935     }
 936
 937     if (0 < does_utf8_overflow(s0, e,
 938                                FALSE /* Don't consider_overlongs */
 939     )) {
 940         return 0;
 941     }
 942
 943     if (0 < isFF_overlong(s0, e - s0)) {
 944         return 0;
 945     }
 946
 947     /* Here, the character is valid as far as it got.  Check if got a partial
 948      * character */
 949     if (s - s0 < UTF8_MAXBYTES) {
 950         return (require_partial) ? 1 : 0;
 951     }
 952
 953     /* Here, got a full character */
 954     return (require_partial) ? 0 : UTF8_MAXBYTES;
 955 }
 956
 957 char *
 958 Perl__byte_dump_string(pTHX_ const U8 * const start, const STRLEN len, const bool format)
 959 {
 960     /* Returns a mortalized C string that is a displayable copy of the 'len'
 961      * bytes starting at 'start'.  'format' gives how to display each byte.
 962      * Currently, there are only two formats, so it is currently a bool:
 963      *      0   \xab
 964      *      1    ab         (that is a space between two hex digit bytes)
 965      */
 966
 967     const STRLEN output_len = 4 * len + 1;  /* 4 bytes per each input, plus a
 968                                                trailing NUL */
 969     const U8 * s = start;
 970     const U8 * const e = start + len;
 971     char * output;
 972     char * d;
 973
 974     PERL_ARGS_ASSERT__BYTE_DUMP_STRING;
 975
 976     Newx(output, output_len, char);
 977     SAVEFREEPV(output);
 978
 979     d = output;
 980     for (s = start; s < e; s++) {
 981         const unsigned high_nibble = (*s & 0xF0) >> 4;
 982         const unsigned low_nibble =  (*s & 0x0F);
 983
 984         if (format) {
 985             if (s > start) {
 986                 *d++ = ' ';
 987             }
 988         }
 989         else {
 990             *d++ = '\\';
 991             *d++ = 'x';
 992         }
 993
 994         if (high_nibble < 10) {
 995             *d++ = high_nibble + '0';
 996         }
 997         else {
 998             *d++ = high_nibble - 10 + 'a';
 999         }
1000
1001         if (low_nibble < 10) {
1002             *d++ = low_nibble + '0';
1003         }
1004         else {
1005             *d++ = low_nibble - 10 + 'a';
1006         }
1007     }
1008
1009     *d = '\0';
1010     return output;
1011 }
1012
1013 PERL_STATIC_INLINE char *
1014 S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
1015
1016                                          /* Max number of bytes to print */
1017                                          STRLEN print_len,
1018
1019                                          /* Which one is the non-continuation */
1020                                          const STRLEN non_cont_byte_pos,
1021
1022                                          /* How many bytes should there be? */
1023                                          const STRLEN expect_len)
1024 {
1025     /* Return the malformation warning text for an unexpected continuation
1026      * byte. */
1027
1028     const char * const where = (non_cont_byte_pos == 1)
1029                                ? "immediately"
1030                                : Perl_form(aTHX_ "%d bytes",
1031                                                  (int) non_cont_byte_pos);
1032     const U8 * x = s + non_cont_byte_pos;
1033     const U8 * e = s + print_len;
1034
1035     PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
1036
1037     /* We don't need to pass this parameter, but since it has already been
1038      * calculated, it's likely faster to pass it; verify under DEBUGGING */
1039     assert(expect_len == UTF8SKIP(s));
1040
1041     /* As a defensive coding measure, don't output anything past a NUL.  Such
1042      * bytes shouldn't be in the middle of a malformation, and could mark the
1043      * end of the allocated string, and what comes after is undefined */
1044     for (; x < e; x++) {
1045         if (*x == '\0') {
1046             x++;            /* Output this particular NUL */
1047             break;
1048         }
1049     }
1050
1051     return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
1052                            " %s after start byte 0x%02x; need %d bytes, got %d)",
1053                            malformed_text,
1054                            _byte_dump_string(s, x - s, 0),
1055                            *(s + non_cont_byte_pos),
1056                            where,
1057                            *s,
1058                            (int) expect_len,
1059                            (int) non_cont_byte_pos);
1060 }
1061
1062 /*
1063
1064 =for apidoc utf8n_to_uvchr
1065
1066 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
1067 Most code should use L</utf8_to_uvchr_buf>() rather than call this
1068 directly.
1069
1070 Bottom level UTF-8 decode routine.
1071 Returns the native code point value of the first character in the string C<s>,
1072 which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
1073 C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
1074 the length, in bytes, of that character.
1075
1076 The value of C<flags> determines the behavior when C<s> does not point to a
1077 well-formed UTF-8 character.  If C<flags> is 0, encountering a malformation
1078 causes zero to be returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>)
1079 is the next possible position in C<s> that could begin a non-malformed
1080 character.  Also, if UTF-8 warnings haven't been lexically disabled, a warning
1081 is raised.  Some UTF-8 input sequences may contain multiple malformations.
1082 This function tries to find every possible one in each call, so multiple
1083 warnings can be raised for the same sequence.
1084
1085 Various ALLOW flags can be set in C<flags> to allow (and not warn on)
1086 individual types of malformations, such as the sequence being overlong (that
1087 is, when there is a shorter sequence that can express the same code point;
1088 overlong sequences are expressly forbidden in the UTF-8 standard due to
1089 potential security issues).  Another malformation example is the first byte of
1090 a character not being a legal first byte.  See F<utf8.h> for the list of such
1091 flags.  Even if allowed, this function generally returns the Unicode
1092 REPLACEMENT CHARACTER when it encounters a malformation.  There are flags in
1093 F<utf8.h> to override this behavior for the overlong malformations, but don't
1094 do that except for very specialized purposes.
1095
1096 The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other
1097 flags) malformation is found.  If this flag is set, the routine assumes that
1098 the caller will raise a warning, and this function will silently just set
1099 C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
1100
1101 Note that this API requires disambiguation between successful decoding a C<NUL>
1102 character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as
1103 in both cases, 0 is returned, and, depending on the malformation, C<retlen> may
1104 be set to 1.  To disambiguate, upon a zero return, see if the first byte of
1105 C<s> is 0 as well.  If so, the input was a C<NUL>; if not, the input had an
1106 error.  Or you can use C<L</utf8n_to_uvchr_error>>.
1107
1108 Certain code points are considered problematic.  These are Unicode surrogates,
1109 Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
1110 By default these are considered regular code points, but certain situations
1111 warrant special handling for them, which can be specified using the C<flags>
1112 parameter.  If C<flags> contains C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all
1113 three classes are treated as malformations and handled as such.  The flags
1114 C<UTF8_DISALLOW_SURROGATE>, C<UTF8_DISALLOW_NONCHAR>, and
1115 C<UTF8_DISALLOW_SUPER> (meaning above the legal Unicode maximum) can be set to
1116 disallow these categories individually.  C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>
1117 restricts the allowed inputs to the strict UTF-8 traditionally defined by
1118 Unicode.  Use C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE> to use the strictness
1119 definition given by
1120 L<Unicode Corrigendum #9|https://www.unicode.org/versions/corrigendum9.html>.
1121 The difference between traditional strictness and C9 strictness is that the
1122 latter does not forbid non-character code points.  (They are still discouraged,
1123 however.)  For more discussion see L<perlunicode/Noncharacter code points>.
1124
1125 The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>,
1126 C<UTF8_WARN_ILLEGAL_C9_INTERCHANGE>, C<UTF8_WARN_SURROGATE>,
1127 C<UTF8_WARN_NONCHAR>, and C<UTF8_WARN_SUPER> will cause warning messages to be
1128 raised for their respective categories, but otherwise the code points are
1129 considered valid (not malformations).  To get a category to both be treated as
1130 a malformation and raise a warning, specify both the WARN and DISALLOW flags.
1131 (But note that warnings are not raised if lexically disabled nor if
1132 C<UTF8_CHECK_ONLY> is also specified.)
1133
1134 Extremely high code points were never specified in any standard, and require an
1135 extension to UTF-8 to express, which Perl does.  It is likely that programs
1136 written in something other than Perl would not be able to read files that
1137 contain these; nor would Perl understand files written by something that uses a
1138 different extension.  For these reasons, there is a separate set of flags that
1139 can warn and/or disallow these extremely high code points, even if other
1140 above-Unicode ones are accepted.  They are the C<UTF8_WARN_PERL_EXTENDED> and
1141 C<UTF8_DISALLOW_PERL_EXTENDED> flags.  For more information see
1142 C<L</UTF8_GOT_PERL_EXTENDED>>.  Of course C<UTF8_DISALLOW_SUPER> will treat all
1143 above-Unicode code points, including these, as malformations.
1144 (Note that the Unicode standard considers anything above 0x10FFFF to be
1145 illegal, but there are standards predating it that allow up to 0x7FFF_FFFF
1146 (2**31 -1))
1147
1148 A somewhat misleadingly named synonym for C<UTF8_WARN_PERL_EXTENDED> is
1149 retained for backward compatibility: C<UTF8_WARN_ABOVE_31_BIT>.  Similarly,
1150 C<UTF8_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
1151 C<UTF8_DISALLOW_PERL_EXTENDED>.  The names are misleading because these flags
1152 can apply to code points that actually do fit in 31 bits.  This happens on
1153 EBCDIC platforms, and sometimes when the L<overlong
1154 malformation|/C<UTF8_GOT_LONG>> is also present.  The new names accurately
1155 describe the situation in all cases.
1156
1157
1158 All other code points corresponding to Unicode characters, including private
1159 use and those yet to be assigned, are never considered malformed and never
1160 warn.
1161
1162 =for apidoc Amnh||UTF8_CHECK_ONLY
1163 =for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1164 =for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE
1165 =for apidoc Amnh||UTF8_DISALLOW_SURROGATE
1166 =for apidoc Amnh||UTF8_DISALLOW_NONCHAR
1167 =for apidoc Amnh||UTF8_DISALLOW_SUPER
1168 =for apidoc Amnh||UTF8_WARN_ILLEGAL_INTERCHANGE
1169 =for apidoc Amnh||UTF8_WARN_ILLEGAL_C9_INTERCHANGE
1170 =for apidoc Amnh||UTF8_WARN_SURROGATE
1171 =for apidoc Amnh||UTF8_WARN_NONCHAR
1172 =for apidoc Amnh||UTF8_WARN_SUPER
1173 =for apidoc Amnh||UTF8_WARN_PERL_EXTENDED
1174 =for apidoc Amnh||UTF8_DISALLOW_PERL_EXTENDED
1175
1176 =cut
1177
1178 Also implemented as a macro in utf8.h
1179 */
1180
1181 UV
1182 Perl_utf8n_to_uvchr(const U8 *s,
1183                     STRLEN curlen,
1184                     STRLEN *retlen,
1185                     const U32 flags)
1186 {
1187     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
1188
1189     return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL);
1190 }
1191
1192 /*
1193
1194 =for apidoc utf8n_to_uvchr_error
1195
1196 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
1197 Most code should use L</utf8_to_uvchr_buf>() rather than call this
1198 directly.
1199
1200 This function is for code that needs to know what the precise malformation(s)
1201 are when an error is found.  If you also need to know the generated warning
1202 messages, use L</utf8n_to_uvchr_msgs>() instead.
1203
1204 It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after
1205 all the others, C<errors>.  If this parameter is 0, this function behaves
1206 identically to C<L</utf8n_to_uvchr>>.  Otherwise, C<errors> should be a pointer
1207 to a C<U32> variable, which this function sets to indicate any errors found.
1208 Upon return, if C<*errors> is 0, there were no errors found.  Otherwise,
1209 C<*errors> is the bit-wise C<OR> of the bits described in the list below.  Some
1210 of these bits will be set if a malformation is found, even if the input
1211 C<flags> parameter indicates that the given malformation is allowed; those
1212 exceptions are noted:
1213
1214 =over 4
1215
1216 =item C<UTF8_GOT_PERL_EXTENDED>
1217
1218 The input sequence is not standard UTF-8, but a Perl extension.  This bit is
1219 set only if the input C<flags> parameter contains either the
1220 C<UTF8_DISALLOW_PERL_EXTENDED> or the C<UTF8_WARN_PERL_EXTENDED> flags.
1221
1222 Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
1223 and so some extension must be used to express them.  Perl uses a natural
1224 extension to UTF-8 to represent the ones up to 2**36-1, and invented a further
1225 extension to represent even higher ones, so that any code point that fits in a
1226 64-bit word can be represented.  Text using these extensions is not likely to
1227 be portable to non-Perl code.  We lump both of these extensions together and
1228 refer to them as Perl extended UTF-8.  There exist other extensions that people
1229 have invented, incompatible with Perl's.
1230
1231 On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
1232 extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
1233 than on ASCII.  Prior to that, code points 2**31 and higher were simply
1234 unrepresentable, and a different, incompatible method was used to represent
1235 code points between 2**30 and 2**31 - 1.
1236
1237 On both platforms, ASCII and EBCDIC, C<UTF8_GOT_PERL_EXTENDED> is set if
1238 Perl extended UTF-8 is used.
1239
1240 In earlier Perls, this bit was named C<UTF8_GOT_ABOVE_31_BIT>, which you still
1241 may use for backward compatibility.  That name is misleading, as this flag may
1242 be set when the code point actually does fit in 31 bits.  This happens on
1243 EBCDIC platforms, and sometimes when the L<overlong
1244 malformation|/C<UTF8_GOT_LONG>> is also present.  The new name accurately
1245 describes the situation in all cases.
1246
1247 =item C<UTF8_GOT_CONTINUATION>
1248
1249 The input sequence was malformed in that the first byte was a UTF-8
1250 continuation byte.
1251
1252 =item C<UTF8_GOT_EMPTY>
1253
1254 The input C<curlen> parameter was 0.
1255
1256 =item C<UTF8_GOT_LONG>
1257
1258 The input sequence was malformed in that there is some other sequence that
1259 evaluates to the same code point, but that sequence is shorter than this one.
1260
1261 Until Unicode 3.1, it was legal for programs to accept this malformation, but
1262 it was discovered that this created security issues.
1263
1264 =item C<UTF8_GOT_NONCHAR>
1265
1266 The code point represented by the input UTF-8 sequence is for a Unicode
1267 non-character code point.
1268 This bit is set only if the input C<flags> parameter contains either the
1269 C<UTF8_DISALLOW_NONCHAR> or the C<UTF8_WARN_NONCHAR> flags.
1270
1271 =item C<UTF8_GOT_NON_CONTINUATION>
1272
1273 The input sequence was malformed in that a non-continuation type byte was found
1274 in a position where only a continuation type one should be.  See also
1275 C<L</UTF8_GOT_SHORT>>.
1276
1277 =item C<UTF8_GOT_OVERFLOW>
1278
1279 The input sequence was malformed in that it is for a code point that is not
1280 representable in the number of bits available in an IV on the current platform.
1281
1282 =item C<UTF8_GOT_SHORT>
1283
1284 The input sequence was malformed in that C<curlen> is smaller than required for
1285 a complete sequence.  In other words, the input is for a partial character
1286 sequence.
1287
1288
1289 C<UTF8_GOT_SHORT> and C<UTF8_GOT_NON_CONTINUATION> both indicate a too short
1290 sequence.  The difference is that C<UTF8_GOT_NON_CONTINUATION> indicates always
1291 that there is an error, while C<UTF8_GOT_SHORT> means that an incomplete
1292 sequence was looked at.   If no other flags are present, it means that the
1293 sequence was valid as far as it went.  Depending on the application, this could
1294 mean one of three things:
1295
1296 =over
1297
1298 =item *
1299
1300 The C<curlen> length parameter passed in was too small, and the function was
1301 prevented from examining all the necessary bytes.
1302
1303 =item *
1304
1305 The buffer being looked at is based on reading data, and the data received so
1306 far stopped in the middle of a character, so that the next read will
1307 read the remainder of this character.  (It is up to the caller to deal with the
1308 split bytes somehow.)
1309
1310 =item *
1311
1312 This is a real error, and the partial sequence is all we're going to get.
1313
1314 =back
1315
1316 =item C<UTF8_GOT_SUPER>
1317
1318 The input sequence was malformed in that it is for a non-Unicode code point;
1319 that is, one above the legal Unicode maximum.
1320 This bit is set only if the input C<flags> parameter contains either the
1321 C<UTF8_DISALLOW_SUPER> or the C<UTF8_WARN_SUPER> flags.
1322
1323 =item C<UTF8_GOT_SURROGATE>
1324
1325 The input sequence was malformed in that it is for a -Unicode UTF-16 surrogate
1326 code point.
1327 This bit is set only if the input C<flags> parameter contains either the
1328 C<UTF8_DISALLOW_SURROGATE> or the C<UTF8_WARN_SURROGATE> flags.
1329
1330 =back
1331
1332 To do your own error handling, call this function with the C<UTF8_CHECK_ONLY>
1333 flag to suppress any warnings, and then examine the C<*errors> return.
1334
1335 =for apidoc Amnh||UTF8_GOT_PERL_EXTENDED
1336 =for apidoc Amnh||UTF8_GOT_CONTINUATION
1337 =for apidoc Amnh||UTF8_GOT_EMPTY
1338 =for apidoc Amnh||UTF8_GOT_LONG
1339 =for apidoc Amnh||UTF8_GOT_NONCHAR
1340 =for apidoc Amnh||UTF8_GOT_NON_CONTINUATION
1341 =for apidoc Amnh||UTF8_GOT_OVERFLOW
1342 =for apidoc Amnh||UTF8_GOT_SHORT
1343 =for apidoc Amnh||UTF8_GOT_SUPER
1344 =for apidoc Amnh||UTF8_GOT_SURROGATE
1345
1346 =cut
1347
1348 Also implemented as a macro in utf8.h
1349 */
1350
1351 UV
1352 Perl_utf8n_to_uvchr_error(const U8 *s,
1353                           STRLEN curlen,
1354                           STRLEN *retlen,
1355                           const U32 flags,
1356                           U32 * errors)
1357 {
1358     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
1359
1360     return utf8n_to_uvchr_msgs(s, curlen, retlen, flags, errors, NULL);
1361 }
1362
1363 /*
1364
1365 =for apidoc utf8n_to_uvchr_msgs
1366
1367 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
1368 Most code should use L</utf8_to_uvchr_buf>() rather than call this
1369 directly.
1370
1371 This function is for code that needs to know what the precise malformation(s)
1372 are when an error is found, and wants the corresponding warning and/or error
1373 messages to be returned to the caller rather than be displayed.  All messages
1374 that would have been displayed if all lexical warnings are enabled will be
1375 returned.
1376
1377 It is just like C<L</utf8n_to_uvchr_error>> but it takes an extra parameter
1378 placed after all the others, C<msgs>.  If this parameter is 0, this function
1379 behaves identically to C<L</utf8n_to_uvchr_error>>.  Otherwise, C<msgs> should
1380 be a pointer to an C<AV *> variable, in which this function creates a new AV to
1381 contain any appropriate messages.  The elements of the array are ordered so
1382 that the first message that would have been displayed is in the 0th element,
1383 and so on.  Each element is a hash with three key-value pairs, as follows:
1384
1385 =over 4
1386
1387 =item C<text>
1388
1389 The text of the message as a C<SVpv>.
1390
1391 =item C<warn_categories>
1392
1393 The warning category (or categories) packed into a C<SVuv>.
1394
1395 =item C<flag>
1396
1397 A single flag bit associated with this message, in a C<SVuv>.
1398 The bit corresponds to some bit in the C<*errors> return value,
1399 such as C<UTF8_GOT_LONG>.
1400
1401 =back
1402
1403 It's important to note that specifying this parameter as non-null will cause
1404 any warnings this function would otherwise generate to be suppressed, and
1405 instead be placed in C<*msgs>.  The caller can check the lexical warnings state
1406 (or not) when choosing what to do with the returned messages.
1407
1408 If the flag C<UTF8_CHECK_ONLY> is passed, no warnings are generated, and hence
1409 no AV is created.
1410
1411 The caller, of course, is responsible for freeing any returned AV.
1412
1413 =cut
1414 */
1415
1416 UV
1417 Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
1418                                STRLEN curlen,
1419                                STRLEN *retlen,
1420                                const U32 flags,
1421                                U32 * errors,
1422                                AV ** msgs)
1423 {
1424     const U8 * const s0 = s;
1425     const U8 * send = s0 + curlen;
1426     U32 possible_problems;  /* A bit is set here for each potential problem
1427                                found as we go along */
1428     UV uv;
1429     STRLEN expectlen;     /* How long should this sequence be? */
1430     STRLEN avail_len;     /* When input is too short, gives what that is */
1431     U32 discard_errors;   /* Used to save branches when 'errors' is NULL; this
1432                              gets set and discarded */
1433
1434     /* The below are used only if there is both an overlong malformation and a
1435      * too short one.  Otherwise the first two are set to 's0' and 'send', and
1436      * the third not used at all */
1437     U8 * adjusted_s0;
1438     U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
1439                                             routine; see [perl #130921] */
1440     UV uv_so_far;
1441     dTHX;
1442
1443     PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_MSGS_HELPER;
1444
1445     /* Here, is one of: a) malformed; b) a problematic code point (surrogate,
1446      * non-unicode, or nonchar); or c) on ASCII platforms, one of the Hangul
1447      * syllables that the dfa doesn't properly handle.  Quickly dispose of the
1448      * final case. */
1449
1450     /* Each of the affected Hanguls starts with \xED */
1451
1452     if (is_HANGUL_ED_utf8_safe(s0, send)) { /* Always false on EBCDIC */
1453         if (retlen) {
1454             *retlen = 3;
1455         }
1456         if (errors) {
1457             *errors = 0;
1458         }
1459         if (msgs) {
1460             *msgs = NULL;
1461         }
1462
1463         return ((0xED & UTF_START_MASK(3)) << (2 * UTF_ACCUMULATION_SHIFT))
1464              | ((s0[1] & UTF_CONTINUATION_MASK) << UTF_ACCUMULATION_SHIFT)
1465              |  (s0[2] & UTF_CONTINUATION_MASK);
1466     }
1467
1468     /* In conjunction with the exhaustive tests that can be enabled in
1469      * APItest/t/utf8_warn_base.pl, this can make sure the dfa does precisely
1470      * what it is intended to do, and that no flaws in it are masked by
1471      * dropping down and executing the code below
1472     assert(! isUTF8_CHAR(s0, send)
1473           || UTF8_IS_SURROGATE(s0, send)
1474           || UTF8_IS_SUPER(s0, send)
1475           || UTF8_IS_NONCHAR(s0,send));
1476     */
1477
1478     s = s0;
1479     uv = *s0;
1480     possible_problems = 0;
1481     expectlen = 0;
1482     avail_len = 0;
1483     discard_errors = 0;
1484     adjusted_s0 = (U8 *) s0;
1485     uv_so_far = 0;
1486
1487     if (errors) {
1488         *errors = 0;
1489     }
1490     else {
1491         errors = &discard_errors;
1492     }
1493
1494     /* The order of malformation tests here is important.  We should consume as
1495      * few bytes as possible in order to not skip any valid character.  This is
1496      * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
1497      * https://unicode.org/reports/tr36 for more discussion as to why.  For
1498      * example, once we've done a UTF8SKIP, we can tell the expected number of
1499      * bytes, and could fail right off the bat if the input parameters indicate
1500      * that there are too few available.  But it could be that just that first
1501      * byte is garbled, and the intended character occupies fewer bytes.  If we
1502      * blindly assumed that the first byte is correct, and skipped based on
1503      * that number, we could skip over a valid input character.  So instead, we
1504      * always examine the sequence byte-by-byte.
1505      *
1506      * We also should not consume too few bytes, otherwise someone could inject
1507      * things.  For example, an input could be deliberately designed to
1508      * overflow, and if this code bailed out immediately upon discovering that,
1509      * returning to the caller C<*retlen> pointing to the very next byte (one
1510      * which is actually part of the overflowing sequence), that could look
1511      * legitimate to the caller, which could discard the initial partial
1512      * sequence and process the rest, inappropriately.
1513      *
1514      * Some possible input sequences are malformed in more than one way.  This
1515      * function goes to lengths to try to find all of them.  This is necessary
1516      * for correctness, as the inputs may allow one malformation but not
1517      * another, and if we abandon searching for others after finding the
1518      * allowed one, we could allow in something that shouldn't have been.
1519      */
1520
1521     if (UNLIKELY(curlen == 0)) {
1522         possible_problems |= UTF8_GOT_EMPTY;
1523         curlen = 0;
1524         uv = UNICODE_REPLACEMENT;
1525         goto ready_to_handle_errors;
1526     }
1527
1528     expectlen = UTF8SKIP(s);
1529
1530     /* A well-formed UTF-8 character, as the vast majority of calls to this
1531      * function will be for, has this expected length.  For efficiency, set
1532      * things up here to return it.  It will be overriden only in those rare
1533      * cases where a malformation is found */
1534     if (retlen) {
1535         *retlen = expectlen;
1536     }
1537
1538     /* A continuation character can't start a valid sequence */
1539     if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
1540         possible_problems |= UTF8_GOT_CONTINUATION;
1541         curlen = 1;
1542         uv = UNICODE_REPLACEMENT;
1543         goto ready_to_handle_errors;
1544     }
1545
1546     /* Here is not a continuation byte, nor an invariant.  The only thing left
1547      * is a start byte (possibly for an overlong).  (We can't use UTF8_IS_START
1548      * because it excludes start bytes like \xC0 that always lead to
1549      * overlongs.) */
1550
1551     /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
1552      * that indicate the number of bytes in the character's whole UTF-8
1553      * sequence, leaving just the bits that are part of the value.  */
1554     uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
1555
1556     /* Setup the loop end point, making sure to not look past the end of the
1557      * input string, and flag it as too short if the size isn't big enough. */
1558     if (UNLIKELY(curlen < expectlen)) {
1559         possible_problems |= UTF8_GOT_SHORT;
1560         avail_len = curlen;
1561     }
1562     else {
1563         send = (U8*) s0 + expectlen;
1564     }
1565
1566     /* Now, loop through the remaining bytes in the character's sequence,
1567      * accumulating each into the working value as we go. */
1568     for (s = s0 + 1; s < send; s++) {
1569         if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
1570             uv = UTF8_ACCUMULATE(uv, *s);
1571             continue;
1572         }
1573
1574         /* Here, found a non-continuation before processing all expected bytes.
1575          * This byte indicates the beginning of a new character, so quit, even
1576          * if allowing this malformation. */
1577         possible_problems |= UTF8_GOT_NON_CONTINUATION;
1578         break;
1579     } /* End of loop through the character's bytes */
1580
1581     /* Save how many bytes were actually in the character */
1582     curlen = s - s0;
1583
1584     /* Note that there are two types of too-short malformation.  One is when
1585      * there is actual wrong data before the normal termination of the
1586      * sequence.  The other is that the sequence wasn't complete before the end
1587      * of the data we are allowed to look at, based on the input 'curlen'.
1588      * This means that we were passed data for a partial character, but it is
1589      * valid as far as we saw.  The other is definitely invalid.  This
1590      * distinction could be important to a caller, so the two types are kept
1591      * separate.
1592      *
1593      * A convenience macro that matches either of the too-short conditions.  */
1594 #   define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
1595
1596     if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
1597         uv_so_far = uv;
1598         uv = UNICODE_REPLACEMENT;
1599     }
1600
1601     /* Check for overflow.  The algorithm requires us to not look past the end
1602      * of the current character, even if partial, so the upper limit is 's' */
1603     if (UNLIKELY(0 < does_utf8_overflow(s0, s,
1604                                          1 /* Do consider overlongs */
1605                                         )))
1606     {
1607         possible_problems |= UTF8_GOT_OVERFLOW;
1608         uv = UNICODE_REPLACEMENT;
1609     }
1610
1611     /* Check for overlong.  If no problems so far, 'uv' is the correct code
1612      * point value.  Simply see if it is expressible in fewer bytes.  Otherwise
1613      * we must look at the UTF-8 byte sequence itself to see if it is for an
1614      * overlong */
1615     if (     (   LIKELY(! possible_problems)
1616               && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
1617         || (       UNLIKELY(possible_problems)
1618             && (   UNLIKELY(! UTF8_IS_START(*s0))
1619                 || (UNLIKELY(0 < is_utf8_overlong(s0, s - s0))))))
1620     {
1621         possible_problems |= UTF8_GOT_LONG;
1622
1623         if (   UNLIKELY(   possible_problems & UTF8_GOT_TOO_SHORT)
1624
1625                           /* The calculation in the 'true' branch of this 'if'
1626                            * below won't work if overflows, and isn't needed
1627                            * anyway.  Further below we handle all overflow
1628                            * cases */
1629             &&   LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)))
1630         {
1631             UV min_uv = uv_so_far;
1632             STRLEN i;
1633
1634             /* Here, the input is both overlong and is missing some trailing
1635              * bytes.  There is no single code point it could be for, but there
1636              * may be enough information present to determine if what we have
1637              * so far is for an unallowed code point, such as for a surrogate.
1638              * The code further below has the intelligence to determine this,
1639              * but just for non-overlong UTF-8 sequences.  What we do here is
1640              * calculate the smallest code point the input could represent if
1641              * there were no too short malformation.  Then we compute and save
1642              * the UTF-8 for that, which is what the code below looks at
1643              * instead of the raw input.  It turns out that the smallest such
1644              * code point is all we need. */
1645             for (i = curlen; i < expectlen; i++) {
1646                 min_uv = UTF8_ACCUMULATE(min_uv,
1647                                 I8_TO_NATIVE_UTF8(UTF_MIN_CONTINUATION_BYTE));
1648             }
1649
1650             adjusted_s0 = temp_char_buf;
1651             (void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
1652         }
1653     }
1654
1655     /* Here, we have found all the possible problems, except for when the input
1656      * is for a problematic code point not allowed by the input parameters. */
1657
1658                                 /* uv is valid for overlongs */
1659     if (   (   (      LIKELY(! (possible_problems & ~UTF8_GOT_LONG))
1660                    && isUNICODE_POSSIBLY_PROBLEMATIC(uv))
1661             || (   UNLIKELY(possible_problems)
1662
1663                           /* if overflow, we know without looking further
1664                            * precisely which of the problematic types it is,
1665                            * and we deal with those in the overflow handling
1666                            * code */
1667                 && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))
1668                 && (   isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)
1669                     || UNLIKELY(UTF8_IS_PERL_EXTENDED(s0)))))
1670         && ((flags & ( UTF8_DISALLOW_NONCHAR
1671                       |UTF8_DISALLOW_SURROGATE
1672                       |UTF8_DISALLOW_SUPER
1673                       |UTF8_DISALLOW_PERL_EXTENDED
1674                       |UTF8_WARN_NONCHAR
1675                       |UTF8_WARN_SURROGATE
1676                       |UTF8_WARN_SUPER
1677                       |UTF8_WARN_PERL_EXTENDED))))
1678     {
1679         /* If there were no malformations, or the only malformation is an
1680          * overlong, 'uv' is valid */
1681         if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
1682             if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
1683                 possible_problems |= UTF8_GOT_SURROGATE;
1684             }
1685             else if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
1686                 possible_problems |= UTF8_GOT_SUPER;
1687             }
1688             else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
1689                 possible_problems |= UTF8_GOT_NONCHAR;
1690             }
1691         }
1692         else {  /* Otherwise, need to look at the source UTF-8, possibly
1693                    adjusted to be non-overlong */
1694
1695             if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
1696                                                     > UTF_START_BYTE_110000_))
1697             {
1698                 possible_problems |= UTF8_GOT_SUPER;
1699             }
1700             else if (curlen > 1) {
1701                 if (UNLIKELY(   NATIVE_UTF8_TO_I8(*adjusted_s0)
1702                                                 == UTF_START_BYTE_110000_
1703                              && NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1))
1704                                                 >= UTF_FIRST_CONT_BYTE_110000_))
1705                 {
1706                     possible_problems |= UTF8_GOT_SUPER;
1707                 }
1708                 else if (UNLIKELY(is_SURROGATE_utf8(adjusted_s0))) {
1709                     possible_problems |= UTF8_GOT_SURROGATE;
1710                 }
1711             }
1712
1713             /* We need a complete well-formed UTF-8 character to discern
1714              * non-characters, so can't look for them here */
1715         }
1716     }
1717
1718   ready_to_handle_errors:
1719
1720     /* At this point:
1721      * curlen               contains the number of bytes in the sequence that
1722      *                      this call should advance the input by.
1723      * avail_len            gives the available number of bytes passed in, but
1724      *                      only if this is less than the expected number of
1725      *                      bytes, based on the code point's start byte.
1726      * possible_problems'   is 0 if there weren't any problems; otherwise a bit
1727      *                      is set in it for each potential problem found.
1728      * uv                   contains the code point the input sequence
1729      *                      represents; or if there is a problem that prevents
1730      *                      a well-defined value from being computed, it is
1731      *                      some subsitute value, typically the REPLACEMENT
1732      *                      CHARACTER.
1733      * s0                   points to the first byte of the character
1734      * s                    points to just after were we left off processing
1735      *                      the character
1736      * send                 points to just after where that character should
1737      *                      end, based on how many bytes the start byte tells
1738      *                      us should be in it, but no further than s0 +
1739      *                      avail_len
1740      */
1741
1742     if (UNLIKELY(possible_problems)) {
1743         bool disallowed = FALSE;
1744         const U32 orig_problems = possible_problems;
1745
1746         if (msgs) {
1747             *msgs = NULL;
1748         }
1749
1750         while (possible_problems) { /* Handle each possible problem */
1751             U32 pack_warn = 0;
1752             char * message = NULL;
1753             U32 this_flag_bit = 0;
1754
1755             /* Each 'if' clause handles one problem.  They are ordered so that
1756              * the first ones' messages will be displayed before the later
1757              * ones; this is kinda in decreasing severity order.  But the
1758              * overlong must come last, as it changes 'uv' looked at by the
1759              * others */
1760             if (possible_problems & UTF8_GOT_OVERFLOW) {
1761
1762                 /* Overflow means also got a super and are using Perl's
1763                  * extended UTF-8, but we handle all three cases here */
1764                 possible_problems
1765                   &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_PERL_EXTENDED);
1766                 *errors |= UTF8_GOT_OVERFLOW;
1767
1768                 /* But the API says we flag all errors found */
1769                 if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) {
1770                     *errors |= UTF8_GOT_SUPER;
1771                 }
1772                 if (flags
1773                         & (UTF8_WARN_PERL_EXTENDED|UTF8_DISALLOW_PERL_EXTENDED))
1774                 {
1775                     *errors |= UTF8_GOT_PERL_EXTENDED;
1776                 }
1777
1778                 /* Disallow if any of the three categories say to */
1779                 if ( ! (flags &   UTF8_ALLOW_OVERFLOW)
1780                     || (flags & ( UTF8_DISALLOW_SUPER
1781                                  |UTF8_DISALLOW_PERL_EXTENDED)))
1782                 {
1783                     disallowed = TRUE;
1784                 }
1785
1786                 /* Likewise, warn if any say to */
1787                 if (  ! (flags & UTF8_ALLOW_OVERFLOW)
1788                     ||  (flags & (UTF8_WARN_SUPER|UTF8_WARN_PERL_EXTENDED)))
1789                 {
1790
1791                     /* The warnings code explicitly says it doesn't handle the
1792                      * case of packWARN2 and two categories which have
1793                      * parent-child relationship.  Even if it works now to
1794                      * raise the warning if either is enabled, it wouldn't
1795                      * necessarily do so in the future.  We output (only) the
1796                      * most dire warning */
1797                     if (! (flags & UTF8_CHECK_ONLY)) {
1798                         if (msgs || ckWARN_d(WARN_UTF8)) {
1799                             pack_warn = packWARN(WARN_UTF8);
1800                         }
1801                         else if (msgs || ckWARN_d(WARN_NON_UNICODE)) {
1802                             pack_warn = packWARN(WARN_NON_UNICODE);
1803                         }
1804                         if (pack_warn) {
1805                             message = Perl_form(aTHX_ "%s: %s (overflows)",
1806                                             malformed_text,
1807                                             _byte_dump_string(s0, curlen, 0));
1808                             this_flag_bit = UTF8_GOT_OVERFLOW;
1809                         }
1810                     }
1811                 }
1812             }
1813             else if (possible_problems & UTF8_GOT_EMPTY) {
1814                 possible_problems &= ~UTF8_GOT_EMPTY;
1815                 *errors |= UTF8_GOT_EMPTY;
1816
1817                 if (! (flags & UTF8_ALLOW_EMPTY)) {
1818
1819                     /* This so-called malformation is now treated as a bug in
1820                      * the caller.  If you have nothing to decode, skip calling
1821                      * this function */
1822                     assert(0);
1823
1824                     disallowed = TRUE;
1825                     if (  (msgs
1826                         || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1827                     {
1828                         pack_warn = packWARN(WARN_UTF8);
1829                         message = Perl_form(aTHX_ "%s (empty string)",
1830                                                    malformed_text);
1831                         this_flag_bit = UTF8_GOT_EMPTY;
1832                     }
1833                 }
1834             }
1835             else if (possible_problems & UTF8_GOT_CONTINUATION) {
1836                 possible_problems &= ~UTF8_GOT_CONTINUATION;
1837                 *errors |= UTF8_GOT_CONTINUATION;
1838
1839                 if (! (flags & UTF8_ALLOW_CONTINUATION)) {
1840                     disallowed = TRUE;
1841                     if ((   msgs
1842                          || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1843                     {
1844                         pack_warn = packWARN(WARN_UTF8);
1845                         message = Perl_form(aTHX_
1846                                 "%s: %s (unexpected continuation byte 0x%02x,"
1847                                 " with no preceding start byte)",
1848                                 malformed_text,
1849                                 _byte_dump_string(s0, 1, 0), *s0);
1850                         this_flag_bit = UTF8_GOT_CONTINUATION;
1851                     }
1852                 }
1853             }
1854             else if (possible_problems & UTF8_GOT_SHORT) {
1855                 possible_problems &= ~UTF8_GOT_SHORT;
1856                 *errors |= UTF8_GOT_SHORT;
1857
1858                 if (! (flags & UTF8_ALLOW_SHORT)) {
1859                     disallowed = TRUE;
1860                     if ((   msgs
1861                          || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1862                     {
1863                         pack_warn = packWARN(WARN_UTF8);
1864                         message = Perl_form(aTHX_
1865                              "%s: %s (too short; %d byte%s available, need %d)",
1866                              malformed_text,
1867                              _byte_dump_string(s0, send - s0, 0),
1868                              (int)avail_len,
1869                              avail_len == 1 ? "" : "s",
1870                              (int)expectlen);
1871                         this_flag_bit = UTF8_GOT_SHORT;
1872                     }
1873                 }
1874
1875             }
1876             else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
1877                 possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
1878                 *errors |= UTF8_GOT_NON_CONTINUATION;
1879
1880                 if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
1881                     disallowed = TRUE;
1882                     if ((   msgs
1883                          || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1884                     {
1885
1886                         /* If we don't know for sure that the input length is
1887                          * valid, avoid as much as possible reading past the
1888                          * end of the buffer */
1889                         int printlen = (flags & _UTF8_NO_CONFIDENCE_IN_CURLEN)
1890                                        ? (int) (s - s0)
1891                                        : (int) (send - s0);
1892                         pack_warn = packWARN(WARN_UTF8);
1893                         message = Perl_form(aTHX_ "%s",
1894                             unexpected_non_continuation_text(s0,
1895                                                             printlen,
1896                                                             s - s0,
1897                                                             (int) expectlen));
1898                         this_flag_bit = UTF8_GOT_NON_CONTINUATION;
1899                     }
1900                 }
1901             }
1902             else if (possible_problems & UTF8_GOT_SURROGATE) {
1903                 possible_problems &= ~UTF8_GOT_SURROGATE;
1904
1905                 if (flags & UTF8_WARN_SURROGATE) {
1906                     *errors |= UTF8_GOT_SURROGATE;
1907
1908                     if (   ! (flags & UTF8_CHECK_ONLY)
1909                         && (msgs || ckWARN_d(WARN_SURROGATE)))
1910                     {
1911                         pack_warn = packWARN(WARN_SURROGATE);
1912
1913                         /* These are the only errors that can occur with a
1914                         * surrogate when the 'uv' isn't valid */
1915                         if (orig_problems & UTF8_GOT_TOO_SHORT) {
1916                             message = Perl_form(aTHX_
1917                                     "UTF-16 surrogate (any UTF-8 sequence that"
1918                                     " starts with \"%s\" is for a surrogate)",
1919                                     _byte_dump_string(s0, curlen, 0));
1920                         }
1921                         else {
1922                             message = Perl_form(aTHX_ surrogate_cp_format, uv);
1923                         }
1924                         this_flag_bit = UTF8_GOT_SURROGATE;
1925                     }
1926                 }
1927
1928                 if (flags & UTF8_DISALLOW_SURROGATE) {
1929                     disallowed = TRUE;
1930                     *errors |= UTF8_GOT_SURROGATE;
1931                 }
1932             }
1933             else if (possible_problems & UTF8_GOT_SUPER) {
1934                 possible_problems &= ~UTF8_GOT_SUPER;
1935
1936                 if (flags & UTF8_WARN_SUPER) {
1937                     *errors |= UTF8_GOT_SUPER;
1938
1939                     if (   ! (flags & UTF8_CHECK_ONLY)
1940                         && (msgs || ckWARN_d(WARN_NON_UNICODE)))
1941                     {
1942                         pack_warn = packWARN(WARN_NON_UNICODE);
1943
1944                         if (orig_problems & UTF8_GOT_TOO_SHORT) {
1945                             message = Perl_form(aTHX_
1946                                     "Any UTF-8 sequence that starts with"
1947                                     " \"%s\" is for a non-Unicode code point,"
1948                                     " may not be portable",
1949                                     _byte_dump_string(s0, curlen, 0));
1950                         }
1951                         else {
1952                             message = Perl_form(aTHX_ super_cp_format, uv);
1953                         }
1954                         this_flag_bit = UTF8_GOT_SUPER;
1955                     }
1956                 }
1957
1958                 /* Test for Perl's extended UTF-8 after the regular SUPER ones,
1959                  * and before possibly bailing out, so that the more dire
1960                  * warning will override the regular one. */
1961                 if (UNLIKELY(UTF8_IS_PERL_EXTENDED(s0))) {
1962                     if (  ! (flags & UTF8_CHECK_ONLY)
1963                         &&  (flags & (UTF8_WARN_PERL_EXTENDED|UTF8_WARN_SUPER))
1964                         &&  (msgs || (   ckWARN_d(WARN_NON_UNICODE)
1965                                       || ckWARN(WARN_PORTABLE))))
1966                     {
1967                         pack_warn = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE);
1968
1969                         /* If it is an overlong that evaluates to a code point
1970                          * that doesn't have to use the Perl extended UTF-8, it
1971                          * still used it, and so we output a message that
1972                          * doesn't refer to the code point.  The same is true
1973                          * if there was a SHORT malformation where the code
1974                          * point is not valid.  In that case, 'uv' will have
1975                          * been set to the REPLACEMENT CHAR, and the message
1976                          * below without the code point in it will be selected
1977                          * */
1978                         if (UNICODE_IS_PERL_EXTENDED(uv)) {
1979                             message = Perl_form(aTHX_
1980                                             PL_extended_cp_format, uv);
1981                         }
1982                         else {
1983                             message = Perl_form(aTHX_
1984                                         "Any UTF-8 sequence that starts with"
1985                                         " \"%s\" is a Perl extension, and"
1986                                         " so is not portable",
1987                                         _byte_dump_string(s0, curlen, 0));
1988                         }
1989                         this_flag_bit = UTF8_GOT_PERL_EXTENDED;
1990                     }
1991
1992                     if (flags & ( UTF8_WARN_PERL_EXTENDED
1993                                  |UTF8_DISALLOW_PERL_EXTENDED))
1994                     {
1995                         *errors |= UTF8_GOT_PERL_EXTENDED;
1996
1997                         if (flags & UTF8_DISALLOW_PERL_EXTENDED) {
1998                             disallowed = TRUE;
1999                         }
2000                     }
2001                 }
2002
2003                 if (flags & UTF8_DISALLOW_SUPER) {
2004                     *errors |= UTF8_GOT_SUPER;
2005                     disallowed = TRUE;
2006                 }
2007             }
2008             else if (possible_problems & UTF8_GOT_NONCHAR) {
2009                 possible_problems &= ~UTF8_GOT_NONCHAR;
2010
2011                 if (flags & UTF8_WARN_NONCHAR) {
2012                     *errors |= UTF8_GOT_NONCHAR;
2013
2014                     if (  ! (flags & UTF8_CHECK_ONLY)
2015                         && (msgs || ckWARN_d(WARN_NONCHAR)))
2016                     {
2017                         /* The code above should have guaranteed that we don't
2018                          * get here with errors other than overlong */
2019                         assert (! (orig_problems
2020                                         & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR)));
2021
2022                         pack_warn = packWARN(WARN_NONCHAR);
2023                         message = Perl_form(aTHX_ nonchar_cp_format, uv);
2024                         this_flag_bit = UTF8_GOT_NONCHAR;
2025                     }
2026                 }
2027
2028                 if (flags & UTF8_DISALLOW_NONCHAR) {
2029                     disallowed = TRUE;
2030                     *errors |= UTF8_GOT_NONCHAR;
2031                 }
2032             }
2033             else if (possible_problems & UTF8_GOT_LONG) {
2034                 possible_problems &= ~UTF8_GOT_LONG;
2035                 *errors |= UTF8_GOT_LONG;
2036
2037                 if (flags & UTF8_ALLOW_LONG) {
2038
2039                     /* We don't allow the actual overlong value, unless the
2040                      * special extra bit is also set */
2041                     if (! (flags & (   UTF8_ALLOW_LONG_AND_ITS_VALUE
2042                                     & ~UTF8_ALLOW_LONG)))
2043                     {
2044                         uv = UNICODE_REPLACEMENT;
2045                     }
2046                 }
2047                 else {
2048                     disallowed = TRUE;
2049
2050                     if ((   msgs
2051                          || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
2052                     {
2053                         pack_warn = packWARN(WARN_UTF8);
2054
2055                         /* These error types cause 'uv' to be something that
2056                          * isn't what was intended, so can't use it in the
2057                          * message.  The other error types either can't
2058                          * generate an overlong, or else the 'uv' is valid */
2059                         if (orig_problems &
2060                                         (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
2061                         {
2062                             message = Perl_form(aTHX_
2063                                     "%s: %s (any UTF-8 sequence that starts"
2064                                     " with \"%s\" is overlong which can and"
2065                                     " should be represented with a"
2066                                     " different, shorter sequence)",
2067                                     malformed_text,
2068                                     _byte_dump_string(s0, send - s0, 0),
2069                                     _byte_dump_string(s0, curlen, 0));
2070                         }
2071                         else {
2072                             U8 tmpbuf[UTF8_MAXBYTES+1];
2073                             const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
2074                                                                         uv, 0);
2075                             /* Don't use U+ for non-Unicode code points, which
2076                              * includes those in the Latin1 range */
2077                             const char * preface = (   UNICODE_IS_SUPER(uv)
2078 #ifdef EBCDIC
2079                                                     || uv <= 0xFF
2080 #endif
2081                                                    )
2082                                                    ? "0x"
2083                                                    : "U+";
2084                             message = Perl_form(aTHX_
2085                                 "%s: %s (overlong; instead use %s to represent"
2086                                 " %s%0*" UVXf ")",
2087                                 malformed_text,
2088                                 _byte_dump_string(s0, send - s0, 0),
2089                                 _byte_dump_string(tmpbuf, e - tmpbuf, 0),
2090                                 preface,
2091                                 ((uv < 256) ? 2 : 4), /* Field width of 2 for
2092                                                          small code points */
2093                                 UNI_TO_NATIVE(uv));
2094                         }
2095                         this_flag_bit = UTF8_GOT_LONG;
2096                     }
2097                 }
2098             } /* End of looking through the possible flags */
2099
2100             /* Display the message (if any) for the problem being handled in
2101              * this iteration of the loop */
2102             if (message) {
2103                 if (msgs) {
2104                     assert(this_flag_bit);
2105
2106                     if (*msgs == NULL) {
2107                         *msgs = newAV();
2108                     }
2109
2110                     av_push(*msgs, newRV_noinc((SV*) new_msg_hv(message,
2111                                                                 pack_warn,
2112                                                                 this_flag_bit)));
2113                 }
2114                 else if (PL_op)
2115                     Perl_warner(aTHX_ pack_warn, "%s in %s", message,
2116                                                  OP_DESC(PL_op));
2117                 else
2118                     Perl_warner(aTHX_ pack_warn, "%s", message);
2119             }
2120         }   /* End of 'while (possible_problems)' */
2121
2122         /* Since there was a possible problem, the returned length may need to
2123          * be changed from the one stored at the beginning of this function.
2124          * Instead of trying to figure out if that's needed, just do it. */
2125         if (retlen) {
2126             *retlen = curlen;
2127         }
2128
2129         if (disallowed) {
2130             if (flags & UTF8_CHECK_ONLY && retlen) {
2131                 *retlen = ((STRLEN) -1);
2132             }
2133             return 0;
2134         }
2135     }
2136
2137     return UNI_TO_NATIVE(uv);
2138 }
2139
2140 /*
2141 =for apidoc utf8_to_uvchr_buf
2142
2143 Returns the native code point of the first character in the string C<s> which
2144 is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
2145 C<*retlen> will be set to the length, in bytes, of that character.
2146
2147 If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
2148 enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
2149 C<NULL>) to -1.  If those warnings are off, the computed value, if well-defined
2150 (or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
2151 C<*retlen> is set (if C<retlen> isn't C<NULL>) so that (S<C<s> + C<*retlen>>) is
2152 the next possible position in C<s> that could begin a non-malformed character.
2153 See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
2154 returned.
2155
2156 =cut
2157
2158 Also implemented as a macro in utf8.h
2159
2160 */
2161
2162
2163 UV
2164 Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
2165 {
2166     PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
2167
2168     return utf8_to_uvchr_buf_helper(s, send, retlen);
2169 }
2170
2171 /* This is marked as deprecated
2172  *
2173 =for apidoc utf8_to_uvuni_buf
2174
2175 Only in very rare circumstances should code need to be dealing in Unicode
2176 (as opposed to native) code points.  In those few cases, use
2177 C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|perlapi/utf8_to_uvchr_buf>> instead.
2178 If you are not absolutely sure this is one of those cases, then assume it isn't
2179 and use plain C<utf8_to_uvchr_buf> instead.
2180
2181 Returns the Unicode (not-native) code point of the first character in the
2182 string C<s> which
2183 is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
2184 C<retlen> will be set to the length, in bytes, of that character.
2185
2186 If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
2187 enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
2188 NULL) to -1.  If those warnings are off, the computed value if well-defined (or
2189 the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
2190 is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
2191 next possible position in C<s> that could begin a non-malformed character.
2192 See L<perlapi/utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
2193 returned.
2194
2195 =cut
2196 */
2197
2198 UV
2199 Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
2200 {
2201     PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
2202
2203     assert(send > s);
2204
2205     return NATIVE_TO_UNI(utf8_to_uvchr_buf(s, send, retlen));
2206 }
2207
2208 /*
2209 =for apidoc utf8_length
2210
2211 Returns the number of characters in the sequence of UTF-8-encoded bytes starting
2212 at C<s> and ending at the byte just before C<e>.  If <s> and <e> point to the
2213 same place, it returns 0 with no warning raised.
2214
2215 If C<e E<lt> s> or if the scan would end up past C<e>, it raises a UTF8 warning
2216 and returns the number of valid characters.
2217
2218 =cut
2219 */
2220
2221 STRLEN
2222 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
2223 {
2224     STRLEN len = 0;
2225
2226     PERL_ARGS_ASSERT_UTF8_LENGTH;
2227
2228     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
2229      * the bitops (especially ~) can create illegal UTF-8.
2230      * In other words: in Perl UTF-8 is not just for Unicode. */
2231
2232     while (s < e) {
2233         Ptrdiff_t expected_byte_count = UTF8SKIP(s);
2234
2235         if (UNLIKELY(e - s  < expected_byte_count)) {
2236             goto warn_and_return;
2237         }
2238
2239         len++;
2240         s += expected_byte_count;
2241     }
2242
2243     if (LIKELY(e == s)) {
2244         return len;
2245     }
2246
2247     /* Here, s > e on entry */
2248
2249   warn_and_return:
2250     if (PL_op)
2251         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
2252                          "%s in %s", unees, OP_DESC(PL_op));
2253     else
2254         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
2255
2256     return len;
2257 }
2258
2259 /*
2260 =for apidoc bytes_cmp_utf8
2261
2262 Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
2263 sequence of characters (stored as UTF-8)
2264 in C<u>, C<ulen>.  Returns 0 if they are
2265 equal, -1 or -2 if the first string is less than the second string, +1 or +2
2266 if the first string is greater than the second string.
2267
2268 -1 or +1 is returned if the shorter string was identical to the start of the
2269 longer string.  -2 or +2 is returned if
2270 there was a difference between characters
2271 within the strings.
2272
2273 =cut
2274 */
2275
2276 int
2277 Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
2278 {
2279     const U8 *const bend = b + blen;
2280     const U8 *const uend = u + ulen;
2281
2282     PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
2283
2284     while (b < bend && u < uend) {
2285         U8 c = *u++;
2286         if (!UTF8_IS_INVARIANT(c)) {
2287             if (UTF8_IS_DOWNGRADEABLE_START(c)) {
2288                 if (u < uend) {
2289                     U8 c1 = *u++;
2290                     if (UTF8_IS_CONTINUATION(c1)) {
2291                         c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
2292                     } else {
2293                         /* diag_listed_as: Malformed UTF-8 character%s */
2294                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
2295                               "%s %s%s",
2296                               unexpected_non_continuation_text(u - 2, 2, 1, 2),
2297                               PL_op ? " in " : "",
2298                               PL_op ? OP_DESC(PL_op) : "");
2299                         return -2;
2300                     }
2301                 } else {
2302                     if (PL_op)
2303                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
2304                                          "%s in %s", unees, OP_DESC(PL_op));
2305                     else
2306                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
2307                     return -2; /* Really want to return undef :-)  */
2308                 }
2309             } else {
2310                 return -2;
2311             }
2312         }
2313         if (*b != c) {
2314             return *b < c ? -2 : +2;
2315         }
2316         ++b;
2317     }
2318
2319     if (b == bend && u == uend)
2320         return 0;
2321
2322     return b < bend ? +1 : -1;
2323 }
2324
2325 /*
2326 =for apidoc utf8_to_bytes
2327
2328 Converts a string C<"s"> of length C<*lenp> from UTF-8 into native byte encoding.
2329 Unlike L</bytes_to_utf8>, this over-writes the original string, and
2330 updates C<*lenp> to contain the new length.
2331 Returns zero on failure (leaving C<"s"> unchanged) setting C<*lenp> to -1.
2332
2333 Upon successful return, the number of variants in the string can be computed by
2334 having saved the value of C<*lenp> before the call, and subtracting the
2335 after-call value of C<*lenp> from it.
2336
2337 If you need a copy of the string, see L</bytes_from_utf8>.
2338
2339 =cut
2340 */
2341
2342 U8 *
2343 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp)
2344 {
2345     U8 * first_variant;
2346
2347     PERL_ARGS_ASSERT_UTF8_TO_BYTES;
2348     PERL_UNUSED_CONTEXT;
2349
2350     /* This is a no-op if no variants at all in the input */
2351     if (is_utf8_invariant_string_loc(s, *lenp, (const U8 **) &first_variant)) {
2352         return s;
2353     }
2354
2355     {
2356         U8 * const save = s;
2357         U8 * const send = s + *lenp;
2358         U8 * d;
2359
2360         /* Nothing before the first variant needs to be changed, so start the real
2361          * work there */
2362         s = first_variant;
2363         while (s < send) {
2364             if (! UTF8_IS_INVARIANT(*s)) {
2365                 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
2366                     *lenp = ((STRLEN) -1);
2367                     return 0;
2368                 }
2369                 s++;
2370             }
2371             s++;
2372         }
2373
2374         /* Is downgradable, so do it */
2375         d = s = first_variant;
2376         while (s < send) {
2377             U8 c = *s++;
2378             if (! UVCHR_IS_INVARIANT(c)) {
2379                 /* Then it is two-byte encoded */
2380                 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
2381                 s++;
2382             }
2383             *d++ = c;
2384         }
2385         *d = '\0';
2386         *lenp = d - save;
2387
2388         return save;
2389     }
2390 }
2391
2392 /*
2393 =for apidoc bytes_from_utf8
2394
2395 Converts a potentially UTF-8 encoded string C<s> of length C<*lenp> into native
2396 byte encoding.  On input, the boolean C<*is_utf8p> gives whether or not C<s> is
2397 actually encoded in UTF-8.
2398
2399 Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, this is non-destructive of
2400 the input string.
2401
2402 Do nothing if C<*is_utf8p> is 0, or if there are code points in the string
2403 not expressible in native byte encoding.  In these cases, C<*is_utf8p> and
2404 C<*lenp> are unchanged, and the return value is the original C<s>.
2405
2406 Otherwise, C<*is_utf8p> is set to 0, and the return value is a pointer to a
2407 newly created string containing a downgraded copy of C<s>, and whose length is
2408 returned in C<*lenp>, updated.  The new string is C<NUL>-terminated.  The
2409 caller is responsible for arranging for the memory used by this string to get
2410 freed.
2411
2412 Upon successful return, the number of variants in the string can be computed by
2413 having saved the value of C<*lenp> before the call, and subtracting the
2414 after-call value of C<*lenp> from it.
2415
2416 =cut
2417
2418 There is a macro that avoids this function call, but this is retained for
2419 anyone who calls it with the Perl_ prefix */
2420
2421 U8 *
2422 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p)
2423 {
2424     PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
2425     PERL_UNUSED_CONTEXT;
2426
2427     return bytes_from_utf8_loc(s, lenp, is_utf8p, NULL);
2428 }
2429
2430 /*
2431 =for apidoc bytes_from_utf8_loc
2432
2433 Like C<L<perlapi/bytes_from_utf8>()>, but takes an extra parameter, a pointer
2434 to where to store the location of the first character in C<"s"> that cannot be
2435 converted to non-UTF8.
2436
2437 If that parameter is C<NULL>, this function behaves identically to
2438 C<bytes_from_utf8>.
2439
2440 Otherwise if C<*is_utf8p> is 0 on input, the function behaves identically to
2441 C<bytes_from_utf8>, except it also sets C<*first_non_downgradable> to C<NULL>.
2442
2443 Otherwise, the function returns a newly created C<NUL>-terminated string
2444 containing the non-UTF8 equivalent of the convertible first portion of
2445 C<"s">.  C<*lenp> is set to its length, not including the terminating C<NUL>.
2446 If the entire input string was converted, C<*is_utf8p> is set to a FALSE value,
2447 and C<*first_non_downgradable> is set to C<NULL>.
2448
2449 Otherwise, C<*first_non_downgradable> is set to point to the first byte of the
2450 first character in the original string that wasn't converted.  C<*is_utf8p> is
2451 unchanged.  Note that the new string may have length 0.
2452
2453 Another way to look at it is, if C<*first_non_downgradable> is non-C<NULL> and
2454 C<*is_utf8p> is TRUE, this function starts at the beginning of C<"s"> and
2455 converts as many characters in it as possible stopping at the first one it
2456 finds that can't be converted to non-UTF-8.  C<*first_non_downgradable> is
2457 set to point to that.  The function returns the portion that could be converted
2458 in a newly created C<NUL>-terminated string, and C<*lenp> is set to its length,
2459 not including the terminating C<NUL>.  If the very first character in the
2460 original could not be converted, C<*lenp> will be 0, and the new string will
2461 contain just a single C<NUL>.  If the entire input string was converted,
2462 C<*is_utf8p> is set to FALSE and C<*first_non_downgradable> is set to C<NULL>.
2463
2464 Upon successful return, the number of variants in the converted portion of the
2465 string can be computed by having saved the value of C<*lenp> before the call,
2466 and subtracting the after-call value of C<*lenp> from it.
2467
2468 =cut
2469
2470
2471 */
2472
2473 U8 *
2474 Perl_bytes_from_utf8_loc(const U8 *s, STRLEN *lenp, bool *is_utf8p, const U8** first_unconverted)
2475 {
2476     U8 *d;
2477     const U8 *original = s;
2478     U8 *converted_start;
2479     const U8 *send = s + *lenp;
2480
2481     PERL_ARGS_ASSERT_BYTES_FROM_UTF8_LOC;
2482
2483     if (! *is_utf8p) {
2484         if (first_unconverted) {
2485             *first_unconverted = NULL;
2486         }
2487
2488         return (U8 *) original;
2489     }
2490
2491     Newx(d, (*lenp) + 1, U8);
2492
2493     converted_start = d;
2494     while (s < send) {
2495         U8 c = *s++;
2496         if (! UTF8_IS_INVARIANT(c)) {
2497
2498             /* Then it is multi-byte encoded.  If the code point is above 0xFF,
2499              * have to stop now */
2500             if (UNLIKELY (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s - 1, send))) {
2501                 if (first_unconverted) {
2502                     *first_unconverted = s - 1;
2503                     goto finish_and_return;
2504                 }
2505                 else {
2506                     Safefree(converted_start);
2507                     return (U8 *) original;
2508                 }
2509             }
2510
2511             c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
2512             s++;
2513         }
2514         *d++ = c;
2515     }
2516
2517     /* Here, converted the whole of the input */
2518     *is_utf8p = FALSE;
2519     if (first_unconverted) {
2520         *first_unconverted = NULL;
2521     }
2522
2523   finish_and_return:
2524     *d = '\0';
2525     *lenp = d - converted_start;
2526
2527     /* Trim unused space */
2528     Renew(converted_start, *lenp + 1, U8);
2529
2530     return converted_start;
2531 }
2532
2533 /*
2534 =for apidoc bytes_to_utf8
2535
2536 Converts a string C<s> of length C<*lenp> bytes from the native encoding into
2537 UTF-8.
2538 Returns a pointer to the newly-created string, and sets C<*lenp> to
2539 reflect the new length in bytes.  The caller is responsible for arranging for
2540 the memory used by this string to get freed.
2541
2542 Upon successful return, the number of variants in the string can be computed by
2543 having saved the value of C<*lenp> before the call, and subtracting it from the
2544 after-call value of C<*lenp>.
2545
2546 A C<NUL> character will be written after the end of the string.
2547
2548 If you want to convert to UTF-8 from encodings other than
2549 the native (Latin1 or EBCDIC),
2550 see L</sv_recode_to_utf8>().
2551
2552 =cut
2553 */
2554
2555 U8*
2556 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
2557 {
2558     const U8 * const send = s + (*lenp);
2559     U8 *d;
2560     U8 *dst;
2561
2562     PERL_ARGS_ASSERT_BYTES_TO_UTF8;
2563     PERL_UNUSED_CONTEXT;
2564
2565     /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
2566     Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8);
2567     dst = d;
2568
2569     while (s < send) {
2570         append_utf8_from_native_byte(*s, &d);
2571         s++;
2572     }
2573
2574     *d = '\0';
2575     *lenp = d-dst;
2576
2577     return dst;
2578 }
2579
2580 /*
2581  * Convert native UTF-16 to UTF-8. Called via the more public functions
2582  * utf16_to_utf8() for big-endian and utf16_to_utf8_reversed() for
2583  * little-endian,
2584  *
2585  * 'p' is the UTF-16 input string, passed as a pointer to U8.
2586  * 'bytelen' is its length (must be even)
2587  * 'd' is the pointer to the destination buffer.  The caller must ensure that
2588  *     the space is large enough.  The maximum expansion factor is 2 times
2589  *     'bytelen'.  1.5 if never going to run on an EBCDIC box.
2590  * '*newlen' will contain the number of bytes this function filled of 'd'.
2591  * 'high_byte' is 0 if UTF-16BE; 1 if UTF-16LE
2592  * 'low_byte' is 1  if UTF-16BE; 0 if UTF-16LE
2593  *
2594  * The expansion factor is because UTF-16 requires 2 bytes for every code point
2595  * below 0x10000; otherwise 4 bytes.  UTF-8 requires 1-3 bytes for every code
2596  * point below 0x1000; otherwise 4 bytes.  UTF-EBCDIC requires 1-4 bytes for
2597  * every code point below 0x1000; otherwise 4-5 bytes.
2598  *
2599  * The worst case is where every code point is below U+10000, hence requiring 2
2600  * UTF-16 bytes, but is U+0800 or higher on ASCII platforms, requiring 3 UTF-8
2601  * bytes; or >= U+4000 on EBCDIC requiring 4 UTF-8 bytes.
2602  *
2603  * Do not use in-place. */
2604
2605 U8*
2606 Perl_utf16_to_utf8_base(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen,
2607                               const bool high_byte, /* Which of next two bytes is
2608                                                   high order */
2609                               const bool low_byte)
2610 {
2611     U8* pend;
2612     U8* dstart = d;
2613
2614     PERL_ARGS_ASSERT_UTF16_TO_UTF8_BASE;
2615
2616     if (bytelen & 1)
2617         Perl_croak(aTHX_ "panic: utf16_to_utf8%s: odd bytelen %" UVuf,
2618                 ((high_byte == 0) ? "" : "_reversed"), (UV)bytelen);
2619     pend = p + bytelen;
2620
2621     while (p < pend) {
2622
2623         /* Next 16 bits is what we want.  (The bool is cast to U8 because on
2624          * platforms where a bool is implemented as a signed char, a compiler
2625          * warning may be generated) */
2626         U32 uv = (p[(U8) high_byte] << 8) + p[(U8) low_byte];
2627         p += 2;
2628
2629         /* If it's a surrogate, we find the uv that the surrogate pair encodes.
2630          * */
2631         if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
2632
2633 #define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
2634 #define LAST_HIGH_SURROGATE  0xDBFF
2635 #define FIRST_LOW_SURROGATE  0xDC00
2636 #define LAST_LOW_SURROGATE   UNICODE_SURROGATE_LAST
2637 #define FIRST_IN_PLANE1      0x10000
2638
2639             if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
2640                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
2641             }
2642             else {
2643                 U32 low_surrogate = (p[(U8) high_byte] << 8) + p[(U8) low_byte];
2644                 if (UNLIKELY(! inRANGE(low_surrogate, FIRST_LOW_SURROGATE,
2645                                                        LAST_LOW_SURROGATE)))
2646                 {
2647                     Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
2648                 }
2649
2650                 p += 2;
2651
2652                 /* Here uv is the high surrogate.  Combine with low surrogate
2653                  * just computed to form the actual U32 code point.
2654                  *
2655                  * From https://unicode.org/faq/utf_bom.html#utf16-4 */
2656                 uv = FIRST_IN_PLANE1 + (uv << 10) - (FIRST_HIGH_SURROGATE << 10)
2657                                      + low_surrogate - FIRST_LOW_SURROGATE;
2658             }
2659         }
2660
2661         /* Here, 'uv' is the real U32 we want to find the UTF-8 of */
2662         d = uvchr_to_utf8(d, uv);
2663     }
2664
2665     *newlen = d - dstart;
2666     return d;
2667 }
2668
2669 U8*
2670 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
2671 {
2672     PERL_ARGS_ASSERT_UTF16_TO_UTF8;
2673
2674     return utf16_to_utf8(p, d, bytelen, newlen);
2675 }
2676
2677 U8*
2678 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
2679 {
2680     PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
2681
2682     return utf16_to_utf8_reversed(p, d, bytelen, newlen);
2683 }
2684
2685 /*
2686  * Convert UTF-8 to native UTF-16. Called via the macros utf8_to_utf16() for
2687  * big-endian and utf8_to_utf16_reversed() for little-endian,
2688  *
2689  * 's' is the UTF-8 input string, passed as a pointer to U8.
2690  * 'bytelen' is its length
2691  * 'd' is the pointer to the destination buffer, currently passed as U8 *.  The
2692  *     caller must ensure that the space is large enough.  The maximum
2693  *     expansion factor is 2 times 'bytelen'.  This happens when the input is
2694  *     entirely single-byte ASCII, expanding to two-byte UTF-16.
2695  * '*newlen' will contain the number of bytes this function filled of 'd'.
2696  * 'high_byte' is 0 if UTF-16BE; 1 if UTF-16LE
2697  * 'low_byte'  is 1 if UTF-16BE; 0 if UTF-16LE
2698  *
2699  * Do not use in-place. */
2700 U8*
2701 Perl_utf8_to_utf16_base(pTHX_ U8* s, U8* d, Size_t bytelen, Size_t *newlen,
2702                               const bool high_byte, /* Which of next two bytes
2703                                                        is high order */
2704                               const bool low_byte)
2705 {
2706     U8* send;
2707     U8* dstart = d;
2708
2709     PERL_ARGS_ASSERT_UTF8_TO_UTF16_BASE;
2710
2711     send = s + bytelen;
2712
2713     while (s < send) {
2714         STRLEN retlen;
2715         UV uv = utf8n_to_uvchr(s, send - s, &retlen,
2716                                /* No surrogates nor above-Unicode */
2717                                UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
2718
2719         /* The modern method is to keep going with malformed input,
2720          * substituting the REPLACEMENT CHARACTER */
2721         if (UNLIKELY(uv == 0 && *s != '\0')) {
2722             uv = UNICODE_REPLACEMENT;
2723         }
2724
2725         if (uv >= FIRST_IN_PLANE1) {    /* Requires a surrogate pair */
2726
2727             /* From https://unicode.org/faq/utf_bom.html#utf16-4 */
2728             U32 high_surrogate = (uv >> 10) - (FIRST_IN_PLANE1 >> 10)
2729                                + FIRST_HIGH_SURROGATE;
2730
2731             /* (The bool is cast to U8 because on platforms where a bool is
2732              * implemented as a signed char, a compiler warning may be
2733              * generated) */
2734             d[(U8) high_byte] = high_surrogate >> 8;
2735             d[(U8) low_byte]  = high_surrogate & nBIT_MASK(8);
2736             d += 2;
2737
2738             /* The low surrogate is the lower 10 bits plus the offset */
2739             uv &= nBIT_MASK(10);
2740             uv += FIRST_LOW_SURROGATE;
2741
2742             /* Drop down to output the low surrogate like it were a
2743              * non-surrogate */
2744         }
2745
2746         d[(U8) high_byte] = uv >> 8;
2747         d[(U8) low_byte] = uv & nBIT_MASK(8);
2748         d += 2;
2749
2750         s += retlen;
2751     }
2752
2753     *newlen = d - dstart;
2754     return d;
2755 }
2756
2757 bool
2758 Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
2759 {
2760     return _invlist_contains_cp(PL_XPosix_ptrs[classnum], c);
2761 }
2762
2763 bool
2764 Perl__is_uni_perl_idcont(pTHX_ UV c)
2765 {
2766     return _invlist_contains_cp(PL_utf8_perl_idcont, c);
2767 }
2768
2769 bool
2770 Perl__is_uni_perl_idstart(pTHX_ UV c)
2771 {
2772     return _invlist_contains_cp(PL_utf8_perl_idstart, c);
2773 }
2774
2775 UV
2776 Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp,
2777                                   const char S_or_s)
2778 {
2779     /* We have the latin1-range values compiled into the core, so just use
2780      * those, converting the result to UTF-8.  The only difference between upper
2781      * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
2782      * either "SS" or "Ss".  Which one to use is passed into the routine in
2783      * 'S_or_s' to avoid a test */
2784
2785     UV converted = toUPPER_LATIN1_MOD(c);
2786
2787     PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
2788
2789     assert(S_or_s == 'S' || S_or_s == 's');
2790
2791     if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
2792                                              characters in this range */
2793         *p = (U8) converted;
2794         *lenp = 1;
2795         return converted;
2796     }
2797
2798     /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
2799      * which it maps to one of them, so as to only have to have one check for
2800      * it in the main case */
2801     if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
2802         switch (c) {
2803             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
2804                 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
2805                 break;
2806             case MICRO_SIGN:
2807                 converted = GREEK_CAPITAL_LETTER_MU;
2808                 break;
2809 #if    UNICODE_MAJOR_VERSION > 2                                        \
2810    || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1           \
2811                                   && UNICODE_DOT_DOT_VERSION >= 8)
2812             case LATIN_SMALL_LETTER_SHARP_S:
2813                 *(p)++ = 'S';
2814                 *p = S_or_s;
2815                 *lenp = 2;
2816                 return 'S';
2817 #endif
2818             default:
2819                 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect"
2820                                  " '%c' to map to '%c'",
2821                                  c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
2822                 NOT_REACHED; /* NOTREACHED */
2823         }
2824     }
2825
2826     *(p)++ = UTF8_TWO_BYTE_HI(converted);
2827     *p = UTF8_TWO_BYTE_LO(converted);
2828     *lenp = 2;
2829
2830     return converted;
2831 }
2832
2833 /* If compiled on an early Unicode version, there may not be auxiliary tables
2834  * */
2835 #ifndef HAS_UC_AUX_TABLES
2836 #  define UC_AUX_TABLE_ptrs     NULL
2837 #  define UC_AUX_TABLE_lengths  NULL
2838 #endif
2839 #ifndef HAS_TC_AUX_TABLES
2840 #  define TC_AUX_TABLE_ptrs     NULL
2841 #  define TC_AUX_TABLE_lengths  NULL
2842 #endif
2843 #ifndef HAS_LC_AUX_TABLES
2844 #  define LC_AUX_TABLE_ptrs     NULL
2845 #  define LC_AUX_TABLE_lengths  NULL
2846 #endif
2847 #ifndef HAS_CF_AUX_TABLES
2848 #  define CF_AUX_TABLE_ptrs     NULL
2849 #  define CF_AUX_TABLE_lengths  NULL
2850 #endif
2851
2852 /* Call the function to convert a UTF-8 encoded character to the specified case.
2853  * Note that there may be more than one character in the result.
2854  * 's' is a pointer to the first byte of the input character
2855  * 'd' will be set to the first byte of the string of changed characters.  It
2856  *      needs to have space for UTF8_MAXBYTES_CASE+1 bytes
2857  * 'lenp' will be set to the length in bytes of the string of changed characters
2858  *
2859  * The functions return the ordinal of the first character in the string of
2860  * 'd' */
2861 #define CALL_UPPER_CASE(uv, s, d, lenp)                                     \
2862                 _to_utf8_case(uv, s, d, lenp, PL_utf8_toupper,              \
2863                                               Uppercase_Mapping_invmap,     \
2864                                               UC_AUX_TABLE_ptrs,            \
2865                                               UC_AUX_TABLE_lengths,         \
2866                                               "uppercase")
2867 #define CALL_TITLE_CASE(uv, s, d, lenp)                                     \
2868                 _to_utf8_case(uv, s, d, lenp, PL_utf8_totitle,              \
2869                                               Titlecase_Mapping_invmap,     \
2870                                               TC_AUX_TABLE_ptrs,            \
2871                                               TC_AUX_TABLE_lengths,         \
2872                                               "titlecase")
2873 #define CALL_LOWER_CASE(uv, s, d, lenp)                                     \
2874                 _to_utf8_case(uv, s, d, lenp, PL_utf8_tolower,              \
2875                                               Lowercase_Mapping_invmap,     \
2876                                               LC_AUX_TABLE_ptrs,            \
2877                                               LC_AUX_TABLE_lengths,         \
2878                                               "lowercase")
2879
2880
2881 /* This additionally has the input parameter 'specials', which if non-zero will
2882  * cause this to use the specials hash for folding (meaning get full case
2883  * folding); otherwise, when zero, this implies a simple case fold */
2884 #define CALL_FOLD_CASE(uv, s, d, lenp, specials)                            \
2885         (specials)                                                          \
2886         ?  _to_utf8_case(uv, s, d, lenp, PL_utf8_tofold,                    \
2887                                           Case_Folding_invmap,              \
2888                                           CF_AUX_TABLE_ptrs,                \
2889                                           CF_AUX_TABLE_lengths,             \
2890                                           "foldcase")                       \
2891         : _to_utf8_case(uv, s, d, lenp, PL_utf8_tosimplefold,               \
2892                                          Simple_Case_Folding_invmap,        \
2893                                          NULL, NULL,                        \
2894                                          "foldcase")
2895
2896 UV
2897 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
2898 {
2899     /* Convert the Unicode character whose ordinal is <c> to its uppercase
2900      * version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
2901      * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
2902      * the changed version may be longer than the original character.
2903      *
2904      * The ordinal of the first character of the changed version is returned
2905      * (but note, as explained above, that there may be more.) */
2906
2907     PERL_ARGS_ASSERT_TO_UNI_UPPER;
2908
2909     if (c < 256) {
2910         return _to_upper_title_latin1((U8) c, p, lenp, 'S');
2911     }
2912
2913     return CALL_UPPER_CASE(c, NULL, p, lenp);
2914 }
2915
2916 UV
2917 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
2918 {
2919     PERL_ARGS_ASSERT_TO_UNI_TITLE;
2920
2921     if (c < 256) {
2922         return _to_upper_title_latin1((U8) c, p, lenp, 's');
2923     }
2924
2925     return CALL_TITLE_CASE(c, NULL, p, lenp);
2926 }
2927
2928 STATIC U8
2929 S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp, const char dummy)
2930 {
2931     /* We have the latin1-range values compiled into the core, so just use
2932      * those, converting the result to UTF-8.  Since the result is always just
2933      * one character, we allow <p> to be NULL */
2934
2935     U8 converted = toLOWER_LATIN1(c);
2936
2937     PERL_UNUSED_ARG(dummy);
2938
2939     if (p != NULL) {
2940         if (NATIVE_BYTE_IS_INVARIANT(converted)) {
2941             *p = converted;
2942             *lenp = 1;
2943         }
2944         else {
2945             /* Result is known to always be < 256, so can use the EIGHT_BIT
2946              * macros */
2947             *p = UTF8_EIGHT_BIT_HI(converted);
2948             *(p+1) = UTF8_EIGHT_BIT_LO(converted);
2949             *lenp = 2;
2950         }
2951     }
2952     return converted;
2953 }
2954
2955 UV
2956 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
2957 {
2958     PERL_ARGS_ASSERT_TO_UNI_LOWER;
2959
2960     if (c < 256) {
2961         return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ );
2962     }
2963
2964     return CALL_LOWER_CASE(c, NULL, p, lenp);
2965 }
2966
2967 UV
2968 Perl__to_fold_latin1(const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
2969 {
2970     /* Corresponds to to_lower_latin1(); <flags> bits meanings:
2971      *      FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
2972      *      FOLD_FLAGS_FULL  iff full folding is to be used;
2973      *
2974      *  Not to be used for locale folds
2975      */
2976
2977     UV converted;
2978
2979     PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
2980
2981     assert (! (flags & FOLD_FLAGS_LOCALE));
2982
2983     if (UNLIKELY(c == MICRO_SIGN)) {
2984         converted = GREEK_SMALL_LETTER_MU;
2985     }
2986 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
2987    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
2988                                       || UNICODE_DOT_DOT_VERSION > 0)
2989     else if (   (flags & FOLD_FLAGS_FULL)
2990              && UNLIKELY(c == LATIN_SMALL_LETTER_SHARP_S))
2991     {
2992         /* If can't cross 127/128 boundary, can't return "ss"; instead return
2993          * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
2994          * under those circumstances. */
2995         if (flags & FOLD_FLAGS_NOMIX_ASCII) {
2996             *lenp = 2 * STRLENs(LATIN_SMALL_LETTER_LONG_S_UTF8);
2997             Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
2998                  p, *lenp, U8);
2999             return LATIN_SMALL_LETTER_LONG_S;
3000         }
3001         else {
3002             *(p)++ = 's';
3003             *p = 's';
3004             *lenp = 2;
3005             return 's';
3006         }
3007     }
3008 #endif
3009     else { /* In this range the fold of all other characters is their lower
3010               case */
3011         converted = toLOWER_LATIN1(c);
3012     }
3013
3014     if (UVCHR_IS_INVARIANT(converted)) {
3015         *p = (U8) converted;
3016         *lenp = 1;
3017     }
3018     else {
3019         *(p)++ = UTF8_TWO_BYTE_HI(converted);
3020         *p = UTF8_TWO_BYTE_LO(converted);
3021         *lenp = 2;
3022     }
3023
3024     return converted;
3025 }
3026
3027 UV
3028 Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
3029 {
3030
3031     /* Not currently externally documented, and subject to change
3032      *  <flags> bits meanings:
3033      *      FOLD_FLAGS_FULL  iff full folding is to be used;
3034      *      FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
3035      *                        locale are to be used.
3036      *      FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
3037      */
3038
3039     PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
3040
3041     if (flags & FOLD_FLAGS_LOCALE) {
3042         /* Treat a non-Turkic UTF-8 locale as not being in locale at all,
3043          * except for potentially warning */
3044         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
3045         if (IN_UTF8_CTYPE_LOCALE && ! PL_in_utf8_turkic_locale) {
3046             flags &= ~FOLD_FLAGS_LOCALE;
3047         }
3048         else {
3049             goto needs_full_generality;
3050         }
3051     }
3052
3053     if (c < 256) {
3054         return _to_fold_latin1((U8) c, p, lenp,
3055                             flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
3056     }
3057
3058     /* Here, above 255.  If no special needs, just use the macro */
3059     if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
3060         return CALL_FOLD_CASE(c, NULL, p, lenp, flags & FOLD_FLAGS_FULL);
3061     }
3062     else {  /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with
3063                the special flags. */
3064         U8 utf8_c[UTF8_MAXBYTES + 1];
3065
3066       needs_full_generality:
3067         uvchr_to_utf8(utf8_c, c);
3068         return _toFOLD_utf8_flags(utf8_c, utf8_c + C_ARRAY_LENGTH(utf8_c),
3069                                   p, lenp, flags);
3070     }
3071 }
3072
3073 PERL_STATIC_INLINE bool
3074 S_is_utf8_common(pTHX_ const U8 *const p, const U8 * const e,
3075                        SV* const invlist)
3076 {
3077     /* returns a boolean giving whether or not the UTF8-encoded character that
3078      * starts at <p>, and extending no further than <e - 1> is in the inversion
3079      * list <invlist>. */
3080
3081     UV cp = utf8n_to_uvchr(p, e - p, NULL, 0);
3082
3083     PERL_ARGS_ASSERT_IS_UTF8_COMMON;
3084
3085     if (cp == 0 && (p >= e || *p != '\0')) {
3086         _force_out_malformed_utf8_message(p, e, 0, 1);
3087         NOT_REACHED; /* NOTREACHED */
3088     }
3089
3090     assert(invlist);
3091     return _invlist_contains_cp(invlist, cp);
3092 }
3093
3094 #if 0   /* Not currently used, but may be needed in the future */
3095 PERLVAR(I, seen_deprecated_macro, HV *)
3096
3097 STATIC void
3098 S_warn_on_first_deprecated_use(pTHX_ const char * const name,
3099                                      const char * const alternative,
3100                                      const bool use_locale,
3101                                      const char * const file,
3102                                      const unsigned line)
3103 {
3104     const char * key;
3105
3106     PERL_ARGS_ASSERT_WARN_ON_FIRST_DEPRECATED_USE;
3107
3108     if (ckWARN_d(WARN_DEPRECATED)) {
3109
3110         key = Perl_form(aTHX_ "%s;%d;%s;%d", name, use_locale, file, line);
3111         if (! hv_fetch(PL_seen_deprecated_macro, key, strlen(key), 0)) {
3112             if (! PL_seen_deprecated_macro) {
3113                 PL_seen_deprecated_macro = newHV();
3114             }
3115             if (! hv_store(PL_seen_deprecated_macro, key,
3116                            strlen(key), &PL_sv_undef, 0))
3117             {
3118                 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3119             }
3120
3121             if (instr(file, "mathoms.c")) {
3122                 Perl_warner(aTHX_ WARN_DEPRECATED,
3123                             "In %s, line %d, starting in Perl v5.32, %s()"
3124                             " will be removed.  Avoid this message by"
3125                             " converting to use %s().\n",
3126                             file, line, name, alternative);
3127             }
3128             else {
3129                 Perl_warner(aTHX_ WARN_DEPRECATED,
3130                             "In %s, line %d, starting in Perl v5.32, %s() will"
3131                             " require an additional parameter.  Avoid this"
3132                             " message by converting to use %s().\n",
3133                             file, line, name, alternative);
3134             }
3135         }
3136     }
3137 }
3138 #endif
3139
3140 bool
3141 Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p, const U8 * const e)
3142 {
3143     PERL_ARGS_ASSERT__IS_UTF8_FOO;
3144
3145     return is_utf8_common(p, e, PL_XPosix_ptrs[classnum]);
3146 }
3147
3148 bool
3149 Perl__is_utf8_perl_idstart(pTHX_ const U8 *p, const U8 * const e)
3150 {
3151     PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
3152
3153     return is_utf8_common(p, e, PL_utf8_perl_idstart);
3154 }
3155
3156 bool
3157 Perl__is_utf8_perl_idcont(pTHX_ const U8 *p, const U8 * const e)
3158 {
3159     PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT;
3160
3161     return is_utf8_common(p, e, PL_utf8_perl_idcont);
3162 }
3163
3164 STATIC UV
3165 S_to_case_cp_list(pTHX_
3166                   const UV original,
3167                   const U32 ** const remaining_list,
3168                   Size_t * remaining_count,
3169                   SV *invlist, const I32 * const invmap,
3170                   const U32 * const * const aux_tables,
3171                   const U8 * const aux_table_lengths,
3172                   const char * const normal)
3173 {
3174     SSize_t index;
3175     I32 base;
3176
3177     /* Calculate the changed case of code point 'original'.  The first code
3178      * point of the changed case is returned.
3179      *
3180      * If 'remaining_count' is not NULL, *remaining_count will be set to how
3181      * many *other* code points are in the changed case.  If non-zero and
3182      * 'remaining_list' is also not NULL, *remaining_list will be set to point
3183      * to a non-modifiable array containing the second and potentially third
3184      * code points in the changed case.  (Unicode guarantees a maximum of 3.)
3185      * Note that this means that *remaining_list is undefined unless there are
3186      * multiple code points, and the caller has chosen to find out how many by
3187      * making 'remaining_count' not NULL.
3188      *
3189      * 'normal' is a string to use to name the new case in any generated
3190      * messages, as a fallback if the operation being used is not available.
3191      *
3192      * The casing to use is given by the data structures in the remaining
3193      * arguments.
3194      */
3195
3196     PERL_ARGS_ASSERT_TO_CASE_CP_LIST;
3197
3198     /* 'index' is guaranteed to be non-negative, as this is an inversion map
3199      * that covers all possible inputs.  See [perl #133365] */
3200     index = _invlist_search(invlist, original);
3201     base = invmap[index];
3202
3203     /* Most likely, the case change will contain just a single code point */
3204     if (remaining_count) {
3205         *remaining_count = 0;
3206     }
3207
3208     if (LIKELY(base == 0)) {    /* 0 => original was unchanged by casing */
3209
3210         /* At this bottom level routine is where we warn about illegal code
3211          * points */
3212         if (isUNICODE_POSSIBLY_PROBLEMATIC(original)) {
3213             if (UNLIKELY(UNICODE_IS_SURROGATE(original))) {
3214                 if (ckWARN_d(WARN_SURROGATE)) {
3215                     const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
3216                     Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
3217                         "Operation \"%s\" returns its argument for"
3218                         " UTF-16 surrogate U+%04" UVXf, desc, original);
3219                 }
3220             }
3221             else if (UNLIKELY(UNICODE_IS_SUPER(original))) {
3222                 if (UNLIKELY(original > MAX_LEGAL_CP)) {
3223                     Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, original));
3224                 }
3225                 if (ckWARN_d(WARN_NON_UNICODE)) {
3226                     const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
3227                     Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
3228                         "Operation \"%s\" returns its argument for"
3229                         " non-Unicode code point 0x%04" UVXf, desc, original);
3230                 }
3231             }
3232
3233             /* Note that non-characters are perfectly legal, so no warning
3234              * should be given. */
3235         }
3236
3237         return original;
3238     }
3239
3240     if (LIKELY(base > 0)) {  /* means original mapped to a single code point,
3241                                 different from itself */
3242         return base + original - invlist_array(invlist)[index];
3243     }
3244
3245     /* Here 'base' is negative.  That means the mapping is 1-to-many, and
3246      * requires an auxiliary table look up.  abs(base) gives the index into a
3247      * list of such tables which points to the proper aux table.  And a
3248      * parallel list gives the length of each corresponding aux table.  Skip
3249      * the first entry in the *remaining returns, as it is returned by the
3250      * function. */
3251     base = -base;
3252     if (remaining_count) {
3253         *remaining_count = (Size_t) (aux_table_lengths[base] - 1);
3254
3255         if (remaining_list) {
3256             *remaining_list  = aux_tables[base] + 1;
3257         }
3258     }
3259
3260     return (UV) aux_tables[base][0];
3261 }
3262
3263 STATIC UV
3264 S__to_utf8_case(pTHX_ const UV original, const U8 *p,
3265                       U8* ustrp, STRLEN *lenp,
3266                       SV *invlist, const I32 * const invmap,
3267                       const U32 * const * const aux_tables,
3268                       const U8 * const aux_table_lengths,
3269                       const char * const normal)
3270 {
3271     /* Change the case of code point 'original'.  If 'p' is non-NULL, it points to
3272      * the beginning of the (assumed to be valid) UTF-8 representation of
3273      * 'original'.  'normal' is a string to use to name the new case in any
3274      * generated messages, as a fallback if the operation being used is not
3275      * available.  The new case is given by the data structures in the
3276      * remaining arguments.
3277      *
3278      * On return 'ustrp' points to '*lenp' UTF-8 encoded bytes representing the
3279      * entire changed case string, and the return value is the first code point
3280      * in that string
3281      *
3282      * Note that the <ustrp> needs to be at least UTF8_MAXBYTES_CASE+1 bytes
3283      * since the changed version may be longer than the original character. */
3284
3285     const U32 * remaining_list;
3286     Size_t remaining_count;
3287     UV first = to_case_cp_list(original,
3288                                &remaining_list, &remaining_count,
3289                                invlist, invmap,
3290                                aux_tables, aux_table_lengths,
3291                                normal);
3292
3293     PERL_ARGS_ASSERT__TO_UTF8_CASE;
3294
3295     /* If the code point maps to itself and we already have its representation,
3296      * copy it instead of recalculating */
3297     if (original == first && p) {
3298         *lenp = UTF8SKIP(p);
3299
3300         if (p != ustrp) {   /* Don't copy onto itself */
3301             Copy(p, ustrp, *lenp, U8);
3302         }
3303     }
3304     else {
3305         U8 * d = ustrp;
3306         Size_t i;
3307
3308         d = uvchr_to_utf8(d, first);
3309
3310         for (i = 0; i < remaining_count; i++) {
3311             d = uvchr_to_utf8(d, remaining_list[i]);
3312         }
3313
3314         *d = '\0';
3315         *lenp = d - ustrp;
3316     }
3317
3318     return first;
3319 }
3320
3321 Size_t
3322 Perl__inverse_folds(pTHX_ const UV cp, U32 * first_folds_to,
3323                           const U32 ** remaining_folds_to)
3324 {
3325     /* Returns the count of the number of code points that fold to the input
3326      * 'cp' (besides itself).
3327      *
3328      * If the return is 0, there is nothing else that folds to it, and
3329      * '*first_folds_to' is set to 0, and '*remaining_folds_to' is set to NULL.
3330      *
3331      * If the return is 1, '*first_folds_to' is set to the single code point,
3332      * and '*remaining_folds_to' is set to NULL.
3333      *
3334      * Otherwise, '*first_folds_to' is set to a code point, and
3335      * '*remaining_fold_to' is set to an array that contains the others.  The
3336      * length of this array is the returned count minus 1.
3337      *
3338      * The reason for this convolution is to avoid having to deal with
3339      * allocating and freeing memory.  The lists are already constructed, so
3340      * the return can point to them, but single code points aren't, so would
3341      * need to be constructed if we didn't employ something like this API
3342      *
3343      * The code points returned by this function are all legal Unicode, which
3344      * occupy at most 21 bits, and so a U32 is sufficient, and the lists are
3345      * constructed with this size (to save space and memory), and we return
3346      * pointers, so they must be this size */
3347
3348     /* 'index' is guaranteed to be non-negative, as this is an inversion map
3349      * that covers all possible inputs.  See [perl #133365] */
3350     SSize_t index = _invlist_search(PL_utf8_foldclosures, cp);
3351     I32 base = _Perl_IVCF_invmap[index];
3352
3353     PERL_ARGS_ASSERT__INVERSE_FOLDS;
3354
3355     if (base == 0) {            /* No fold */
3356         *first_folds_to = 0;
3357         *remaining_folds_to = NULL;
3358         return 0;
3359     }
3360
3361 #ifndef HAS_IVCF_AUX_TABLES     /* This Unicode version only has 1-1 folds */
3362
3363     assert(base > 0);
3364
3365 #else
3366
3367     if (UNLIKELY(base < 0)) {   /* Folds to more than one character */
3368
3369         /* The data structure is set up so that the absolute value of 'base' is
3370          * an index into a table of pointers to arrays, with the array
3371          * corresponding to the index being the list of code points that fold
3372          * to 'cp', and the parallel array containing the length of the list
3373          * array */
3374         *first_folds_to = IVCF_AUX_TABLE_ptrs[-base][0];
3375         *remaining_folds_to = IVCF_AUX_TABLE_ptrs[-base] + 1;
3376                                                 /* +1 excludes first_folds_to */
3377         return IVCF_AUX_TABLE_lengths[-base];
3378     }
3379
3380 #endif
3381
3382     /* Only the single code point.  This works like 'fc(G) = G - A + a' */
3383     *first_folds_to = (U32) (base + cp
3384                                   - invlist_array(PL_utf8_foldclosures)[index]);
3385     *remaining_folds_to = NULL;
3386     return 1;
3387 }
3388
3389 STATIC UV
3390 S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result,
3391                                        U8* const ustrp, STRLEN *lenp)
3392 {
3393     /* This is called when changing the case of a UTF-8-encoded character above
3394      * the Latin1 range, and the operation is in a non-UTF-8 locale.  If the
3395      * result contains a character that crosses the 255/256 boundary, disallow
3396      * the change, and return the original code point.  See L<perlfunc/lc> for
3397      * why;
3398      *
3399      * p        points to the original string whose case was changed; assumed
3400      *          by this routine to be well-formed
3401      * result   the code point of the first character in the changed-case string
3402      * ustrp    points to the changed-case string (<result> represents its
3403      *          first char)
3404      * lenp     points to the length of <ustrp> */
3405
3406     UV original;    /* To store the first code point of <p> */
3407
3408     PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
3409
3410     assert(UTF8_IS_ABOVE_LATIN1(*p));
3411
3412     /* We know immediately if the first character in the string crosses the
3413      * boundary, so can skip testing */
3414     if (result > 255) {
3415
3416         /* Look at every character in the result; if any cross the
3417         * boundary, the whole thing is disallowed */
3418         U8* s = ustrp + UTF8SKIP(ustrp);
3419         U8* e = ustrp + *lenp;
3420         while (s < e) {
3421             if (! UTF8_IS_ABOVE_LATIN1(*s)) {
3422                 goto bad_crossing;
3423             }
3424             s += UTF8SKIP(s);
3425         }
3426
3427         /* Here, no characters crossed, result is ok as-is, but we warn. */
3428         _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
3429         return result;
3430     }
3431
3432   bad_crossing:
3433
3434     /* Failed, have to return the original */
3435     original = valid_utf8_to_uvchr(p, lenp);
3436
3437     /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3438     Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3439                            "Can't do %s(\"\\x{%" UVXf "}\") on non-UTF-8"
3440                            " locale; resolved to \"\\x{%" UVXf "}\".",
3441                            OP_DESC(PL_op),
3442                            original,
3443                            original);
3444     Copy(p, ustrp, *lenp, char);
3445     return original;
3446 }
3447
3448 STATIC UV
3449 S_turkic_fc(pTHX_ const U8 * const p, const U8 * const e,
3450                         U8 * ustrp, STRLEN *lenp)
3451 {
3452     /* Returns 0 if the foldcase of the input UTF-8 encoded sequence from
3453      * p0..e-1 according to Turkic rules is the same as for non-Turkic.
3454      * Otherwise, it returns the first code point of the Turkic foldcased
3455      * sequence, and the entire sequence will be stored in *ustrp.  ustrp will
3456      * contain *lenp bytes
3457      *
3458      * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER
3459      * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER
3460      * DOTLESS I */
3461
3462     PERL_ARGS_ASSERT_TURKIC_FC;
3463     assert(e > p);
3464
3465     if (UNLIKELY(*p == 'I')) {
3466         *lenp = 2;
3467         ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_SMALL_LETTER_DOTLESS_I);
3468         ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_SMALL_LETTER_DOTLESS_I);
3469         return LATIN_SMALL_LETTER_DOTLESS_I;
3470     }
3471
3472     if (UNLIKELY(memBEGINs(p, e - p,
3473                            LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)))
3474     {
3475         *lenp = 1;
3476         *ustrp = 'i';
3477         return 'i';
3478     }
3479
3480     return 0;
3481 }
3482
3483 STATIC UV
3484 S_turkic_lc(pTHX_ const U8 * const p0, const U8 * const e,
3485                         U8 * ustrp, STRLEN *lenp)
3486 {
3487     /* Returns 0 if the lowercase of the input UTF-8 encoded sequence from
3488      * p0..e-1 according to Turkic rules is the same as for non-Turkic.
3489      * Otherwise, it returns the first code point of the Turkic lowercased
3490      * sequence, and the entire sequence will be stored in *ustrp.  ustrp will
3491      * contain *lenp bytes */
3492
3493     PERL_ARGS_ASSERT_TURKIC_LC;
3494     assert(e > p0);
3495
3496     /* A 'I' requires context as to what to do */
3497     if (UNLIKELY(*p0 == 'I')) {
3498         const U8 * p = p0 + 1;
3499
3500         /* According to the Unicode SpecialCasing.txt file, a capital 'I'
3501          * modified by a dot above lowercases to 'i' even in turkic locales. */
3502         while (p < e) {
3503             UV cp;
3504
3505             if (memBEGINs(p, e - p, COMBINING_DOT_ABOVE_UTF8)) {
3506                 ustrp[0] = 'i';
3507                 *lenp = 1;
3508                 return 'i';
3509             }
3510
3511             /* For the dot above to modify the 'I', it must be part of a
3512              * combining sequence immediately following the 'I', and no other
3513              * modifier with a ccc of 230 may intervene */
3514             cp = utf8_to_uvchr_buf(p, e, NULL);
3515             if (! _invlist_contains_cp(PL_CCC_non0_non230, cp)) {
3516                 break;
3517             }
3518
3519             /* Here the combining sequence continues */
3520             p += UTF8SKIP(p);
3521         }
3522     }
3523
3524     /* In all other cases the lc is the same as the fold */
3525     return turkic_fc(p0, e, ustrp, lenp);
3526 }
3527
3528 STATIC UV
3529 S_turkic_uc(pTHX_ const U8 * const p, const U8 * const e,
3530                         U8 * ustrp, STRLEN *lenp)
3531 {
3532     /* Returns 0 if the upper or title-case of the input UTF-8 encoded sequence
3533      * from p0..e-1 according to Turkic rules is the same as for non-Turkic.
3534      * Otherwise, it returns the first code point of the Turkic upper or
3535      * title-cased sequence, and the entire sequence will be stored in *ustrp.
3536      * ustrp will contain *lenp bytes
3537      *
3538      * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER
3539      * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER
3540      * DOTLESS I */
3541
3542     PERL_ARGS_ASSERT_TURKIC_UC;
3543     assert(e > p);
3544
3545     if (*p == 'i') {
3546         *lenp = 2;
3547         ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
3548         ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
3549         return LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
3550     }
3551
3552     if (memBEGINs(p, e - p, LATIN_SMALL_LETTER_DOTLESS_I_UTF8)) {
3553         *lenp = 1;
3554         *ustrp = 'I';
3555         return 'I';
3556     }
3557
3558     return 0;
3559 }
3560
3561 /* The process for changing the case is essentially the same for the four case
3562  * change types, except there are complications for folding.  Otherwise the
3563  * difference is only which case to change to.  To make sure that they all do
3564  * the same thing, the bodies of the functions are extracted out into the
3565  * following two macros.  The functions are written with the same variable
3566  * names, and these are known and used inside these macros.  It would be
3567  * better, of course, to have inline functions to do it, but since different
3568  * macros are called, depending on which case is being changed to, this is not
3569  * feasible in C (to khw's knowledge).  Two macros are created so that the fold
3570  * function can start with the common start macro, then finish with its special
3571  * handling; while the other three cases can just use the common end macro.
3572  *
3573  * The algorithm is to use the proper (passed in) macro or function to change
3574  * the case for code points that are below 256.  The macro is used if using
3575  * locale rules for the case change; the function if not.  If the code point is
3576  * above 255, it is computed from the input UTF-8, and another macro is called
3577  * to do the conversion.  If necessary, the output is converted to UTF-8.  If
3578  * using a locale, we have to check that the change did not cross the 255/256
3579  * boundary, see check_locale_boundary_crossing() for further details.
3580  *
3581  * The macros are split with the correct case change for the below-256 case
3582  * stored into 'result', and in the middle of an else clause for the above-255
3583  * case.  At that point in the 'else', 'result' is not the final result, but is
3584  * the input code point calculated from the UTF-8.  The fold code needs to
3585  * realize all this and take it from there.
3586  *
3587  * To deal with Turkic locales, the function specified by the parameter
3588  * 'turkic' is called when appropriate.
3589  *
3590  * If you read the two macros as sequential, it's easier to understand what's
3591  * going on. */
3592 #define CASE_CHANGE_BODY_START(locale_flags, libc_change_function, L1_func,  \
3593                                L1_func_extra_param, turkic)                  \
3594                                                                              \
3595     if (flags & (locale_flags)) {                                            \
3596         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                  \
3597         if (IN_UTF8_CTYPE_LOCALE) {                                          \
3598             if (UNLIKELY(PL_in_utf8_turkic_locale)) {                        \
3599                 UV ret = turkic(p, e, ustrp, lenp);                          \
3600                 if (ret) return ret;                                         \
3601             }                                                                \
3602                                                                              \
3603             /* Otherwise, treat a UTF-8 locale as not being in locale at     \
3604              * all */                                                        \
3605             flags &= ~(locale_flags);                                        \
3606         }                                                                    \
3607     }                                                                        \
3608                                                                              \
3609     if (UTF8_IS_INVARIANT(*p)) {                                             \
3610         if (flags & (locale_flags)) {                                        \
3611             result = libc_change_function(*p);                               \
3612         }                                                                    \
3613         else {                                                               \
3614             return L1_func(*p, ustrp, lenp, L1_func_extra_param);            \
3615         }                                                                    \
3616     }                                                                        \
3617     else if UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e) {                          \
3618         U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));                         \
3619         if (flags & (locale_flags)) {                                        \
3620             result = libc_change_function(c);                                \
3621         }                                                                    \
3622         else {                                                               \
3623             return L1_func(c, ustrp, lenp,  L1_func_extra_param);            \
3624         }                                                                    \
3625     }                                                                        \
3626     else {  /* malformed UTF-8 or ord above 255 */                           \
3627         STRLEN len_result;                                                   \
3628         result = utf8n_to_uvchr(p, e - p, &len_result, UTF8_CHECK_ONLY);     \
3629         if (len_result == (STRLEN) -1) {                                     \
3630             _force_out_malformed_utf8_message(p, e, 0, 1 /* Die */ );        \
3631         }
3632
3633 #define CASE_CHANGE_BODY_END(locale_flags, change_macro)                     \
3634         result = change_macro(result, p, ustrp, lenp);                       \
3635                                                                              \
3636         if (flags & (locale_flags)) {                                        \
3637             result = check_locale_boundary_crossing(p, result, ustrp, lenp); \
3638         }                                                                    \
3639         return result;                                                       \
3640     }                                                                        \
3641                                                                              \
3642     /* Here, used locale rules.  Convert back to UTF-8 */                    \
3643     if (UTF8_IS_INVARIANT(result)) {                                         \
3644         *ustrp = (U8) result;                                                \
3645         *lenp = 1;                                                           \
3646     }                                                                        \
3647     else {                                                                   \
3648         *ustrp = UTF8_EIGHT_BIT_HI((U8) result);                             \
3649         *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);                       \
3650         *lenp = 2;                                                           \
3651     }                                                                        \
3652                                                                              \
3653     return result;
3654
3655 /* Not currently externally documented, and subject to change:
3656  * <flags> is set iff the rules from the current underlying locale are to
3657  *         be used. */
3658
3659 UV
3660 Perl__to_utf8_upper_flags(pTHX_ const U8 *p,
3661                                 const U8 *e,
3662                                 U8* ustrp,
3663                                 STRLEN *lenp,
3664                                 bool flags)
3665 {
3666     UV result;
3667
3668     PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
3669
3670     /* ~0 makes anything non-zero in 'flags' mean we are using locale rules */
3671     /* 2nd char of uc(U+DF) is 'S' */
3672     CASE_CHANGE_BODY_START(~0, toupper, _to_upper_title_latin1, 'S',
3673                                                                     turkic_uc);
3674     CASE_CHANGE_BODY_END  (~0, CALL_UPPER_CASE);
3675 }
3676
3677 /* Not currently externally documented, and subject to change:
3678  * <flags> is set iff the rules from the current underlying locale are to be
3679  *         used.  Since titlecase is not defined in POSIX, for other than a
3680  *         UTF-8 locale, uppercase is used instead for code points < 256.
3681  */
3682
3683 UV
3684 Perl__to_utf8_title_flags(pTHX_ const U8 *p,
3685                                 const U8 *e,
3686                                 U8* ustrp,
3687                                 STRLEN *lenp,
3688                                 bool flags)
3689 {
3690     UV result;
3691
3692     PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
3693
3694     /* 2nd char of ucfirst(U+DF) is 's' */
3695     CASE_CHANGE_BODY_START(~0, toupper, _to_upper_title_latin1, 's',
3696                                                                     turkic_uc);
3697     CASE_CHANGE_BODY_END  (~0, CALL_TITLE_CASE);
3698 }
3699
3700 /* Not currently externally documented, and subject to change:
3701  * <flags> is set iff the rules from the current underlying locale are to
3702  *         be used.
3703  */
3704
3705 UV
3706 Perl__to_utf8_lower_flags(pTHX_ const U8 *p,
3707                                 const U8 *e,
3708                                 U8* ustrp,
3709                                 STRLEN *lenp,
3710                                 bool flags)
3711 {
3712     UV result;
3713
3714     PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
3715
3716     CASE_CHANGE_BODY_START(~0, tolower, to_lower_latin1, 0 /* 0 is dummy */,
3717                                                                     turkic_lc);
3718     CASE_CHANGE_BODY_END  (~0, CALL_LOWER_CASE)
3719 }
3720
3721 /* Not currently externally documented, and subject to change,
3722  * in <flags>
3723  *      bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
3724  *                            locale are to be used.
3725  *      bit FOLD_FLAGS_FULL   is set iff full case folds are to be used;
3726  *                            otherwise simple folds
3727  *      bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
3728  *                            prohibited
3729  */
3730
3731 UV
3732 Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
3733                                const U8 *e,
3734                                U8* ustrp,
3735                                STRLEN *lenp,
3736                                U8 flags)
3737 {
3738     UV result;
3739
3740     PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
3741
3742     /* These are mutually exclusive */
3743     assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
3744
3745     assert(p != ustrp); /* Otherwise overwrites */
3746
3747     CASE_CHANGE_BODY_START(FOLD_FLAGS_LOCALE, tolower, _to_fold_latin1,
3748                  ((flags) & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)),
3749                                                                     turkic_fc);
3750
3751         result = CALL_FOLD_CASE(result, p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
3752
3753         if (flags & FOLD_FLAGS_LOCALE) {
3754
3755 #           define LONG_S_T      LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
3756 #         ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
3757 #           define CAP_SHARP_S   LATIN_CAPITAL_LETTER_SHARP_S_UTF8
3758
3759             /* Special case these two characters, as what normally gets
3760              * returned under locale doesn't work */
3761             if (memBEGINs((char *) p, e - p, CAP_SHARP_S))
3762             {
3763                 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3764                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3765                               "Can't do fc(\"\\x{1E9E}\") on non-UTF-8 locale; "
3766                               "resolved to \"\\x{17F}\\x{17F}\".");
3767                 goto return_long_s;
3768             }
3769             else
3770 #endif
3771                  if (memBEGINs((char *) p, e - p, LONG_S_T))
3772             {
3773                 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3774                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3775                               "Can't do fc(\"\\x{FB05}\") on non-UTF-8 locale; "
3776                               "resolved to \"\\x{FB06}\".");
3777                 goto return_ligature_st;
3778             }
3779
3780 #if    UNICODE_MAJOR_VERSION   == 3         \
3781     && UNICODE_DOT_VERSION     == 0         \
3782     && UNICODE_DOT_DOT_VERSION == 1
3783 #           define DOTTED_I   LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8
3784
3785             /* And special case this on this Unicode version only, for the same
3786              * reaons the other two are special cased.  They would cross the
3787              * 255/256 boundary which is forbidden under /l, and so the code
3788              * wouldn't catch that they are equivalent (which they are only in
3789              * this release) */
3790             else if (memBEGINs((char *) p, e - p, DOTTED_I)) {
3791                 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3792                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3793                               "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
3794                               "resolved to \"\\x{0131}\".");
3795                 goto return_dotless_i;
3796             }
3797 #endif
3798
3799             return check_locale_boundary_crossing(p, result, ustrp, lenp);
3800         }
3801         else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
3802             return result;
3803         }
3804         else {
3805             /* This is called when changing the case of a UTF-8-encoded
3806              * character above the ASCII range, and the result should not
3807              * contain an ASCII character. */
3808
3809             UV original;    /* To store the first code point of <p> */
3810
3811             /* Look at every character in the result; if any cross the
3812             * boundary, the whole thing is disallowed */
3813             U8* s = ustrp;
3814             U8* send = ustrp + *lenp;
3815             while (s < send) {
3816                 if (isASCII(*s)) {
3817                     /* Crossed, have to return the original */
3818                     original = valid_utf8_to_uvchr(p, lenp);
3819
3820                     /* But in these instances, there is an alternative we can
3821                      * return that is valid */
3822                     if (original == LATIN_SMALL_LETTER_SHARP_S
3823 #ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
3824                         || original == LATIN_CAPITAL_LETTER_SHARP_S
3825 #endif
3826                     ) {
3827                         goto return_long_s;
3828                     }
3829                     else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
3830                         goto return_ligature_st;
3831                     }
3832 #if    UNICODE_MAJOR_VERSION   == 3         \
3833     && UNICODE_DOT_VERSION     == 0         \
3834     && UNICODE_DOT_DOT_VERSION == 1
3835
3836                     else if (original == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
3837                         goto return_dotless_i;
3838                     }
3839 #endif
3840                     Copy(p, ustrp, *lenp, char);
3841                     return original;
3842                 }
3843                 s += UTF8SKIP(s);
3844             }
3845
3846             /* Here, no characters crossed, result is ok as-is */
3847             return result;
3848         }
3849     }
3850
3851     /* Here, used locale rules.  Convert back to UTF-8 */
3852     if (UTF8_IS_INVARIANT(result)) {
3853         *ustrp = (U8) result;
3854         *lenp = 1;
3855     }
3856     else {
3857         *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
3858         *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
3859         *lenp = 2;
3860     }
3861
3862     return result;
3863
3864   return_long_s:
3865     /* Certain folds to 'ss' are prohibited by the options, but they do allow
3866      * folds to a string of two of these characters.  By returning this
3867      * instead, then, e.g.,
3868      *      fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
3869      * works. */
3870
3871     *lenp = 2 * STRLENs(LATIN_SMALL_LETTER_LONG_S_UTF8);
3872     Copy(LATIN_SMALL_LETTER_LONG_S_UTF8   LATIN_SMALL_LETTER_LONG_S_UTF8,
3873         ustrp, *lenp, U8);
3874     return LATIN_SMALL_LETTER_LONG_S;
3875
3876   return_ligature_st:
3877     /* Two folds to 'st' are prohibited by the options; instead we pick one and
3878      * have the other one fold to it */
3879
3880     *lenp = STRLENs(LATIN_SMALL_LIGATURE_ST_UTF8);
3881     Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
3882     return LATIN_SMALL_LIGATURE_ST;
3883
3884 #if    UNICODE_MAJOR_VERSION   == 3         \
3885     && UNICODE_DOT_VERSION     == 0         \
3886     && UNICODE_DOT_DOT_VERSION == 1
3887
3888   return_dotless_i:
3889     *lenp = STRLENs(LATIN_SMALL_LETTER_DOTLESS_I_UTF8);
3890     Copy(LATIN_SMALL_LETTER_DOTLESS_I_UTF8, ustrp, *lenp, U8);
3891     return LATIN_SMALL_LETTER_DOTLESS_I;
3892
3893 #endif
3894
3895 }
3896
3897 bool
3898 Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
3899 {
3900     /* May change: warns if surrogates, non-character code points, or
3901      * non-Unicode code points are in 's' which has length 'len' bytes.
3902      * Returns TRUE if none found; FALSE otherwise.  The only other validity
3903      * check is to make sure that this won't exceed the string's length nor
3904      * overflow */
3905
3906     const U8* const e = s + len;
3907     bool ok = TRUE;
3908
3909     PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
3910
3911     while (s < e) {
3912         if (UTF8SKIP(s) > len) {
3913             Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
3914                            "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
3915             return FALSE;
3916         }
3917         if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) {
3918             if (UNLIKELY(UTF8_IS_SUPER(s, e))) {
3919                 if (   ckWARN_d(WARN_NON_UNICODE)
3920                     || UNLIKELY(0 < does_utf8_overflow(s, s + len,
3921                                                0 /* Don't consider overlongs */
3922                                                )))
3923                 {
3924                     /* A side effect of this function will be to warn */
3925                     (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_SUPER);
3926                     ok = FALSE;
3927                 }
3928             }
3929             else if (UNLIKELY(UTF8_IS_SURROGATE(s, e))) {
3930                 if (ckWARN_d(WARN_SURROGATE)) {
3931                     /* This has a different warning than the one the called
3932                      * function would output, so can't just call it, unlike we
3933                      * do for the non-chars and above-unicodes */
3934                     UV uv = utf8_to_uvchr_buf(s, e, NULL);
3935                     Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
3936                         "Unicode surrogate U+%04" UVXf " is illegal in UTF-8",
3937                                              uv);
3938                     ok = FALSE;
3939                 }
3940             }
3941             else if (   UNLIKELY(UTF8_IS_NONCHAR(s, e))
3942                      && (ckWARN_d(WARN_NONCHAR)))
3943             {
3944                 /* A side effect of this function will be to warn */
3945                 (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_NONCHAR);
3946                 ok = FALSE;
3947             }
3948         }
3949         s += UTF8SKIP(s);
3950     }
3951
3952     return ok;
3953 }
3954
3955 /*
3956 =for apidoc pv_uni_display
3957
3958 Build to the scalar C<dsv> a displayable version of the UTF-8 encoded string
3959 C<spv>, length C<len>, the displayable version being at most C<pvlim> bytes
3960 long (if longer, the rest is truncated and C<"..."> will be appended).
3961
3962 The C<flags> argument can have C<UNI_DISPLAY_ISPRINT> set to display
3963 C<isPRINT()>able characters as themselves, C<UNI_DISPLAY_BACKSLASH>
3964 to display the C<\\[nrfta\\]> as the backslashed versions (like C<"\n">)
3965 (C<UNI_DISPLAY_BACKSLASH> is preferred over C<UNI_DISPLAY_ISPRINT> for C<"\\">).
3966 C<UNI_DISPLAY_QQ> (and its alias C<UNI_DISPLAY_REGEX>) have both
3967 C<UNI_DISPLAY_BACKSLASH> and C<UNI_DISPLAY_ISPRINT> turned on.
3968
3969 Additionally, there is now C<UNI_DISPLAY_BACKSPACE> which allows C<\b> for a
3970 backspace, but only when C<UNI_DISPLAY_BACKSLASH> also is set.
3971
3972 The pointer to the PV of the C<dsv> is returned.
3973
3974 See also L</sv_uni_display>.
3975
3976 =for apidoc Amnh||UNI_DISPLAY_BACKSLASH
3977 =for apidoc Amnh||UNI_DISPLAY_BACKSPACE
3978 =for apidoc Amnh||UNI_DISPLAY_ISPRINT
3979 =for apidoc Amnh||UNI_DISPLAY_QQ
3980 =for apidoc Amnh||UNI_DISPLAY_REGEX
3981 =cut
3982 */
3983 char *
3984 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim,
3985                           UV flags)
3986 {
3987     int truncated = 0;
3988     const char *s, *e;
3989
3990     PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
3991
3992     SvPVCLEAR(dsv);
3993     SvUTF8_off(dsv);
3994     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
3995          UV u;
3996          bool ok = 0;
3997
3998          if (pvlim && SvCUR(dsv) >= pvlim) {
3999               truncated++;
4000               break;
4001          }
4002          u = utf8_to_uvchr_buf((U8*)s, (U8*)e, 0);
4003          if (u < 256) {
4004              const U8 c = (U8) u;
4005              if (flags & UNI_DISPLAY_BACKSLASH) {
4006                  if (    isMNEMONIC_CNTRL(c)
4007                      && (   c != '\b'
4008                          || (flags & UNI_DISPLAY_BACKSPACE)))
4009                  {
4010                     const char * mnemonic = cntrl_to_mnemonic(c);
4011                     sv_catpvn(dsv, mnemonic, strlen(mnemonic));
4012                     ok = 1;
4013                  }
4014                  else if (c == '\\') {
4015                     sv_catpvs(dsv, "\\\\");
4016                     ok = 1;
4017                  }
4018              }
4019              /* isPRINT() is the locale-blind version. */
4020              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
4021                  const char string = c;
4022                  sv_catpvn(dsv, &string, 1);
4023                  ok = 1;
4024              }
4025          }
4026          if (!ok)
4027              Perl_sv_catpvf(aTHX_ dsv, "\\x{%" UVxf "}", u);
4028     }
4029     if (truncated)
4030          sv_catpvs(dsv, "...");
4031
4032     return SvPVX(dsv);
4033 }
4034
4035 /*
4036 =for apidoc sv_uni_display
4037
4038 Build to the scalar C<dsv> a displayable version of the scalar C<sv>,
4039 the displayable version being at most C<pvlim> bytes long
4040 (if longer, the rest is truncated and "..." will be appended).
4041
4042 The C<flags> argument is as in L</pv_uni_display>().
4043
4044 The pointer to the PV of the C<dsv> is returned.
4045
4046 =cut
4047 */
4048 char *
4049 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
4050 {
4051     const char * const ptr =
4052         isREGEXP(ssv) ? RX_WRAPPED((REGEXP*)ssv) : SvPVX_const(ssv);
4053
4054     PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
4055
4056     return Perl_pv_uni_display(aTHX_ dsv, (const U8*)ptr,
4057                                 SvCUR(ssv), pvlim, flags);
4058 }
4059
4060 /*
4061 =for apidoc foldEQ_utf8
4062
4063 Returns true if the leading portions of the strings C<s1> and C<s2> (either or
4064 both of which may be in UTF-8) are the same case-insensitively; false
4065 otherwise.  How far into the strings to compare is determined by other input
4066 parameters.
4067
4068 If C<u1> is true, the string C<s1> is assumed to be in UTF-8-encoded Unicode;
4069 otherwise it is assumed to be in native 8-bit encoding.  Correspondingly for
4070 C<u2> with respect to C<s2>.
4071
4072 If the byte length C<l1> is non-zero, it says how far into C<s1> to check for
4073 fold equality.  In other words, C<s1>+C<l1> will be used as a goal to reach.
4074 The scan will not be considered to be a match unless the goal is reached, and
4075 scanning won't continue past that goal.  Correspondingly for C<l2> with respect
4076 to C<s2>.
4077
4078 If C<pe1> is non-C<NULL> and the pointer it points to is not C<NULL>, that
4079 pointer is considered an end pointer to the position 1 byte past the maximum
4080 point in C<s1> beyond which scanning will not continue under any circumstances.
4081 (This routine assumes that UTF-8 encoded input strings are not malformed;
4082 malformed input can cause it to read past C<pe1>).  This means that if both
4083 C<l1> and C<pe1> are specified, and C<pe1> is less than C<s1>+C<l1>, the match
4084 will never be successful because it can never
4085 get as far as its goal (and in fact is asserted against).  Correspondingly for
4086 C<pe2> with respect to C<s2>.
4087
4088 At least one of C<s1> and C<s2> must have a goal (at least one of C<l1> and
4089 C<l2> must be non-zero), and if both do, both have to be
4090 reached for a successful match.   Also, if the fold of a character is multiple
4091 characters, all of them must be matched (see tr21 reference below for
4092 'folding').
4093
4094 Upon a successful match, if C<pe1> is non-C<NULL>,
4095 it will be set to point to the beginning of the I<next> character of C<s1>
4096 beyond what was matched.  Correspondingly for C<pe2> and C<s2>.
4097
4098 For case-insensitiveness, the "casefolding" of Unicode is used
4099 instead of upper/lowercasing both the characters, see
4100 L<https://www.unicode.org/reports/tr21/> (Case Mappings).
4101
4102 =for apidoc Cmnh||FOLDEQ_UTF8_NOMIX_ASCII
4103 =for apidoc Cmnh||FOLDEQ_LOCALE
4104 =for apidoc Cmnh||FOLDEQ_S1_ALREADY_FOLDED
4105 =for apidoc Cmnh||FOLDEQ_S1_FOLDS_SANE
4106 =for apidoc Cmnh||FOLDEQ_S2_ALREADY_FOLDED
4107 =for apidoc Cmnh||FOLDEQ_S2_FOLDS_SANE
4108
4109 =cut */
4110
4111 /* A flags parameter has been added which may change, and hence isn't
4112  * externally documented.  Currently it is:
4113  *  0 for as-documented above
4114  *  FOLDEQ_UTF8_NOMIX_ASCII meaning that if a non-ASCII character folds to an
4115                             ASCII one, to not match
4116  *  FOLDEQ_LOCALE           is set iff the rules from the current underlying
4117  *                          locale are to be used.
4118  *  FOLDEQ_S1_ALREADY_FOLDED  s1 has already been folded before calling this
4119  *                          routine.  This allows that step to be skipped.
4120  *                          Currently, this requires s1 to be encoded as UTF-8
4121  *                          (u1 must be true), which is asserted for.
4122  *  FOLDEQ_S1_FOLDS_SANE    With either NOMIX_ASCII or LOCALE, no folds may
4123  *                          cross certain boundaries.  Hence, the caller should
4124  *                          let this function do the folding instead of
4125  *                          pre-folding.  This code contains an assertion to
4126  *                          that effect.  However, if the caller knows what
4127  *                          it's doing, it can pass this flag to indicate that,
4128  *                          and the assertion is skipped.
4129  *  FOLDEQ_S2_ALREADY_FOLDED  Similar to FOLDEQ_S1_ALREADY_FOLDED, but applies
4130  *                          to s2, and s2 doesn't have to be UTF-8 encoded.
4131  *                          This introduces an asymmetry to save a few branches
4132  *                          in a loop.  Currently, this is not a problem, as
4133  *                          never are both inputs pre-folded.  Simply call this
4134  *                          function with the pre-folded one as the second
4135  *                          string.
4136  *  FOLDEQ_S2_FOLDS_SANE
4137  */
4138
4139 I32
4140 Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1,
4141                              const char *s2, char **pe2, UV l2, bool u2,
4142                              U32 flags)
4143 {
4144     const U8 *p1  = (const U8*)s1; /* Point to current char */
4145     const U8 *p2  = (const U8*)s2;
4146     const U8 *g1 = NULL;       /* goal for s1 */
4147     const U8 *g2 = NULL;
4148     const U8 *e1 = NULL;       /* Don't scan s1 past this */
4149     U8 *f1 = NULL;             /* Point to current folded */
4150     const U8 *e2 = NULL;
4151     U8 *f2 = NULL;
4152     STRLEN n1 = 0, n2 = 0;              /* Number of bytes in current char */
4153     U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
4154     U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
4155     U8 flags_for_folder = FOLD_FLAGS_FULL;
4156
4157     PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
4158
4159     assert( ! (             (flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE))
4160                && ((        (flags &  FOLDEQ_S1_ALREADY_FOLDED)
4161                         && !(flags &  FOLDEQ_S1_FOLDS_SANE))
4162                     || (    (flags &  FOLDEQ_S2_ALREADY_FOLDED)
4163                         && !(flags &  FOLDEQ_S2_FOLDS_SANE)))));
4164     /* The algorithm is to trial the folds without regard to the flags on
4165      * the first line of the above assert(), and then see if the result
4166      * violates them.  This means that the inputs can't be pre-folded to a
4167      * violating result, hence the assert.  This could be changed, with the
4168      * addition of extra tests here for the already-folded case, which would
4169      * slow it down.  That cost is more than any possible gain for when these
4170      * flags are specified, as the flags indicate /il or /iaa matching which
4171      * is less common than /iu, and I (khw) also believe that real-world /il
4172      * and /iaa matches are most likely to involve code points 0-255, and this
4173      * function only under rare conditions gets called for 0-255. */
4174
4175     if (flags & FOLDEQ_LOCALE) {
4176         if (IN_UTF8_CTYPE_LOCALE) {
4177             if (UNLIKELY(PL_in_utf8_turkic_locale)) {
4178                 flags_for_folder |= FOLD_FLAGS_LOCALE;
4179             }
4180             else {
4181                 flags &= ~FOLDEQ_LOCALE;
4182             }
4183         }
4184         else {
4185             flags_for_folder |= FOLD_FLAGS_LOCALE;
4186         }
4187     }
4188     if (flags & FOLDEQ_UTF8_NOMIX_ASCII) {
4189         flags_for_folder |= FOLD_FLAGS_NOMIX_ASCII;
4190     }
4191
4192     if (pe1) {
4193         e1 = *(U8**)pe1;
4194     }
4195
4196     if (l1) {
4197         g1 = (const U8*)s1 + l1;
4198     }
4199
4200     if (pe2) {
4201         e2 = *(U8**)pe2;
4202     }
4203
4204     if (l2) {
4205         g2 = (const U8*)s2 + l2;
4206     }
4207
4208     /* Must have at least one goal */
4209     assert(g1 || g2);
4210
4211     if (g1) {
4212
4213         /* Will never match if goal is out-of-bounds */
4214         assert(! e1  || e1 >= g1);
4215
4216         /* Here, there isn't an end pointer, or it is beyond the goal.  We
4217         * only go as far as the goal */
4218         e1 = g1;
4219     }
4220     else {
4221         assert(e1);    /* Must have an end for looking at s1 */
4222     }
4223
4224     /* Same for goal for s2 */
4225     if (g2) {
4226         assert(! e2  || e2 >= g2);
4227         e2 = g2;
4228     }
4229     else {
4230         assert(e2);
4231     }
4232
4233     /* If both operands are already folded, we could just do a memEQ on the
4234      * whole strings at once, but it would be better if the caller realized
4235      * this and didn't even call us */
4236
4237     /* Look through both strings, a character at a time */
4238     while (p1 < e1 && p2 < e2) {
4239
4240         /* If at the beginning of a new character in s1, get its fold to use
4241          * and the length of the fold. */
4242         if (n1 == 0) {
4243             if (flags & FOLDEQ_S1_ALREADY_FOLDED) {
4244                 f1 = (U8 *) p1;
4245                 assert(u1);
4246                 n1 = UTF8SKIP(f1);
4247             }
4248             else {
4249                 if (isASCII(*p1) && ! (flags & FOLDEQ_LOCALE)) {
4250
4251                     /* We have to forbid mixing ASCII with non-ASCII if the
4252                      * flags so indicate.  And, we can short circuit having to
4253                      * call the general functions for this common ASCII case,
4254                      * all of whose non-locale folds are also ASCII, and hence
4255                      * UTF-8 invariants, so the UTF8ness of the strings is not
4256                      * relevant. */
4257                     if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) {
4258                         return 0;
4259                     }
4260                     n1 = 1;
4261                     *foldbuf1 = toFOLD(*p1);
4262                 }
4263                 else if (u1) {
4264                     _toFOLD_utf8_flags(p1, e1, foldbuf1, &n1, flags_for_folder);
4265                 }
4266                 else {  /* Not UTF-8, get UTF-8 fold */
4267                     _to_uni_fold_flags(*p1, foldbuf1, &n1, flags_for_folder);
4268                 }
4269                 f1 = foldbuf1;
4270             }
4271         }
4272
4273         if (n2 == 0) {    /* Same for s2 */
4274             if (flags & FOLDEQ_S2_ALREADY_FOLDED) {
4275
4276                 /* Point to the already-folded character.  But for non-UTF-8
4277                  * variants, convert to UTF-8 for the algorithm below */
4278                 if (UTF8_IS_INVARIANT(*p2)) {
4279                     f2 = (U8 *) p2;
4280                     n2 = 1;
4281                 }
4282                 else if (u2) {
4283                     f2 = (U8 *) p2;
4284                     n2 = UTF8SKIP(f2);
4285                 }
4286                 else {
4287                     foldbuf2[0] = UTF8_EIGHT_BIT_HI(*p2);
4288                     foldbuf2[1] = UTF8_EIGHT_BIT_LO(*p2);
4289                     f2 = foldbuf2;
4290                     n2 = 2;
4291                 }
4292             }
4293             else {
4294                 if (isASCII(*p2) && ! (flags & FOLDEQ_LOCALE)) {
4295                     if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p1)) {
4296                         return 0;
4297                     }
4298                     n2 = 1;
4299                     *foldbuf2 = toFOLD(*p2);
4300                 }
4301                 else if (u2) {
4302                     _toFOLD_utf8_flags(p2, e2, foldbuf2, &n2, flags_for_folder);
4303                 }
4304                 else {
4305                     _to_uni_fold_flags(*p2, foldbuf2, &n2, flags_for_folder);
4306                 }
4307                 f2 = foldbuf2;
4308             }
4309         }
4310
4311         /* Here f1 and f2 point to the beginning of the strings to compare.
4312          * These strings are the folds of the next character from each input
4313          * string, stored in UTF-8. */
4314
4315         /* While there is more to look for in both folds, see if they
4316         * continue to match */
4317         while (n1 && n2) {
4318             U8 fold_length = UTF8SKIP(f1);
4319             if (fold_length != UTF8SKIP(f2)
4320                 || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE
4321                                                        function call for single
4322                                                        byte */
4323                 || memNE((char*)f1, (char*)f2, fold_length))
4324             {
4325                 return 0; /* mismatch */
4326             }
4327
4328             /* Here, they matched, advance past them */
4329             n1 -= fold_length;
4330             f1 += fold_length;
4331             n2 -= fold_length;
4332             f2 += fold_length;
4333         }
4334
4335         /* When reach the end of any fold, advance the input past it */
4336         if (n1 == 0) {
4337             p1 += u1 ? UTF8SKIP(p1) : 1;
4338         }
4339         if (n2 == 0) {
4340             p2 += u2 ? UTF8SKIP(p2) : 1;
4341         }
4342     } /* End of loop through both strings */
4343
4344     /* A match is defined by each scan that specified an explicit length
4345     * reaching its final goal, and the other not having matched a partial
4346     * character (which can happen when the fold of a character is more than one
4347     * character). */
4348     if (! ((g1 == 0 || p1 == g1) && (g2 == 0 || p2 == g2)) || n1 || n2) {
4349         return 0;
4350     }
4351
4352     /* Successful match.  Set output pointers */
4353     if (pe1) {
4354         *pe1 = (char*)p1;
4355     }
4356     if (pe2) {
4357         *pe2 = (char*)p2;
4358     }
4359     return 1;
4360 }
4361
4362 /*
4363  * ex: set ts=8 sts=4 sw=4 et:
4364  */