utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
   4  *    by Larry Wall and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  * 'What a fix!' said Sam.  'That's the one place in all the lands we've ever
  13  *  heard of that we don't want to see any closer; and that's the one place
  14  *  we're trying to get to!  And that's just where we can't get, nohow.'
  15  *
  16  *     [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
  17  *
  18  * 'Well do I understand your speech,' he answered in the same language;
  19  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  20  *  as is the custom in the West, if you wish to be answered?'
  21  *                           --Gandalf, addressing Théoden's door wardens
  22  *
  23  *     [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
  24  *
  25  * ...the travellers perceived that the floor was paved with stones of many
  26  * hues; branching runes and strange devices intertwined beneath their feet.
  27  *
  28  *     [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
  29  */
  30
  31 #include "EXTERN.h"
  32 #define PERL_IN_UTF8_C
  33 #include "perl.h"
  34 #include "invlist_inline.h"
  35
  36 static const char malformed_text[] = "Malformed UTF-8 character";
  37 static const char unees[] =
  38                         "Malformed UTF-8 character (unexpected end of string)";
  39
  40 /*
  41 These are various utility functions for manipulating UTF8-encoded
  42 strings.  For the uninitiated, this is a method of representing arbitrary
  43 Unicode characters as a variable number of bytes, in such a way that
  44 characters in the ASCII range are unmodified, and a zero byte never appears
  45 within non-zero characters.
  46 */
  47
  48 /* helper for Perl__force_out_malformed_utf8_message(). Like
  49  * SAVECOMPILEWARNINGS(), but works with PL_curcop rather than
  50  * PL_compiling */
  51
  52 static void
  53 S_restore_cop_warnings(pTHX_ void *p)
  54 {
  55     free_and_set_cop_warnings(PL_curcop, (STRLEN*) p);
  56 }
  57
  58
  59 void
  60 Perl__force_out_malformed_utf8_message(pTHX_
  61             const U8 *const p,      /* First byte in UTF-8 sequence */
  62             const U8 * const e,     /* Final byte in sequence (may include
  63                                        multiple chars */
  64             const U32 flags,        /* Flags to pass to utf8n_to_uvchr(),
  65                                        usually 0, or some DISALLOW flags */
  66             const bool die_here)    /* If TRUE, this function does not return */
  67 {
  68     /* This core-only function is to be called when a malformed UTF-8 character
  69      * is found, in order to output the detailed information about the
  70      * malformation before dieing.  The reason it exists is for the occasions
  71      * when such a malformation is fatal, but warnings might be turned off, so
  72      * that normally they would not be actually output.  This ensures that they
  73      * do get output.  Because a sequence may be malformed in more than one
  74      * way, multiple messages may be generated, so we can't make them fatal, as
  75      * that would cause the first one to die.
  76      *
  77      * Instead we pretend -W was passed to perl, then die afterwards.  The
  78      * flexibility is here to return to the caller so they can finish up and
  79      * die themselves */
  80     U32 errors;
  81
  82     PERL_ARGS_ASSERT__FORCE_OUT_MALFORMED_UTF8_MESSAGE;
  83
  84     ENTER;
  85     SAVEI8(PL_dowarn);
  86     SAVESPTR(PL_curcop);
  87
  88     PL_dowarn = G_WARN_ALL_ON|G_WARN_ON;
  89     if (PL_curcop) {
  90         /* this is like SAVECOMPILEWARNINGS() except with PL_curcop rather
  91          * than PL_compiling */
  92         SAVEDESTRUCTOR_X(S_restore_cop_warnings,
  93                 (void*)PL_curcop->cop_warnings);
  94         PL_curcop->cop_warnings = pWARN_ALL;
  95     }
  96
  97     (void) utf8n_to_uvchr_error(p, e - p, NULL, flags & ~UTF8_CHECK_ONLY, &errors);
  98
  99     LEAVE;
 100
 101     if (! errors) {
 102         Perl_croak(aTHX_ "panic: _force_out_malformed_utf8_message should"
 103                          " be called only when there are errors found");
 104     }
 105
 106     if (die_here) {
 107         Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
 108     }
 109 }
 110
 111 STATIC HV *
 112 S_new_msg_hv(pTHX_ const char * const message, /* The message text */
 113                    U32 categories,  /* Packed warning categories */
 114                    U32 flag)        /* Flag associated with this message */
 115 {
 116     /* Creates, populates, and returns an HV* that describes an error message
 117      * for the translators between UTF8 and code point */
 118
 119     SV* msg_sv = newSVpv(message, 0);
 120     SV* category_sv = newSVuv(categories);
 121     SV* flag_bit_sv = newSVuv(flag);
 122
 123     HV* msg_hv = newHV();
 124
 125     PERL_ARGS_ASSERT_NEW_MSG_HV;
 126
 127     (void) hv_stores(msg_hv, "text", msg_sv);
 128     (void) hv_stores(msg_hv, "warn_categories",  category_sv);
 129     (void) hv_stores(msg_hv, "flag_bit", flag_bit_sv);
 130
 131     return msg_hv;
 132 }
 133
 134 /*
 135 =for apidoc uvoffuni_to_utf8_flags
 136
 137 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
 138 Instead, B<Almost all code should use L<perlapi/uvchr_to_utf8> or
 139 L<perlapi/uvchr_to_utf8_flags>>.
 140
 141 This function is like them, but the input is a strict Unicode
 142 (as opposed to native) code point.  Only in very rare circumstances should code
 143 not be using the native code point.
 144
 145 For details, see the description for L<perlapi/uvchr_to_utf8_flags>.
 146
 147 =cut
 148 */
 149
 150 U8 *
 151 Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
 152 {
 153     PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
 154
 155     return uvoffuni_to_utf8_flags_msgs(d, uv, flags, NULL);
 156 }
 157
 158 /* All these formats take a single UV code point argument */
 159 const char surrogate_cp_format[] = "UTF-16 surrogate U+%04" UVXf;
 160 const char nonchar_cp_format[]   = "Unicode non-character U+%04" UVXf
 161                                    " is not recommended for open interchange";
 162 const char super_cp_format[]     = "Code point 0x%" UVXf " is not Unicode,"
 163                                    " may not be portable";
 164
 165 /*  Use shorter names internally in this file */
 166 #define SHIFT   UTF_ACCUMULATION_SHIFT
 167 #undef  MARK
 168 #define MARK    UTF_CONTINUATION_MARK
 169 #define MASK    UTF_CONTINUATION_MASK
 170
 171 /*
 172 =for apidoc uvchr_to_utf8_flags_msgs
 173
 174 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
 175
 176 Most code should use C<L</uvchr_to_utf8_flags>()> rather than call this directly.
 177
 178 This function is for code that wants any warning and/or error messages to be
 179 returned to the caller rather than be displayed.  All messages that would have
 180 been displayed if all lexical warnings are enabled will be returned.
 181
 182 It is just like C<L</uvchr_to_utf8_flags>> but it takes an extra parameter
 183 placed after all the others, C<msgs>.  If this parameter is 0, this function
 184 behaves identically to C<L</uvchr_to_utf8_flags>>.  Otherwise, C<msgs> should
 185 be a pointer to an C<HV *> variable, in which this function creates a new HV to
 186 contain any appropriate messages.  The hash has three key-value pairs, as
 187 follows:
 188
 189 =over 4
 190
 191 =item C<text>
 192
 193 The text of the message as a C<SVpv>.
 194
 195 =item C<warn_categories>
 196
 197 The warning category (or categories) packed into a C<SVuv>.
 198
 199 =item C<flag>
 200
 201 A single flag bit associated with this message, in a C<SVuv>.
 202 The bit corresponds to some bit in the C<*errors> return value,
 203 such as C<UNICODE_GOT_SURROGATE>.
 204
 205 =back
 206
 207 It's important to note that specifying this parameter as non-null will cause
 208 any warnings this function would otherwise generate to be suppressed, and
 209 instead be placed in C<*msgs>.  The caller can check the lexical warnings state
 210 (or not) when choosing what to do with the returned messages.
 211
 212 The caller, of course, is responsible for freeing any returned HV.
 213
 214 =cut
 215 */
 216
 217 /* Undocumented; we don't want people using this.  Instead they should use
 218  * uvchr_to_utf8_flags_msgs() */
 219 U8 *
 220 Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV input_uv, UV flags, HV** msgs)
 221 {
 222     U8 *p;
 223     UV shifted_uv = input_uv;
 224     STRLEN utf8_skip = OFFUNISKIP(input_uv);
 225
 226     PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS_MSGS;
 227
 228     if (msgs) {
 229         *msgs = NULL;
 230     }
 231
 232     switch (utf8_skip) {
 233       case 1:
 234         *d++ = LATIN1_TO_NATIVE(input_uv);
 235         return d;
 236
 237       default:
 238         if (   UNLIKELY(input_uv > MAX_LEGAL_CP
 239             && UNLIKELY(! (flags & UNICODE_ALLOW_ABOVE_IV_MAX))))
 240         {
 241             Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, /* Hex output */
 242                                                          NULL, 0, input_uv));
 243         }
 244
 245         if ((flags & (UNICODE_WARN_PERL_EXTENDED|UNICODE_WARN_SUPER))) {
 246             U32 category = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE);
 247             const char * format = PL_extended_cp_format;
 248             if (msgs) {
 249                 *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
 250                                    category,
 251                                    UNICODE_GOT_PERL_EXTENDED);
 252             }
 253             else {
 254                 Perl_ck_warner_d(aTHX_ category, format, input_uv);
 255             }
 256
 257             /* Don't output a 2nd msg */
 258             flags &= ~UNICODE_WARN_SUPER;
 259         }
 260
 261         if (flags & UNICODE_DISALLOW_PERL_EXTENDED) {
 262             return NULL;
 263         }
 264
 265         p = d + utf8_skip - 1;
 266         while (p >= d + 6 + ONE_IF_EBCDIC_ZERO_IF_NOT) {
 267             *p-- = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
 268             shifted_uv >>= SHIFT;
 269         }
 270
 271         /* FALLTHROUGH */
 272
 273       case 6 + ONE_IF_EBCDIC_ZERO_IF_NOT:
 274         d[5 + ONE_IF_EBCDIC_ZERO_IF_NOT]
 275                                 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
 276         shifted_uv >>= SHIFT;
 277         /* FALLTHROUGH */
 278
 279       case 5 + ONE_IF_EBCDIC_ZERO_IF_NOT:
 280         d[4 + ONE_IF_EBCDIC_ZERO_IF_NOT]
 281                                 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
 282         shifted_uv >>= SHIFT;
 283         /* FALLTHROUGH */
 284
 285       case 4 + ONE_IF_EBCDIC_ZERO_IF_NOT:
 286         if (UNLIKELY(UNICODE_IS_SUPER(input_uv))) {
 287             if (flags & UNICODE_WARN_SUPER) {
 288                 U32 category = packWARN(WARN_NON_UNICODE);
 289                 const char * format = super_cp_format;
 290
 291                 if (msgs) {
 292                     *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
 293                                        category,
 294                                        UNICODE_GOT_SUPER);
 295                 }
 296                 else {
 297                     Perl_ck_warner_d(aTHX_ category, format, input_uv);
 298                 }
 299
 300                 if (flags & UNICODE_DISALLOW_SUPER) {
 301                     return NULL;
 302                 }
 303             }
 304             if (       (flags & UNICODE_DISALLOW_SUPER)
 305                 || (   (flags & UNICODE_DISALLOW_PERL_EXTENDED)
 306                     &&  UNICODE_IS_PERL_EXTENDED(input_uv)))
 307             {
 308                 return NULL;
 309             }
 310         }
 311
 312         d[3 + ONE_IF_EBCDIC_ZERO_IF_NOT]
 313                                 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
 314         shifted_uv >>= SHIFT;
 315         /* FALLTHROUGH */
 316
 317       case 3 + ONE_IF_EBCDIC_ZERO_IF_NOT:
 318         if (isUNICODE_POSSIBLY_PROBLEMATIC(input_uv)) {
 319             if (UNLIKELY(UNICODE_IS_NONCHAR(input_uv))) {
 320                 if (flags & UNICODE_WARN_NONCHAR) {
 321                     U32 category = packWARN(WARN_NONCHAR);
 322                     const char * format = nonchar_cp_format;
 323                     if (msgs) {
 324                         *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
 325                                            category,
 326                                            UNICODE_GOT_NONCHAR);
 327                     }
 328                     else {
 329                         Perl_ck_warner_d(aTHX_ category, format, input_uv);
 330                     }
 331                 }
 332                 if (flags & UNICODE_DISALLOW_NONCHAR) {
 333                     return NULL;
 334                 }
 335             }
 336             else if (UNLIKELY(UNICODE_IS_SURROGATE(input_uv))) {
 337                 if (flags & UNICODE_WARN_SURROGATE) {
 338                     U32 category = packWARN(WARN_SURROGATE);
 339                     const char * format = surrogate_cp_format;
 340                     if (msgs) {
 341                         *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
 342                                            category,
 343                                            UNICODE_GOT_SURROGATE);
 344                     }
 345                     else {
 346                         Perl_ck_warner_d(aTHX_ category, format, input_uv);
 347                     }
 348                 }
 349                 if (flags & UNICODE_DISALLOW_SURROGATE) {
 350                     return NULL;
 351                 }
 352             }
 353         }
 354
 355         d[2 + ONE_IF_EBCDIC_ZERO_IF_NOT]
 356                                 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
 357         shifted_uv >>= SHIFT;
 358         /* FALLTHROUGH */
 359
 360 #ifdef EBCDIC
 361
 362       case 3:
 363         d[2] = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
 364         shifted_uv >>= SHIFT;
 365         /* FALLTHROUGH */
 366
 367 #endif
 368
 369         /* FALLTHROUGH */
 370       case 2:
 371         d[1] = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
 372         shifted_uv >>= SHIFT;
 373         d[0] = I8_TO_NATIVE_UTF8((shifted_uv & UTF_START_MASK(utf8_skip))
 374                                              | UTF_START_MARK(utf8_skip));
 375         break;
 376     }
 377
 378     return d + utf8_skip;
 379 }
 380
 381 /*
 382 =for apidoc uvchr_to_utf8
 383
 384 Adds the UTF-8 representation of the native code point C<uv> to the end
 385 of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
 386 C<UTF8_MAXBYTES+1>) free bytes available.  The return value is the pointer to
 387 the byte after the end of the new character.  In other words,
 388
 389     d = uvchr_to_utf8(d, uv);
 390
 391 is the recommended wide native character-aware way of saying
 392
 393     *(d++) = uv;
 394
 395 This function accepts any code point from 0..C<IV_MAX> as input.
 396 C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
 397
 398 It is possible to forbid or warn on non-Unicode code points, or those that may
 399 be problematic by using L</uvchr_to_utf8_flags>.
 400
 401 =cut
 402 */
 403
 404 /* This is also a macro */
 405 PERL_CALLCONV U8*       Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
 406
 407 U8 *
 408 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
 409 {
 410     return uvchr_to_utf8(d, uv);
 411 }
 412
 413 /*
 414 =for apidoc uvchr_to_utf8_flags
 415
 416 Adds the UTF-8 representation of the native code point C<uv> to the end
 417 of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
 418 C<UTF8_MAXBYTES+1>) free bytes available.  The return value is the pointer to
 419 the byte after the end of the new character.  In other words,
 420
 421     d = uvchr_to_utf8_flags(d, uv, flags);
 422
 423 or, in most cases,
 424
 425     d = uvchr_to_utf8_flags(d, uv, 0);
 426
 427 This is the Unicode-aware way of saying
 428
 429     *(d++) = uv;
 430
 431 If C<flags> is 0, this function accepts any code point from 0..C<IV_MAX> as
 432 input.  C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
 433
 434 Specifying C<flags> can further restrict what is allowed and not warned on, as
 435 follows:
 436
 437 If C<uv> is a Unicode surrogate code point and C<UNICODE_WARN_SURROGATE> is set,
 438 the function will raise a warning, provided UTF8 warnings are enabled.  If
 439 instead C<UNICODE_DISALLOW_SURROGATE> is set, the function will fail and return
 440 NULL.  If both flags are set, the function will both warn and return NULL.
 441
 442 Similarly, the C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags
 443 affect how the function handles a Unicode non-character.
 444
 445 And likewise, the C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags
 446 affect the handling of code points that are above the Unicode maximum of
 447 0x10FFFF.  Languages other than Perl may not be able to accept files that
 448 contain these.
 449
 450 The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of
 451 the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all
 452 three DISALLOW flags.  C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> restricts the
 453 allowed inputs to the strict UTF-8 traditionally defined by Unicode.
 454 Similarly, C<UNICODE_WARN_ILLEGAL_C9_INTERCHANGE> and
 455 C<UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE> are shortcuts to select the
 456 above-Unicode and surrogate flags, but not the non-character ones, as
 457 defined in
 458 L<Unicode Corrigendum #9|https://www.unicode.org/versions/corrigendum9.html>.
 459 See L<perlunicode/Noncharacter code points>.
 460
 461 Extremely high code points were never specified in any standard, and require an
 462 extension to UTF-8 to express, which Perl does.  It is likely that programs
 463 written in something other than Perl would not be able to read files that
 464 contain these; nor would Perl understand files written by something that uses a
 465 different extension.  For these reasons, there is a separate set of flags that
 466 can warn and/or disallow these extremely high code points, even if other
 467 above-Unicode ones are accepted.  They are the C<UNICODE_WARN_PERL_EXTENDED>
 468 and C<UNICODE_DISALLOW_PERL_EXTENDED> flags.  For more information see
 469 C<L</UTF8_GOT_PERL_EXTENDED>>.  Of course C<UNICODE_DISALLOW_SUPER> will
 470 treat all above-Unicode code points, including these, as malformations.  (Note
 471 that the Unicode standard considers anything above 0x10FFFF to be illegal, but
 472 there are standards predating it that allow up to 0x7FFF_FFFF (2**31 -1))
 473
 474 A somewhat misleadingly named synonym for C<UNICODE_WARN_PERL_EXTENDED> is
 475 retained for backward compatibility: C<UNICODE_WARN_ABOVE_31_BIT>.  Similarly,
 476 C<UNICODE_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
 477 C<UNICODE_DISALLOW_PERL_EXTENDED>.  The names are misleading because on EBCDIC
 478 platforms,these flags can apply to code points that actually do fit in 31 bits.
 479 The new names accurately describe the situation in all cases.
 480
 481 =for apidoc Amnh||UNICODE_DISALLOW_ABOVE_31_BIT
 482 =for apidoc Amnh||UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE
 483 =for apidoc Amnh||UNICODE_DISALLOW_ILLEGAL_INTERCHANGE
 484 =for apidoc Amnh||UNICODE_DISALLOW_NONCHAR
 485 =for apidoc Amnh||UNICODE_DISALLOW_PERL_EXTENDED
 486 =for apidoc Amnh||UNICODE_DISALLOW_SUPER
 487 =for apidoc Amnh||UNICODE_DISALLOW_SURROGATE
 488 =for apidoc Amnh||UNICODE_WARN_ABOVE_31_BIT
 489 =for apidoc Amnh||UNICODE_WARN_ILLEGAL_C9_INTERCHANGE
 490 =for apidoc Amnh||UNICODE_WARN_ILLEGAL_INTERCHANGE
 491 =for apidoc Amnh||UNICODE_WARN_NONCHAR
 492 =for apidoc Amnh||UNICODE_WARN_PERL_EXTENDED
 493 =for apidoc Amnh||UNICODE_WARN_SUPER
 494 =for apidoc Amnh||UNICODE_WARN_SURROGATE
 495
 496 =cut
 497 */
 498
 499 /* This is also a macro */
 500 PERL_CALLCONV U8*       Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
 501
 502 U8 *
 503 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
 504 {
 505     return uvchr_to_utf8_flags(d, uv, flags);
 506 }
 507
 508 PERL_STATIC_INLINE int
 509 S_is_utf8_overlong(const U8 * const s, const STRLEN len)
 510 {
 511     /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
 512      * 's' + 'len' - 1 is an overlong.  It returns 1 if it is an overlong; 0 if
 513      * it isn't, and -1 if there isn't enough information to tell.  This last
 514      * return value can happen if the sequence is incomplete, missing some
 515      * trailing bytes that would form a complete character.  If there are
 516      * enough bytes to make a definitive decision, this function does so.
 517      * Usually 2 bytes are sufficient.
 518      *
 519      * Overlongs can occur whenever the number of continuation bytes changes.
 520      * That means whenever the number of leading 1 bits in a start byte
 521      * increases from the next lower start byte.  That happens for start bytes
 522      * C0, E0, F0, F8, FC, FE, and FF.
 523      */
 524
 525     PERL_ARGS_ASSERT_IS_UTF8_OVERLONG;
 526
 527     /* Each platform has overlongs after the start bytes given above (expressed
 528      * in I8 for EBCDIC).  The values below were found by manually inspecting
 529      * the UTF-8 patterns.  See the tables in utf8.h and utfebcdic.h. */
 530
 531     switch (NATIVE_UTF8_TO_I8(s[0])) {
 532       default:
 533         assert(UTF8_IS_START(s[0]));
 534         return 0;
 535
 536       case 0xC0:
 537       case 0xC1:
 538         return 1;
 539
 540 #ifdef EBCDIC
 541
 542       case 0xC2:
 543       case 0xC3:
 544       case 0xC4:
 545       case 0xE0:
 546         return 1;
 547 #else
 548       case 0xE0:
 549         return (len < 2) ? -1 : s[1] < 0xA0;
 550 #endif
 551
 552       case 0xF0:
 553         return (len < 2)
 554                ? -1
 555                : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x10;
 556       case 0xF8:
 557         return (len < 2)
 558                ? -1
 559                : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x08;
 560       case 0xFC:
 561         return (len < 2)
 562                ? -1
 563                : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x04;
 564       case 0xFE:
 565         return (len < 2)
 566                ? -1
 567                : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x02;
 568       case 0xFF:
 569         return isFF_overlong(s, len);
 570     }
 571 }
 572
 573 PERL_STATIC_INLINE int
 574 S_isFF_overlong(const U8 * const s, const STRLEN len)
 575 {
 576     /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
 577      * 'e' - 1 is an overlong beginning with \xFF.  It returns 1 if it is; 0 if
 578      * it isn't, and -1 if there isn't enough information to tell.  This last
 579      * return value can happen if the sequence is incomplete, missing some
 580      * trailing bytes that would form a complete character.  If there are
 581      * enough bytes to make a definitive decision, this function does so. */
 582
 583     PERL_ARGS_ASSERT_ISFF_OVERLONG;
 584
 585 #ifdef EBCDIC
 586     /* This works on all three EBCDIC code pages traditionally supported by
 587      * perl */
 588 #  define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
 589 #else
 590 #  define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
 591 #endif
 592
 593     /* To be an FF overlong, all the available bytes must match */
 594     if (LIKELY(memNE(s, FF_OVERLONG_PREFIX,
 595                      MIN(len, STRLENs(FF_OVERLONG_PREFIX)))))
 596     {
 597         return 0;
 598     }
 599
 600     /* To be an FF overlong sequence, all the bytes in FF_OVERLONG_PREFIX must
 601      * be there; what comes after them doesn't matter.  See tables in utf8.h,
 602      * utfebcdic.h. */
 603     if (len >= STRLENs(FF_OVERLONG_PREFIX)) {
 604         return 1;
 605     }
 606
 607     /* The missing bytes could cause the result to go one way or the other, so
 608      * the result is indeterminate */
 609     return -1;
 610 }
 611
 612 /* At some point we may want to allow core to use up to UV_MAX */
 613
 614 #ifdef EBCDIC     /* Actually is I8 */
 615 #  if defined(UV_IS_QUAD) /* These assume IV_MAX is 2**63-1, UV_MAX 2**64-1 */
 616 #    define HIGHEST_REPRESENTABLE_UTF  "\xFF\xA7"
 617                               /* UV_MAX "\xFF\xAF" */
 618 #  else      /* These assume IV_MAX is 2**31-1, UV_MAX 2**32-1 */
 619 #    define HIGHEST_REPRESENTABLE_UTF  "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1"
 620                               /* UV_MAX "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3" */
 621 #  endif
 622 #else
 623 #  if defined(UV_IS_QUAD)
 624 #    define HIGHEST_REPRESENTABLE_UTF  "\xFF\x80\x87"
 625                               /* UV_MAX "\xFF\x80" */
 626 #  else
 627 #    define HIGHEST_REPRESENTABLE_UTF  "\xFD"
 628                               /* UV_MAX "\xFE\x83" */
 629 #  endif
 630 #endif
 631
 632 PERL_STATIC_INLINE int
 633 S_does_utf8_overflow(const U8 * const s,
 634                      const U8 * e,
 635                      const bool consider_overlongs)
 636 {
 637     /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
 638      * 'e' - 1 would overflow an IV on this platform; that is if it represents
 639      * a code point larger than the highest representable code point.  It
 640      * returns 1 if it does overflow; 0 if it doesn't, and -1 if there isn't
 641      * enough information to tell.  This last return value can happen if the
 642      * sequence is incomplete, missing some trailing bytes that would form a
 643      * complete character.  If there are enough bytes to make a definitive
 644      * decision, this function does so.
 645      *
 646      * If 'consider_overlongs' is TRUE, the function checks for the possibility
 647      * that the sequence is an overlong that doesn't overflow.  Otherwise, it
 648      * assumes the sequence is not an overlong.  This can give different
 649      * results only on ASCII 32-bit platforms.
 650      *
 651      * (For ASCII platforms, we could use memcmp() because we don't have to
 652      * convert each byte to I8, but it's very rare input indeed that would
 653      * approach overflow, so the loop below will likely only get executed once.)
 654      *
 655      */
 656     const STRLEN len = e - s;
 657     const U8 *x;
 658     const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF;
 659     int is_overlong = 0;
 660
 661     PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
 662
 663     for (x = s; x < e; x++, y++) {
 664
 665         /* 'y' is set up to not include the trailing bytes that are all the
 666          * maximum possible continuation byte.  So when we reach the end of 'y'
 667          * (known to be NUL terminated), it is impossible for 'x' to contain
 668          * bytes larger than those omitted bytes, and therefore 'x' can't
 669          * overflow */
 670         if (*y == '\0') {
 671             return 0;
 672         }
 673
 674         /* If this byte is less than the corresponding highest non-overflowing
 675          * UTF-8, the sequence doesn't overflow */
 676         if (NATIVE_UTF8_TO_I8(*x) < *y) {
 677             return 0;
 678         }
 679
 680         if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) {
 681             goto overflows_if_not_overlong;
 682         }
 683     }
 684
 685     /* Got to the end, and all bytes are the same.  If the input is a whole
 686      * character, it doesn't overflow.  And if it is a partial character,
 687      * there's not enough information to tell */
 688     return (len >= STRLENs(HIGHEST_REPRESENTABLE_UTF)) ? 0 : -1;
 689
 690   overflows_if_not_overlong:
 691
 692     /* Here, a well-formed sequence overflows.  If we are assuming
 693      * well-formedness, return that it overflows. */
 694     if (! consider_overlongs) {
 695         return 1;
 696     }
 697
 698     /* Here, it could be the overlong malformation, and might not actuallly
 699      * overflow if you were to calculate it out.
 700      *
 701      * See if it actually is overlong */
 702     is_overlong = is_utf8_overlong(s, len);
 703
 704     /* If it isn't overlong, is well-formed, so overflows */
 705     if (is_overlong == 0) {
 706         return 1;
 707     }
 708
 709     /* Not long enough to determine */
 710     if (is_overlong < 0) {
 711         return -1;
 712     }
 713
 714     /* Here, it appears to overflow, but it is also overlong */
 715
 716 #if 6 * UTF_CONTINUATION_BYTE_INFO_BITS <= IVSIZE * CHARBITS
 717
 718     /* On many platforms, it is impossible for an overlong to overflow.  For
 719      * these, no further work is necessary: we can return immediately that this
 720      * overlong that is an apparent overflow actually isn't
 721      *
 722      * To see why, note that a length_N sequence can represent as overlongs all
 723      * the code points representable by shorter length sequences, but no
 724      * higher.  If it could represent a higher code point without being an
 725      * overlong, we wouldn't have had to increase the sequence length!
 726      *
 727      * The highest possible start byte is FF; the next highest is FE.  The
 728      * highest code point representable as an overlong on the platform is thus
 729      * the highest code point representable by a non-overlong sequence whose
 730      * start byte is FE.  If that value doesn't overflow the platform's word
 731      * size, overlongs can't overflow.
 732      *
 733      * FE consists of 7 bytes total; the FE start byte contributes 0 bits of
 734      * information (the high 7 bits, all ones, say that the sequence is 7 bytes
 735      * long, and the bottom, zero, bit is s placeholder. That leaves the 6
 736      * continuation bytes to contribute UTF_CONTINUATION_BYTE_INFO_BITS each.
 737       If that number of bits doesn't exceed the word size, it can't overflow. */
 738
 739     return 0;
 740
 741 #else
 742
 743     /* In practice, only a 32-bit ASCII box gets here.  The FE start byte can
 744      * represent, as an overlong, the highest code point representable by an FD
 745      * start byte, which is 5*6 continuation bytes of info plus one bit from
 746      * the start byte, or 31 bits.  That doesn't overflow.  More explicitly:
 747      * \xFD\xBF\xBF\xBF\xBF\xBF evaluates to 0x7FFFFFFF = 2*31 - 1.
 748      *
 749      * That means only the FF start byte can have an overflowing overlong. */
 750     if (*s < 0xFF) {
 751         return 0;
 752     }
 753
 754     /* The sequence \xff\x80\x80\x80\x80\x80\x80\x82 is an overlong that
 755      * evaluates to 2**31, so overflows an IV.  For a UV it's
 756      *              \xff\x80\x80\x80\x80\x80\x80\x83 = 2**32 */
 757 #  define OVERFLOWS  "\xff\x80\x80\x80\x80\x80\x80\x82"
 758
 759     if (e - s < (Ptrdiff_t) STRLENs(OVERFLOWS)) {   /* Not enough info */
 760          return -1;
 761     }
 762
 763 #  define strnGE(s1,s2,l) (strncmp(s1,s2,l) >= 0)
 764
 765     return strnGE((const char *) s, OVERFLOWS, STRLENs(OVERFLOWS));
 766
 767 #endif
 768
 769 }
 770
 771 STRLEN
 772 Perl_is_utf8_char_helper_(const U8 * const s, const U8 * e, const U32 flags)
 773 {
 774     SSize_t len, full_len;
 775
 776     /* An internal helper function.
 777      *
 778      * On input:
 779      *  's' is a string, which is known to be syntactically valid UTF-8 as far
 780      *      as (e - 1); e > s must hold.
 781      *  'e' This function is allowed to look at any byte from 's'...'e-1', but
 782      *      nowhere else.  The function has to cope as best it can if that
 783      *      sequence does not form a full character.
 784      * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
 785      *      accepted by L</utf8n_to_uvchr>.  If non-zero, this function returns
 786      *      0 if it determines the input will match something disallowed.
 787      * On output:
 788      *  The return is the number of bytes required to represent the code point
 789      *  if it isn't disallowed by 'flags'; 0 otherwise.  Be aware that if the
 790      *  input is for a partial character, a successful return will be larger
 791      *  than 'e - s'.
 792      *
 793      *  If *s..*(e-1) is only for a partial character, the function will return
 794      *  non-zero if there is any sequence of well-formed UTF-8 that, when
 795      *  appended to the input sequence, could result in an allowed code point;
 796      *  otherwise it returns 0.  Non characters cannot be determined based on
 797      *  partial character input.  But many  of the other excluded types can be
 798      *  determined with just the first one or two bytes.
 799      *
 800      */
 801
 802     PERL_ARGS_ASSERT_IS_UTF8_CHAR_HELPER_;
 803
 804     assert(e > s);
 805     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
 806                           |UTF8_DISALLOW_PERL_EXTENDED)));
 807
 808     full_len = UTF8SKIP(s);
 809
 810     len = e - s;
 811     if (len > full_len) {
 812         e = s + full_len;
 813         len = full_len;
 814     }
 815
 816     switch (full_len) {
 817         bool is_super;
 818
 819       default: /* Extended */
 820         if (flags & UTF8_DISALLOW_PERL_EXTENDED) {
 821             return 0;
 822         }
 823
 824         /* FALLTHROUGH */
 825
 826       case 6 + ONE_IF_EBCDIC_ZERO_IF_NOT:   /* above Unicode */
 827       case 5 + ONE_IF_EBCDIC_ZERO_IF_NOT:   /* above Unicode */
 828
 829         if (flags & UTF8_DISALLOW_SUPER) {
 830             return 0;                       /* Above Unicode */
 831         }
 832
 833         return full_len;
 834
 835       case 4 + ONE_IF_EBCDIC_ZERO_IF_NOT:
 836         is_super = (   UNLIKELY(NATIVE_UTF8_TO_I8(s[0]) > UTF_START_BYTE_110000_)
 837                     || (   len > 1
 838                         && NATIVE_UTF8_TO_I8(s[0]) == UTF_START_BYTE_110000_
 839                         && NATIVE_UTF8_TO_I8(s[1])
 840                                                 >= UTF_FIRST_CONT_BYTE_110000_));
 841         if (is_super) {
 842             if (flags & UTF8_DISALLOW_SUPER) {
 843                 return 0;
 844             }
 845         }
 846         else if (   (flags & UTF8_DISALLOW_NONCHAR)
 847                  && len == full_len
 848                  && UNLIKELY(is_LARGER_NON_CHARS_utf8(s)))
 849         {
 850             return 0;
 851         }
 852
 853         return full_len;
 854
 855       case 3 + ONE_IF_EBCDIC_ZERO_IF_NOT:
 856
 857         if (! isUTF8_POSSIBLY_PROBLEMATIC(s[0]) || len < 2) {
 858             return full_len;
 859         }
 860
 861         if (   (flags & UTF8_DISALLOW_SURROGATE)
 862             &&  UNLIKELY(is_SURROGATE_utf8(s)))
 863         {
 864             return 0;       /* Surrogate */
 865         }
 866
 867         if (  (flags & UTF8_DISALLOW_NONCHAR)
 868             && len == full_len
 869             && UNLIKELY(is_SHORTER_NON_CHARS_utf8(s)))
 870         {
 871             return 0;
 872         }
 873
 874         return full_len;
 875
 876       /* The lower code points don't have any disallowable characters */
 877 #ifdef EBCDIC
 878       case 3:
 879         return full_len;
 880 #endif
 881
 882       case 2:
 883       case 1:
 884         return full_len;
 885     }
 886 }
 887
 888 Size_t
 889 Perl_is_utf8_FF_helper_(const U8 * const s0, const U8 * const e,
 890                         const bool require_partial)
 891 {
 892     /* This is called to determine if the UTF-8 sequence starting at s0 and
 893      * continuing for up to one full character of bytes, but looking no further
 894      * than 'e - 1', is legal.  *s0 must be 0xFF (or whatever the native
 895      * equivalent of FF in I8 on EBCDIC platforms is).  This marks it as being
 896      * for the largest code points recognized by Perl, the ones that require
 897      * the most UTF-8 bytes per character to represent (somewhat less than
 898      * twice the size of the next longest kind).  This sequence will only ever
 899      * be Perl extended UTF-8.
 900      *
 901      * The routine returns 0 if the sequence is not fully valid, syntactically
 902      * or semantically.  That means it checks that everything following the
 903      * start byte is a continuation byte, and that it doesn't overflow, nor is
 904      * an overlong representation.
 905      *
 906      * If 'require_partial' is FALSE, the routine returns non-zero only if the
 907      * input (as far as 'e-1') is a full character.  The return is the count of
 908      * the bytes in the character.
 909      *
 910      * If 'require_partial' is TRUE, the routine returns non-zero only if the
 911      * input as far as 'e-1' is a partial, not full character, with no
 912      * malformations found before position 'e'.  The return is either just
 913      * FALSE, or TRUE.  */
 914
 915     const U8 *s = s0 + 1;
 916     const U8 *send = e;
 917
 918     PERL_ARGS_ASSERT_IS_UTF8_FF_HELPER_;
 919
 920     assert(s0 < e);
 921     assert(*s0 == I8_TO_NATIVE_UTF8(0xFF));
 922
 923     send = s + MIN(UTF8_MAXBYTES - 1, e - s);
 924     while (s < send) {
 925         if (! UTF8_IS_CONTINUATION(*s)) {
 926             return 0;
 927         }
 928
 929         s++;
 930     }
 931
 932     if (0 < does_utf8_overflow(s0, e,
 933                                FALSE /* Don't consider_overlongs */
 934     )) {
 935         return 0;
 936     }
 937
 938     if (0 < isFF_overlong(s0, e - s0)) {
 939         return 0;
 940     }
 941
 942     /* Here, the character is valid as far as it got.  Check if got a partial
 943      * character */
 944     if (s - s0 < UTF8_MAXBYTES) {
 945         return (require_partial) ? 1 : 0;
 946     }
 947
 948     /* Here, got a full character */
 949     return (require_partial) ? 0 : UTF8_MAXBYTES;
 950 }
 951
 952 char *
 953 Perl__byte_dump_string(pTHX_ const U8 * const start, const STRLEN len, const bool format)
 954 {
 955     /* Returns a mortalized C string that is a displayable copy of the 'len'
 956      * bytes starting at 'start'.  'format' gives how to display each byte.
 957      * Currently, there are only two formats, so it is currently a bool:
 958      *      0   \xab
 959      *      1    ab         (that is a space between two hex digit bytes)
 960      */
 961
 962     const STRLEN output_len = 4 * len + 1;  /* 4 bytes per each input, plus a
 963                                                trailing NUL */
 964     const U8 * s = start;
 965     const U8 * const e = start + len;
 966     char * output;
 967     char * d;
 968
 969     PERL_ARGS_ASSERT__BYTE_DUMP_STRING;
 970
 971     Newx(output, output_len, char);
 972     SAVEFREEPV(output);
 973
 974     d = output;
 975     for (s = start; s < e; s++) {
 976         const unsigned high_nibble = (*s & 0xF0) >> 4;
 977         const unsigned low_nibble =  (*s & 0x0F);
 978
 979         if (format) {
 980             if (s > start) {
 981                 *d++ = ' ';
 982             }
 983         }
 984         else {
 985             *d++ = '\\';
 986             *d++ = 'x';
 987         }
 988
 989         if (high_nibble < 10) {
 990             *d++ = high_nibble + '0';
 991         }
 992         else {
 993             *d++ = high_nibble - 10 + 'a';
 994         }
 995
 996         if (low_nibble < 10) {
 997             *d++ = low_nibble + '0';
 998         }
 999         else {
1000             *d++ = low_nibble - 10 + 'a';
1001         }
1002     }
1003
1004     *d = '\0';
1005     return output;
1006 }
1007
1008 PERL_STATIC_INLINE char *
1009 S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
1010
1011                                          /* Max number of bytes to print */
1012                                          STRLEN print_len,
1013
1014                                          /* Which one is the non-continuation */
1015                                          const STRLEN non_cont_byte_pos,
1016
1017                                          /* How many bytes should there be? */
1018                                          const STRLEN expect_len)
1019 {
1020     /* Return the malformation warning text for an unexpected continuation
1021      * byte. */
1022
1023     const char * const where = (non_cont_byte_pos == 1)
1024                                ? "immediately"
1025                                : Perl_form(aTHX_ "%d bytes",
1026                                                  (int) non_cont_byte_pos);
1027     const U8 * x = s + non_cont_byte_pos;
1028     const U8 * e = s + print_len;
1029
1030     PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
1031
1032     /* We don't need to pass this parameter, but since it has already been
1033      * calculated, it's likely faster to pass it; verify under DEBUGGING */
1034     assert(expect_len == UTF8SKIP(s));
1035
1036     /* As a defensive coding measure, don't output anything past a NUL.  Such
1037      * bytes shouldn't be in the middle of a malformation, and could mark the
1038      * end of the allocated string, and what comes after is undefined */
1039     for (; x < e; x++) {
1040         if (*x == '\0') {
1041             x++;            /* Output this particular NUL */
1042             break;
1043         }
1044     }
1045
1046     return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
1047                            " %s after start byte 0x%02x; need %d bytes, got %d)",
1048                            malformed_text,
1049                            _byte_dump_string(s, x - s, 0),
1050                            *(s + non_cont_byte_pos),
1051                            where,
1052                            *s,
1053                            (int) expect_len,
1054                            (int) non_cont_byte_pos);
1055 }
1056
1057 /*
1058
1059 =for apidoc utf8n_to_uvchr
1060
1061 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
1062 Most code should use L</utf8_to_uvchr_buf>() rather than call this
1063 directly.
1064
1065 Bottom level UTF-8 decode routine.
1066 Returns the native code point value of the first character in the string C<s>,
1067 which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
1068 C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
1069 the length, in bytes, of that character.
1070
1071 The value of C<flags> determines the behavior when C<s> does not point to a
1072 well-formed UTF-8 character.  If C<flags> is 0, encountering a malformation
1073 causes zero to be returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>)
1074 is the next possible position in C<s> that could begin a non-malformed
1075 character.  Also, if UTF-8 warnings haven't been lexically disabled, a warning
1076 is raised.  Some UTF-8 input sequences may contain multiple malformations.
1077 This function tries to find every possible one in each call, so multiple
1078 warnings can be raised for the same sequence.
1079
1080 Various ALLOW flags can be set in C<flags> to allow (and not warn on)
1081 individual types of malformations, such as the sequence being overlong (that
1082 is, when there is a shorter sequence that can express the same code point;
1083 overlong sequences are expressly forbidden in the UTF-8 standard due to
1084 potential security issues).  Another malformation example is the first byte of
1085 a character not being a legal first byte.  See F<utf8.h> for the list of such
1086 flags.  Even if allowed, this function generally returns the Unicode
1087 REPLACEMENT CHARACTER when it encounters a malformation.  There are flags in
1088 F<utf8.h> to override this behavior for the overlong malformations, but don't
1089 do that except for very specialized purposes.
1090
1091 The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other
1092 flags) malformation is found.  If this flag is set, the routine assumes that
1093 the caller will raise a warning, and this function will silently just set
1094 C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
1095
1096 Note that this API requires disambiguation between successful decoding a C<NUL>
1097 character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as
1098 in both cases, 0 is returned, and, depending on the malformation, C<retlen> may
1099 be set to 1.  To disambiguate, upon a zero return, see if the first byte of
1100 C<s> is 0 as well.  If so, the input was a C<NUL>; if not, the input had an
1101 error.  Or you can use C<L</utf8n_to_uvchr_error>>.
1102
1103 Certain code points are considered problematic.  These are Unicode surrogates,
1104 Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
1105 By default these are considered regular code points, but certain situations
1106 warrant special handling for them, which can be specified using the C<flags>
1107 parameter.  If C<flags> contains C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all
1108 three classes are treated as malformations and handled as such.  The flags
1109 C<UTF8_DISALLOW_SURROGATE>, C<UTF8_DISALLOW_NONCHAR>, and
1110 C<UTF8_DISALLOW_SUPER> (meaning above the legal Unicode maximum) can be set to
1111 disallow these categories individually.  C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>
1112 restricts the allowed inputs to the strict UTF-8 traditionally defined by
1113 Unicode.  Use C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE> to use the strictness
1114 definition given by
1115 L<Unicode Corrigendum #9|https://www.unicode.org/versions/corrigendum9.html>.
1116 The difference between traditional strictness and C9 strictness is that the
1117 latter does not forbid non-character code points.  (They are still discouraged,
1118 however.)  For more discussion see L<perlunicode/Noncharacter code points>.
1119
1120 The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>,
1121 C<UTF8_WARN_ILLEGAL_C9_INTERCHANGE>, C<UTF8_WARN_SURROGATE>,
1122 C<UTF8_WARN_NONCHAR>, and C<UTF8_WARN_SUPER> will cause warning messages to be
1123 raised for their respective categories, but otherwise the code points are
1124 considered valid (not malformations).  To get a category to both be treated as
1125 a malformation and raise a warning, specify both the WARN and DISALLOW flags.
1126 (But note that warnings are not raised if lexically disabled nor if
1127 C<UTF8_CHECK_ONLY> is also specified.)
1128
1129 Extremely high code points were never specified in any standard, and require an
1130 extension to UTF-8 to express, which Perl does.  It is likely that programs
1131 written in something other than Perl would not be able to read files that
1132 contain these; nor would Perl understand files written by something that uses a
1133 different extension.  For these reasons, there is a separate set of flags that
1134 can warn and/or disallow these extremely high code points, even if other
1135 above-Unicode ones are accepted.  They are the C<UTF8_WARN_PERL_EXTENDED> and
1136 C<UTF8_DISALLOW_PERL_EXTENDED> flags.  For more information see
1137 C<L</UTF8_GOT_PERL_EXTENDED>>.  Of course C<UTF8_DISALLOW_SUPER> will treat all
1138 above-Unicode code points, including these, as malformations.
1139 (Note that the Unicode standard considers anything above 0x10FFFF to be
1140 illegal, but there are standards predating it that allow up to 0x7FFF_FFFF
1141 (2**31 -1))
1142
1143 A somewhat misleadingly named synonym for C<UTF8_WARN_PERL_EXTENDED> is
1144 retained for backward compatibility: C<UTF8_WARN_ABOVE_31_BIT>.  Similarly,
1145 C<UTF8_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
1146 C<UTF8_DISALLOW_PERL_EXTENDED>.  The names are misleading because these flags
1147 can apply to code points that actually do fit in 31 bits.  This happens on
1148 EBCDIC platforms, and sometimes when the L<overlong
1149 malformation|/C<UTF8_GOT_LONG>> is also present.  The new names accurately
1150 describe the situation in all cases.
1151
1152
1153 All other code points corresponding to Unicode characters, including private
1154 use and those yet to be assigned, are never considered malformed and never
1155 warn.
1156
1157 =for apidoc Amnh||UTF8_CHECK_ONLY
1158 =for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1159 =for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE
1160 =for apidoc Amnh||UTF8_DISALLOW_SURROGATE
1161 =for apidoc Amnh||UTF8_DISALLOW_NONCHAR
1162 =for apidoc Amnh||UTF8_DISALLOW_SUPER
1163 =for apidoc Amnh||UTF8_WARN_ILLEGAL_INTERCHANGE
1164 =for apidoc Amnh||UTF8_WARN_ILLEGAL_C9_INTERCHANGE
1165 =for apidoc Amnh||UTF8_WARN_SURROGATE
1166 =for apidoc Amnh||UTF8_WARN_NONCHAR
1167 =for apidoc Amnh||UTF8_WARN_SUPER
1168 =for apidoc Amnh||UTF8_WARN_PERL_EXTENDED
1169 =for apidoc Amnh||UTF8_DISALLOW_PERL_EXTENDED
1170
1171 =cut
1172
1173 Also implemented as a macro in utf8.h
1174 */
1175
1176 UV
1177 Perl_utf8n_to_uvchr(const U8 *s,
1178                     STRLEN curlen,
1179                     STRLEN *retlen,
1180                     const U32 flags)
1181 {
1182     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
1183
1184     return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL);
1185 }
1186
1187 /*
1188
1189 =for apidoc utf8n_to_uvchr_error
1190
1191 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
1192 Most code should use L</utf8_to_uvchr_buf>() rather than call this
1193 directly.
1194
1195 This function is for code that needs to know what the precise malformation(s)
1196 are when an error is found.  If you also need to know the generated warning
1197 messages, use L</utf8n_to_uvchr_msgs>() instead.
1198
1199 It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after
1200 all the others, C<errors>.  If this parameter is 0, this function behaves
1201 identically to C<L</utf8n_to_uvchr>>.  Otherwise, C<errors> should be a pointer
1202 to a C<U32> variable, which this function sets to indicate any errors found.
1203 Upon return, if C<*errors> is 0, there were no errors found.  Otherwise,
1204 C<*errors> is the bit-wise C<OR> of the bits described in the list below.  Some
1205 of these bits will be set if a malformation is found, even if the input
1206 C<flags> parameter indicates that the given malformation is allowed; those
1207 exceptions are noted:
1208
1209 =over 4
1210
1211 =item C<UTF8_GOT_PERL_EXTENDED>
1212
1213 The input sequence is not standard UTF-8, but a Perl extension.  This bit is
1214 set only if the input C<flags> parameter contains either the
1215 C<UTF8_DISALLOW_PERL_EXTENDED> or the C<UTF8_WARN_PERL_EXTENDED> flags.
1216
1217 Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
1218 and so some extension must be used to express them.  Perl uses a natural
1219 extension to UTF-8 to represent the ones up to 2**36-1, and invented a further
1220 extension to represent even higher ones, so that any code point that fits in a
1221 64-bit word can be represented.  Text using these extensions is not likely to
1222 be portable to non-Perl code.  We lump both of these extensions together and
1223 refer to them as Perl extended UTF-8.  There exist other extensions that people
1224 have invented, incompatible with Perl's.
1225
1226 On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
1227 extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
1228 than on ASCII.  Prior to that, code points 2**31 and higher were simply
1229 unrepresentable, and a different, incompatible method was used to represent
1230 code points between 2**30 and 2**31 - 1.
1231
1232 On both platforms, ASCII and EBCDIC, C<UTF8_GOT_PERL_EXTENDED> is set if
1233 Perl extended UTF-8 is used.
1234
1235 In earlier Perls, this bit was named C<UTF8_GOT_ABOVE_31_BIT>, which you still
1236 may use for backward compatibility.  That name is misleading, as this flag may
1237 be set when the code point actually does fit in 31 bits.  This happens on
1238 EBCDIC platforms, and sometimes when the L<overlong
1239 malformation|/C<UTF8_GOT_LONG>> is also present.  The new name accurately
1240 describes the situation in all cases.
1241
1242 =item C<UTF8_GOT_CONTINUATION>
1243
1244 The input sequence was malformed in that the first byte was a UTF-8
1245 continuation byte.
1246
1247 =item C<UTF8_GOT_EMPTY>
1248
1249 The input C<curlen> parameter was 0.
1250
1251 =item C<UTF8_GOT_LONG>
1252
1253 The input sequence was malformed in that there is some other sequence that
1254 evaluates to the same code point, but that sequence is shorter than this one.
1255
1256 Until Unicode 3.1, it was legal for programs to accept this malformation, but
1257 it was discovered that this created security issues.
1258
1259 =item C<UTF8_GOT_NONCHAR>
1260
1261 The code point represented by the input UTF-8 sequence is for a Unicode
1262 non-character code point.
1263 This bit is set only if the input C<flags> parameter contains either the
1264 C<UTF8_DISALLOW_NONCHAR> or the C<UTF8_WARN_NONCHAR> flags.
1265
1266 =item C<UTF8_GOT_NON_CONTINUATION>
1267
1268 The input sequence was malformed in that a non-continuation type byte was found
1269 in a position where only a continuation type one should be.  See also
1270 C<L</UTF8_GOT_SHORT>>.
1271
1272 =item C<UTF8_GOT_OVERFLOW>
1273
1274 The input sequence was malformed in that it is for a code point that is not
1275 representable in the number of bits available in an IV on the current platform.
1276
1277 =item C<UTF8_GOT_SHORT>
1278
1279 The input sequence was malformed in that C<curlen> is smaller than required for
1280 a complete sequence.  In other words, the input is for a partial character
1281 sequence.
1282
1283
1284 C<UTF8_GOT_SHORT> and C<UTF8_GOT_NON_CONTINUATION> both indicate a too short
1285 sequence.  The difference is that C<UTF8_GOT_NON_CONTINUATION> indicates always
1286 that there is an error, while C<UTF8_GOT_SHORT> means that an incomplete
1287 sequence was looked at.   If no other flags are present, it means that the
1288 sequence was valid as far as it went.  Depending on the application, this could
1289 mean one of three things:
1290
1291 =over
1292
1293 =item *
1294
1295 The C<curlen> length parameter passed in was too small, and the function was
1296 prevented from examining all the necessary bytes.
1297
1298 =item *
1299
1300 The buffer being looked at is based on reading data, and the data received so
1301 far stopped in the middle of a character, so that the next read will
1302 read the remainder of this character.  (It is up to the caller to deal with the
1303 split bytes somehow.)
1304
1305 =item *
1306
1307 This is a real error, and the partial sequence is all we're going to get.
1308
1309 =back
1310
1311 =item C<UTF8_GOT_SUPER>
1312
1313 The input sequence was malformed in that it is for a non-Unicode code point;
1314 that is, one above the legal Unicode maximum.
1315 This bit is set only if the input C<flags> parameter contains either the
1316 C<UTF8_DISALLOW_SUPER> or the C<UTF8_WARN_SUPER> flags.
1317
1318 =item C<UTF8_GOT_SURROGATE>
1319
1320 The input sequence was malformed in that it is for a -Unicode UTF-16 surrogate
1321 code point.
1322 This bit is set only if the input C<flags> parameter contains either the
1323 C<UTF8_DISALLOW_SURROGATE> or the C<UTF8_WARN_SURROGATE> flags.
1324
1325 =back
1326
1327 To do your own error handling, call this function with the C<UTF8_CHECK_ONLY>
1328 flag to suppress any warnings, and then examine the C<*errors> return.
1329
1330 =for apidoc Amnh||UTF8_GOT_PERL_EXTENDED
1331 =for apidoc Amnh||UTF8_GOT_CONTINUATION
1332 =for apidoc Amnh||UTF8_GOT_EMPTY
1333 =for apidoc Amnh||UTF8_GOT_LONG
1334 =for apidoc Amnh||UTF8_GOT_NONCHAR
1335 =for apidoc Amnh||UTF8_GOT_NON_CONTINUATION
1336 =for apidoc Amnh||UTF8_GOT_OVERFLOW
1337 =for apidoc Amnh||UTF8_GOT_SHORT
1338 =for apidoc Amnh||UTF8_GOT_SUPER
1339 =for apidoc Amnh||UTF8_GOT_SURROGATE
1340
1341 =cut
1342
1343 Also implemented as a macro in utf8.h
1344 */
1345
1346 UV
1347 Perl_utf8n_to_uvchr_error(const U8 *s,
1348                           STRLEN curlen,
1349                           STRLEN *retlen,
1350                           const U32 flags,
1351                           U32 * errors)
1352 {
1353     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
1354
1355     return utf8n_to_uvchr_msgs(s, curlen, retlen, flags, errors, NULL);
1356 }
1357
1358 /*
1359
1360 =for apidoc utf8n_to_uvchr_msgs
1361
1362 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
1363 Most code should use L</utf8_to_uvchr_buf>() rather than call this
1364 directly.
1365
1366 This function is for code that needs to know what the precise malformation(s)
1367 are when an error is found, and wants the corresponding warning and/or error
1368 messages to be returned to the caller rather than be displayed.  All messages
1369 that would have been displayed if all lexical warnings are enabled will be
1370 returned.
1371
1372 It is just like C<L</utf8n_to_uvchr_error>> but it takes an extra parameter
1373 placed after all the others, C<msgs>.  If this parameter is 0, this function
1374 behaves identically to C<L</utf8n_to_uvchr_error>>.  Otherwise, C<msgs> should
1375 be a pointer to an C<AV *> variable, in which this function creates a new AV to
1376 contain any appropriate messages.  The elements of the array are ordered so
1377 that the first message that would have been displayed is in the 0th element,
1378 and so on.  Each element is a hash with three key-value pairs, as follows:
1379
1380 =over 4
1381
1382 =item C<text>
1383
1384 The text of the message as a C<SVpv>.
1385
1386 =item C<warn_categories>
1387
1388 The warning category (or categories) packed into a C<SVuv>.
1389
1390 =item C<flag>
1391
1392 A single flag bit associated with this message, in a C<SVuv>.
1393 The bit corresponds to some bit in the C<*errors> return value,
1394 such as C<UTF8_GOT_LONG>.
1395
1396 =back
1397
1398 It's important to note that specifying this parameter as non-null will cause
1399 any warnings this function would otherwise generate to be suppressed, and
1400 instead be placed in C<*msgs>.  The caller can check the lexical warnings state
1401 (or not) when choosing what to do with the returned messages.
1402
1403 If the flag C<UTF8_CHECK_ONLY> is passed, no warnings are generated, and hence
1404 no AV is created.
1405
1406 The caller, of course, is responsible for freeing any returned AV.
1407
1408 =cut
1409 */
1410
1411 UV
1412 Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
1413                                STRLEN curlen,
1414                                STRLEN *retlen,
1415                                const U32 flags,
1416                                U32 * errors,
1417                                AV ** msgs)
1418 {
1419     const U8 * const s0 = s;
1420     const U8 * send = s0 + curlen;
1421     U32 possible_problems;  /* A bit is set here for each potential problem
1422                                found as we go along */
1423     UV uv;
1424     STRLEN expectlen;     /* How long should this sequence be? */
1425     STRLEN avail_len;     /* When input is too short, gives what that is */
1426     U32 discard_errors;   /* Used to save branches when 'errors' is NULL; this
1427                              gets set and discarded */
1428
1429     /* The below are used only if there is both an overlong malformation and a
1430      * too short one.  Otherwise the first two are set to 's0' and 'send', and
1431      * the third not used at all */
1432     U8 * adjusted_s0;
1433     U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
1434                                             routine; see [perl #130921] */
1435     UV uv_so_far;
1436     dTHX;
1437
1438     PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_MSGS_HELPER;
1439
1440     /* Here, is one of: a) malformed; b) a problematic code point (surrogate,
1441      * non-unicode, or nonchar); or c) on ASCII platforms, one of the Hangul
1442      * syllables that the dfa doesn't properly handle.  Quickly dispose of the
1443      * final case. */
1444
1445     /* Each of the affected Hanguls starts with \xED */
1446
1447     if (is_HANGUL_ED_utf8_safe(s0, send)) { /* Always false on EBCDIC */
1448         if (retlen) {
1449             *retlen = 3;
1450         }
1451         if (errors) {
1452             *errors = 0;
1453         }
1454         if (msgs) {
1455             *msgs = NULL;
1456         }
1457
1458         return ((0xED & UTF_START_MASK(3)) << (2 * UTF_ACCUMULATION_SHIFT))
1459              | ((s0[1] & UTF_CONTINUATION_MASK) << UTF_ACCUMULATION_SHIFT)
1460              |  (s0[2] & UTF_CONTINUATION_MASK);
1461     }
1462
1463     /* In conjunction with the exhaustive tests that can be enabled in
1464      * APItest/t/utf8_warn_base.pl, this can make sure the dfa does precisely
1465      * what it is intended to do, and that no flaws in it are masked by
1466      * dropping down and executing the code below
1467     assert(! isUTF8_CHAR(s0, send)
1468           || UTF8_IS_SURROGATE(s0, send)
1469           || UTF8_IS_SUPER(s0, send)
1470           || UTF8_IS_NONCHAR(s0,send));
1471     */
1472
1473     s = s0;
1474     possible_problems = 0;
1475     expectlen = 0;
1476     avail_len = 0;
1477     discard_errors = 0;
1478     adjusted_s0 = (U8 *) s0;
1479     uv_so_far = 0;
1480
1481     if (errors) {
1482         *errors = 0;
1483     }
1484     else {
1485         errors = &discard_errors;
1486     }
1487
1488     /* The order of malformation tests here is important.  We should consume as
1489      * few bytes as possible in order to not skip any valid character.  This is
1490      * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
1491      * https://unicode.org/reports/tr36 for more discussion as to why.  For
1492      * example, once we've done a UTF8SKIP, we can tell the expected number of
1493      * bytes, and could fail right off the bat if the input parameters indicate
1494      * that there are too few available.  But it could be that just that first
1495      * byte is garbled, and the intended character occupies fewer bytes.  If we
1496      * blindly assumed that the first byte is correct, and skipped based on
1497      * that number, we could skip over a valid input character.  So instead, we
1498      * always examine the sequence byte-by-byte.
1499      *
1500      * We also should not consume too few bytes, otherwise someone could inject
1501      * things.  For example, an input could be deliberately designed to
1502      * overflow, and if this code bailed out immediately upon discovering that,
1503      * returning to the caller C<*retlen> pointing to the very next byte (one
1504      * which is actually part of the overflowing sequence), that could look
1505      * legitimate to the caller, which could discard the initial partial
1506      * sequence and process the rest, inappropriately.
1507      *
1508      * Some possible input sequences are malformed in more than one way.  This
1509      * function goes to lengths to try to find all of them.  This is necessary
1510      * for correctness, as the inputs may allow one malformation but not
1511      * another, and if we abandon searching for others after finding the
1512      * allowed one, we could allow in something that shouldn't have been.
1513      */
1514
1515     if (UNLIKELY(curlen == 0)) {
1516         possible_problems |= UTF8_GOT_EMPTY;
1517         curlen = 0;
1518         uv = UNICODE_REPLACEMENT;
1519         goto ready_to_handle_errors;
1520     }
1521
1522     /* We now know we can examine the first byte of the input */
1523     expectlen = UTF8SKIP(s);
1524     uv = *s;
1525
1526     /* A well-formed UTF-8 character, as the vast majority of calls to this
1527      * function will be for, has this expected length.  For efficiency, set
1528      * things up here to return it.  It will be overriden only in those rare
1529      * cases where a malformation is found */
1530     if (retlen) {
1531         *retlen = expectlen;
1532     }
1533
1534     /* A continuation character can't start a valid sequence */
1535     if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
1536         possible_problems |= UTF8_GOT_CONTINUATION;
1537         curlen = 1;
1538         uv = UNICODE_REPLACEMENT;
1539         goto ready_to_handle_errors;
1540     }
1541
1542     /* Here is not a continuation byte, nor an invariant.  The only thing left
1543      * is a start byte (possibly for an overlong).  (We can't use UTF8_IS_START
1544      * because it excludes start bytes like \xC0 that always lead to
1545      * overlongs.) */
1546
1547     /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
1548      * that indicate the number of bytes in the character's whole UTF-8
1549      * sequence, leaving just the bits that are part of the value.  */
1550     uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
1551
1552     /* Setup the loop end point, making sure to not look past the end of the
1553      * input string, and flag it as too short if the size isn't big enough. */
1554     if (UNLIKELY(curlen < expectlen)) {
1555         possible_problems |= UTF8_GOT_SHORT;
1556         avail_len = curlen;
1557     }
1558     else {
1559         send = (U8*) s0 + expectlen;
1560     }
1561
1562     /* Now, loop through the remaining bytes in the character's sequence,
1563      * accumulating each into the working value as we go. */
1564     for (s = s0 + 1; s < send; s++) {
1565         if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
1566             uv = UTF8_ACCUMULATE(uv, *s);
1567             continue;
1568         }
1569
1570         /* Here, found a non-continuation before processing all expected bytes.
1571          * This byte indicates the beginning of a new character, so quit, even
1572          * if allowing this malformation. */
1573         possible_problems |= UTF8_GOT_NON_CONTINUATION;
1574         break;
1575     } /* End of loop through the character's bytes */
1576
1577     /* Save how many bytes were actually in the character */
1578     curlen = s - s0;
1579
1580     /* Note that there are two types of too-short malformation.  One is when
1581      * there is actual wrong data before the normal termination of the
1582      * sequence.  The other is that the sequence wasn't complete before the end
1583      * of the data we are allowed to look at, based on the input 'curlen'.
1584      * This means that we were passed data for a partial character, but it is
1585      * valid as far as we saw.  The other is definitely invalid.  This
1586      * distinction could be important to a caller, so the two types are kept
1587      * separate.
1588      *
1589      * A convenience macro that matches either of the too-short conditions.  */
1590 #   define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
1591
1592     if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
1593         uv_so_far = uv;
1594         uv = UNICODE_REPLACEMENT;
1595     }
1596
1597     /* Check for overflow.  The algorithm requires us to not look past the end
1598      * of the current character, even if partial, so the upper limit is 's' */
1599     if (UNLIKELY(0 < does_utf8_overflow(s0, s,
1600                                          1 /* Do consider overlongs */
1601                                         )))
1602     {
1603         possible_problems |= UTF8_GOT_OVERFLOW;
1604         uv = UNICODE_REPLACEMENT;
1605     }
1606
1607     /* Check for overlong.  If no problems so far, 'uv' is the correct code
1608      * point value.  Simply see if it is expressible in fewer bytes.  Otherwise
1609      * we must look at the UTF-8 byte sequence itself to see if it is for an
1610      * overlong */
1611     if (     (   LIKELY(! possible_problems)
1612               && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
1613         || (       UNLIKELY(possible_problems)
1614             && (   UNLIKELY(! UTF8_IS_START(*s0))
1615                 || (UNLIKELY(0 < is_utf8_overlong(s0, s - s0))))))
1616     {
1617         possible_problems |= UTF8_GOT_LONG;
1618
1619         if (   UNLIKELY(   possible_problems & UTF8_GOT_TOO_SHORT)
1620
1621                           /* The calculation in the 'true' branch of this 'if'
1622                            * below won't work if overflows, and isn't needed
1623                            * anyway.  Further below we handle all overflow
1624                            * cases */
1625             &&   LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)))
1626         {
1627             UV min_uv = uv_so_far;
1628             STRLEN i;
1629
1630             /* Here, the input is both overlong and is missing some trailing
1631              * bytes.  There is no single code point it could be for, but there
1632              * may be enough information present to determine if what we have
1633              * so far is for an unallowed code point, such as for a surrogate.
1634              * The code further below has the intelligence to determine this,
1635              * but just for non-overlong UTF-8 sequences.  What we do here is
1636              * calculate the smallest code point the input could represent if
1637              * there were no too short malformation.  Then we compute and save
1638              * the UTF-8 for that, which is what the code below looks at
1639              * instead of the raw input.  It turns out that the smallest such
1640              * code point is all we need. */
1641             for (i = curlen; i < expectlen; i++) {
1642                 min_uv = UTF8_ACCUMULATE(min_uv,
1643                                 I8_TO_NATIVE_UTF8(UTF_MIN_CONTINUATION_BYTE));
1644             }
1645
1646             adjusted_s0 = temp_char_buf;
1647             (void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
1648         }
1649     }
1650
1651     /* Here, we have found all the possible problems, except for when the input
1652      * is for a problematic code point not allowed by the input parameters. */
1653
1654                                 /* uv is valid for overlongs */
1655     if (   (   (      LIKELY(! (possible_problems & ~UTF8_GOT_LONG))
1656                    && isUNICODE_POSSIBLY_PROBLEMATIC(uv))
1657             || (   UNLIKELY(possible_problems)
1658
1659                           /* if overflow, we know without looking further
1660                            * precisely which of the problematic types it is,
1661                            * and we deal with those in the overflow handling
1662                            * code */
1663                 && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))
1664                 && (   isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)
1665                     || UNLIKELY(UTF8_IS_PERL_EXTENDED(s0)))))
1666         && ((flags & ( UTF8_DISALLOW_NONCHAR
1667                       |UTF8_DISALLOW_SURROGATE
1668                       |UTF8_DISALLOW_SUPER
1669                       |UTF8_DISALLOW_PERL_EXTENDED
1670                       |UTF8_WARN_NONCHAR
1671                       |UTF8_WARN_SURROGATE
1672                       |UTF8_WARN_SUPER
1673                       |UTF8_WARN_PERL_EXTENDED))))
1674     {
1675         /* If there were no malformations, or the only malformation is an
1676          * overlong, 'uv' is valid */
1677         if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
1678             if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
1679                 possible_problems |= UTF8_GOT_SURROGATE;
1680             }
1681             else if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
1682                 possible_problems |= UTF8_GOT_SUPER;
1683             }
1684             else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
1685                 possible_problems |= UTF8_GOT_NONCHAR;
1686             }
1687         }
1688         else {  /* Otherwise, need to look at the source UTF-8, possibly
1689                    adjusted to be non-overlong */
1690
1691             if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
1692                                                     > UTF_START_BYTE_110000_))
1693             {
1694                 possible_problems |= UTF8_GOT_SUPER;
1695             }
1696             else if (curlen > 1) {
1697                 if (UNLIKELY(   NATIVE_UTF8_TO_I8(*adjusted_s0)
1698                                                 == UTF_START_BYTE_110000_
1699                              && NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1))
1700                                                 >= UTF_FIRST_CONT_BYTE_110000_))
1701                 {
1702                     possible_problems |= UTF8_GOT_SUPER;
1703                 }
1704                 else if (UNLIKELY(is_SURROGATE_utf8(adjusted_s0))) {
1705                     possible_problems |= UTF8_GOT_SURROGATE;
1706                 }
1707             }
1708
1709             /* We need a complete well-formed UTF-8 character to discern
1710              * non-characters, so can't look for them here */
1711         }
1712     }
1713
1714   ready_to_handle_errors:
1715
1716     /* At this point:
1717      * curlen               contains the number of bytes in the sequence that
1718      *                      this call should advance the input by.
1719      * avail_len            gives the available number of bytes passed in, but
1720      *                      only if this is less than the expected number of
1721      *                      bytes, based on the code point's start byte.
1722      * possible_problems    is 0 if there weren't any problems; otherwise a bit
1723      *                      is set in it for each potential problem found.
1724      * uv                   contains the code point the input sequence
1725      *                      represents; or if there is a problem that prevents
1726      *                      a well-defined value from being computed, it is
1727      *                      some subsitute value, typically the REPLACEMENT
1728      *                      CHARACTER.
1729      * s0                   points to the first byte of the character
1730      * s                    points to just after where we left off processing
1731      *                      the character
1732      * send                 points to just after where that character should
1733      *                      end, based on how many bytes the start byte tells
1734      *                      us should be in it, but no further than s0 +
1735      *                      avail_len
1736      */
1737
1738     if (UNLIKELY(possible_problems)) {
1739         bool disallowed = FALSE;
1740         const U32 orig_problems = possible_problems;
1741
1742         if (msgs) {
1743             *msgs = NULL;
1744         }
1745
1746         while (possible_problems) { /* Handle each possible problem */
1747             U32 pack_warn = 0;
1748             char * message = NULL;
1749             U32 this_flag_bit = 0;
1750
1751             /* Each 'if' clause handles one problem.  They are ordered so that
1752              * the first ones' messages will be displayed before the later
1753              * ones; this is kinda in decreasing severity order.  But the
1754              * overlong must come last, as it changes 'uv' looked at by the
1755              * others */
1756             if (possible_problems & UTF8_GOT_OVERFLOW) {
1757
1758                 /* Overflow means also got a super and are using Perl's
1759                  * extended UTF-8, but we handle all three cases here */
1760                 possible_problems
1761                   &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_PERL_EXTENDED);
1762                 *errors |= UTF8_GOT_OVERFLOW;
1763
1764                 /* But the API says we flag all errors found */
1765                 if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) {
1766                     *errors |= UTF8_GOT_SUPER;
1767                 }
1768                 if (flags
1769                         & (UTF8_WARN_PERL_EXTENDED|UTF8_DISALLOW_PERL_EXTENDED))
1770                 {
1771                     *errors |= UTF8_GOT_PERL_EXTENDED;
1772                 }
1773
1774                 /* Disallow if any of the three categories say to */
1775                 if ( ! (flags &   UTF8_ALLOW_OVERFLOW)
1776                     || (flags & ( UTF8_DISALLOW_SUPER
1777                                  |UTF8_DISALLOW_PERL_EXTENDED)))
1778                 {
1779                     disallowed = TRUE;
1780                 }
1781
1782                 /* Likewise, warn if any say to */
1783                 if (  ! (flags & UTF8_ALLOW_OVERFLOW)
1784                     ||  (flags & (UTF8_WARN_SUPER|UTF8_WARN_PERL_EXTENDED)))
1785                 {
1786
1787                     /* The warnings code explicitly says it doesn't handle the
1788                      * case of packWARN2 and two categories which have
1789                      * parent-child relationship.  Even if it works now to
1790                      * raise the warning if either is enabled, it wouldn't
1791                      * necessarily do so in the future.  We output (only) the
1792                      * most dire warning */
1793                     if (! (flags & UTF8_CHECK_ONLY)) {
1794                         if (msgs || ckWARN_d(WARN_UTF8)) {
1795                             pack_warn = packWARN(WARN_UTF8);
1796                         }
1797                         else if (msgs || ckWARN_d(WARN_NON_UNICODE)) {
1798                             pack_warn = packWARN(WARN_NON_UNICODE);
1799                         }
1800                         if (pack_warn) {
1801                             message = Perl_form(aTHX_ "%s: %s (overflows)",
1802                                             malformed_text,
1803                                             _byte_dump_string(s0, curlen, 0));
1804                             this_flag_bit = UTF8_GOT_OVERFLOW;
1805                         }
1806                     }
1807                 }
1808             }
1809             else if (possible_problems & UTF8_GOT_EMPTY) {
1810                 possible_problems &= ~UTF8_GOT_EMPTY;
1811                 *errors |= UTF8_GOT_EMPTY;
1812
1813                 if (! (flags & UTF8_ALLOW_EMPTY)) {
1814
1815                     /* This so-called malformation is now treated as a bug in
1816                      * the caller.  If you have nothing to decode, skip calling
1817                      * this function */
1818                     assert(0);
1819
1820                     disallowed = TRUE;
1821                     if (  (msgs
1822                         || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1823                     {
1824                         pack_warn = packWARN(WARN_UTF8);
1825                         message = Perl_form(aTHX_ "%s (empty string)",
1826                                                    malformed_text);
1827                         this_flag_bit = UTF8_GOT_EMPTY;
1828                     }
1829                 }
1830             }
1831             else if (possible_problems & UTF8_GOT_CONTINUATION) {
1832                 possible_problems &= ~UTF8_GOT_CONTINUATION;
1833                 *errors |= UTF8_GOT_CONTINUATION;
1834
1835                 if (! (flags & UTF8_ALLOW_CONTINUATION)) {
1836                     disallowed = TRUE;
1837                     if ((   msgs
1838                          || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1839                     {
1840                         pack_warn = packWARN(WARN_UTF8);
1841                         message = Perl_form(aTHX_
1842                                 "%s: %s (unexpected continuation byte 0x%02x,"
1843                                 " with no preceding start byte)",
1844                                 malformed_text,
1845                                 _byte_dump_string(s0, 1, 0), *s0);
1846                         this_flag_bit = UTF8_GOT_CONTINUATION;
1847                     }
1848                 }
1849             }
1850             else if (possible_problems & UTF8_GOT_SHORT) {
1851                 possible_problems &= ~UTF8_GOT_SHORT;
1852                 *errors |= UTF8_GOT_SHORT;
1853
1854                 if (! (flags & UTF8_ALLOW_SHORT)) {
1855                     disallowed = TRUE;
1856                     if ((   msgs
1857                          || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1858                     {
1859                         pack_warn = packWARN(WARN_UTF8);
1860                         message = Perl_form(aTHX_
1861                              "%s: %s (too short; %d byte%s available, need %d)",
1862                              malformed_text,
1863                              _byte_dump_string(s0, send - s0, 0),
1864                              (int)avail_len,
1865                              avail_len == 1 ? "" : "s",
1866                              (int)expectlen);
1867                         this_flag_bit = UTF8_GOT_SHORT;
1868                     }
1869                 }
1870
1871             }
1872             else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
1873                 possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
1874                 *errors |= UTF8_GOT_NON_CONTINUATION;
1875
1876                 if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
1877                     disallowed = TRUE;
1878                     if ((   msgs
1879                          || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1880                     {
1881
1882                         /* If we don't know for sure that the input length is
1883                          * valid, avoid as much as possible reading past the
1884                          * end of the buffer */
1885                         int printlen = (flags & _UTF8_NO_CONFIDENCE_IN_CURLEN)
1886                                        ? (int) (s - s0)
1887                                        : (int) (send - s0);
1888                         pack_warn = packWARN(WARN_UTF8);
1889                         message = Perl_form(aTHX_ "%s",
1890                             unexpected_non_continuation_text(s0,
1891                                                             printlen,
1892                                                             s - s0,
1893                                                             (int) expectlen));
1894                         this_flag_bit = UTF8_GOT_NON_CONTINUATION;
1895                     }
1896                 }
1897             }
1898             else if (possible_problems & UTF8_GOT_SURROGATE) {
1899                 possible_problems &= ~UTF8_GOT_SURROGATE;
1900
1901                 if (flags & UTF8_WARN_SURROGATE) {
1902                     *errors |= UTF8_GOT_SURROGATE;
1903
1904                     if (   ! (flags & UTF8_CHECK_ONLY)
1905                         && (msgs || ckWARN_d(WARN_SURROGATE)))
1906                     {
1907                         pack_warn = packWARN(WARN_SURROGATE);
1908
1909                         /* These are the only errors that can occur with a
1910                         * surrogate when the 'uv' isn't valid */
1911                         if (orig_problems & UTF8_GOT_TOO_SHORT) {
1912                             message = Perl_form(aTHX_
1913                                     "UTF-16 surrogate (any UTF-8 sequence that"
1914                                     " starts with \"%s\" is for a surrogate)",
1915                                     _byte_dump_string(s0, curlen, 0));
1916                         }
1917                         else {
1918                             message = Perl_form(aTHX_ surrogate_cp_format, uv);
1919                         }
1920                         this_flag_bit = UTF8_GOT_SURROGATE;
1921                     }
1922                 }
1923
1924                 if (flags & UTF8_DISALLOW_SURROGATE) {
1925                     disallowed = TRUE;
1926                     *errors |= UTF8_GOT_SURROGATE;
1927                 }
1928             }
1929             else if (possible_problems & UTF8_GOT_SUPER) {
1930                 possible_problems &= ~UTF8_GOT_SUPER;
1931
1932                 if (flags & UTF8_WARN_SUPER) {
1933                     *errors |= UTF8_GOT_SUPER;
1934
1935                     if (   ! (flags & UTF8_CHECK_ONLY)
1936                         && (msgs || ckWARN_d(WARN_NON_UNICODE)))
1937                     {
1938                         pack_warn = packWARN(WARN_NON_UNICODE);
1939
1940                         if (orig_problems & UTF8_GOT_TOO_SHORT) {
1941                             message = Perl_form(aTHX_
1942                                     "Any UTF-8 sequence that starts with"
1943                                     " \"%s\" is for a non-Unicode code point,"
1944                                     " may not be portable",
1945                                     _byte_dump_string(s0, curlen, 0));
1946                         }
1947                         else {
1948                             message = Perl_form(aTHX_ super_cp_format, uv);
1949                         }
1950                         this_flag_bit = UTF8_GOT_SUPER;
1951                     }
1952                 }
1953
1954                 /* Test for Perl's extended UTF-8 after the regular SUPER ones,
1955                  * and before possibly bailing out, so that the more dire
1956                  * warning will override the regular one. */
1957                 if (UNLIKELY(UTF8_IS_PERL_EXTENDED(s0))) {
1958                     if (  ! (flags & UTF8_CHECK_ONLY)
1959                         &&  (flags & (UTF8_WARN_PERL_EXTENDED|UTF8_WARN_SUPER))
1960                         &&  (msgs || (   ckWARN_d(WARN_NON_UNICODE)
1961                                       || ckWARN(WARN_PORTABLE))))
1962                     {
1963                         pack_warn = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE);
1964
1965                         /* If it is an overlong that evaluates to a code point
1966                          * that doesn't have to use the Perl extended UTF-8, it
1967                          * still used it, and so we output a message that
1968                          * doesn't refer to the code point.  The same is true
1969                          * if there was a SHORT malformation where the code
1970                          * point is not valid.  In that case, 'uv' will have
1971                          * been set to the REPLACEMENT CHAR, and the message
1972                          * below without the code point in it will be selected
1973                          * */
1974                         if (UNICODE_IS_PERL_EXTENDED(uv)) {
1975                             message = Perl_form(aTHX_
1976                                             PL_extended_cp_format, uv);
1977                         }
1978                         else {
1979                             message = Perl_form(aTHX_
1980                                         "Any UTF-8 sequence that starts with"
1981                                         " \"%s\" is a Perl extension, and"
1982                                         " so is not portable",
1983                                         _byte_dump_string(s0, curlen, 0));
1984                         }
1985                         this_flag_bit = UTF8_GOT_PERL_EXTENDED;
1986                     }
1987
1988                     if (flags & ( UTF8_WARN_PERL_EXTENDED
1989                                  |UTF8_DISALLOW_PERL_EXTENDED))
1990                     {
1991                         *errors |= UTF8_GOT_PERL_EXTENDED;
1992
1993                         if (flags & UTF8_DISALLOW_PERL_EXTENDED) {
1994                             disallowed = TRUE;
1995                         }
1996                     }
1997                 }
1998
1999                 if (flags & UTF8_DISALLOW_SUPER) {
2000                     *errors |= UTF8_GOT_SUPER;
2001                     disallowed = TRUE;
2002                 }
2003             }
2004             else if (possible_problems & UTF8_GOT_NONCHAR) {
2005                 possible_problems &= ~UTF8_GOT_NONCHAR;
2006
2007                 if (flags & UTF8_WARN_NONCHAR) {
2008                     *errors |= UTF8_GOT_NONCHAR;
2009
2010                     if (  ! (flags & UTF8_CHECK_ONLY)
2011                         && (msgs || ckWARN_d(WARN_NONCHAR)))
2012                     {
2013                         /* The code above should have guaranteed that we don't
2014                          * get here with errors other than overlong */
2015                         assert (! (orig_problems
2016                                         & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR)));
2017
2018                         pack_warn = packWARN(WARN_NONCHAR);
2019                         message = Perl_form(aTHX_ nonchar_cp_format, uv);
2020                         this_flag_bit = UTF8_GOT_NONCHAR;
2021                     }
2022                 }
2023
2024                 if (flags & UTF8_DISALLOW_NONCHAR) {
2025                     disallowed = TRUE;
2026                     *errors |= UTF8_GOT_NONCHAR;
2027                 }
2028             }
2029             else if (possible_problems & UTF8_GOT_LONG) {
2030                 possible_problems &= ~UTF8_GOT_LONG;
2031                 *errors |= UTF8_GOT_LONG;
2032
2033                 if (flags & UTF8_ALLOW_LONG) {
2034
2035                     /* We don't allow the actual overlong value, unless the
2036                      * special extra bit is also set */
2037                     if (! (flags & (   UTF8_ALLOW_LONG_AND_ITS_VALUE
2038                                     & ~UTF8_ALLOW_LONG)))
2039                     {
2040                         uv = UNICODE_REPLACEMENT;
2041                     }
2042                 }
2043                 else {
2044                     disallowed = TRUE;
2045
2046                     if ((   msgs
2047                          || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
2048                     {
2049                         pack_warn = packWARN(WARN_UTF8);
2050
2051                         /* These error types cause 'uv' to be something that
2052                          * isn't what was intended, so can't use it in the
2053                          * message.  The other error types either can't
2054                          * generate an overlong, or else the 'uv' is valid */
2055                         if (orig_problems &
2056                                         (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
2057                         {
2058                             message = Perl_form(aTHX_
2059                                     "%s: %s (any UTF-8 sequence that starts"
2060                                     " with \"%s\" is overlong which can and"
2061                                     " should be represented with a"
2062                                     " different, shorter sequence)",
2063                                     malformed_text,
2064                                     _byte_dump_string(s0, send - s0, 0),
2065                                     _byte_dump_string(s0, curlen, 0));
2066                         }
2067                         else {
2068                             U8 tmpbuf[UTF8_MAXBYTES+1];
2069                             const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
2070                                                                         uv, 0);
2071                             /* Don't use U+ for non-Unicode code points, which
2072                              * includes those in the Latin1 range */
2073                             const char * preface = (   UNICODE_IS_SUPER(uv)
2074 #ifdef EBCDIC
2075                                                     || uv <= 0xFF
2076 #endif
2077                                                    )
2078                                                    ? "0x"
2079                                                    : "U+";
2080                             message = Perl_form(aTHX_
2081                                 "%s: %s (overlong; instead use %s to represent"
2082                                 " %s%0*" UVXf ")",
2083                                 malformed_text,
2084                                 _byte_dump_string(s0, send - s0, 0),
2085                                 _byte_dump_string(tmpbuf, e - tmpbuf, 0),
2086                                 preface,
2087                                 ((uv < 256) ? 2 : 4), /* Field width of 2 for
2088                                                          small code points */
2089                                 UNI_TO_NATIVE(uv));
2090                         }
2091                         this_flag_bit = UTF8_GOT_LONG;
2092                     }
2093                 }
2094             } /* End of looking through the possible flags */
2095
2096             /* Display the message (if any) for the problem being handled in
2097              * this iteration of the loop */
2098             if (message) {
2099                 if (msgs) {
2100                     assert(this_flag_bit);
2101
2102                     if (*msgs == NULL) {
2103                         *msgs = newAV();
2104                     }
2105
2106                     av_push(*msgs, newRV_noinc((SV*) new_msg_hv(message,
2107                                                                 pack_warn,
2108                                                                 this_flag_bit)));
2109                 }
2110                 else if (PL_op)
2111                     Perl_warner(aTHX_ pack_warn, "%s in %s", message,
2112                                                  OP_DESC(PL_op));
2113                 else
2114                     Perl_warner(aTHX_ pack_warn, "%s", message);
2115             }
2116         }   /* End of 'while (possible_problems)' */
2117
2118         /* Since there was a possible problem, the returned length may need to
2119          * be changed from the one stored at the beginning of this function.
2120          * Instead of trying to figure out if it has changed, just do it. */
2121         if (retlen) {
2122             *retlen = curlen;
2123         }
2124
2125         if (disallowed) {
2126             if (flags & UTF8_CHECK_ONLY && retlen) {
2127                 *retlen = ((STRLEN) -1);
2128             }
2129             return 0;
2130         }
2131     }
2132
2133     return UNI_TO_NATIVE(uv);
2134 }
2135
2136 /*
2137 =for apidoc utf8_to_uvchr_buf
2138
2139 Returns the native code point of the first character in the string C<s> which
2140 is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
2141 C<*retlen> will be set to the length, in bytes, of that character.
2142
2143 If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
2144 enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
2145 C<NULL>) to -1.  If those warnings are off, the computed value, if well-defined
2146 (or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
2147 C<*retlen> is set (if C<retlen> isn't C<NULL>) so that (S<C<s> + C<*retlen>>) is
2148 the next possible position in C<s> that could begin a non-malformed character.
2149 See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
2150 returned.
2151
2152 =cut
2153
2154 Also implemented as a macro in utf8.h
2155
2156 */
2157
2158
2159 UV
2160 Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
2161 {
2162     PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
2163
2164     return utf8_to_uvchr_buf_helper(s, send, retlen);
2165 }
2166
2167 /*
2168 =for apidoc utf8_length
2169
2170 Returns the number of characters in the sequence of UTF-8-encoded bytes starting
2171 at C<s> and ending at the byte just before C<e>.  If <s> and <e> point to the
2172 same place, it returns 0 with no warning raised.
2173
2174 If C<e E<lt> s> or if the scan would end up past C<e>, it raises a UTF8 warning
2175 and returns the number of valid characters.
2176
2177 =cut
2178 */
2179
2180 STRLEN
2181 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
2182 {
2183     STRLEN len = 0;
2184
2185     PERL_ARGS_ASSERT_UTF8_LENGTH;
2186
2187     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
2188      * the bitops (especially ~) can create illegal UTF-8.
2189      * In other words: in Perl UTF-8 is not just for Unicode. */
2190
2191     while (s < e) {
2192         Ptrdiff_t expected_byte_count = UTF8SKIP(s);
2193
2194         if (UNLIKELY(e - s  < expected_byte_count)) {
2195             goto warn_and_return;
2196         }
2197
2198         len++;
2199         s += expected_byte_count;
2200     }
2201
2202     if (LIKELY(e == s)) {
2203         return len;
2204     }
2205
2206     /* Here, s > e on entry */
2207
2208   warn_and_return:
2209     if (PL_op)
2210         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
2211                          "%s in %s", unees, OP_DESC(PL_op));
2212     else
2213         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
2214
2215     return len;
2216 }
2217
2218 /*
2219 =for apidoc bytes_cmp_utf8
2220
2221 Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
2222 sequence of characters (stored as UTF-8)
2223 in C<u>, C<ulen>.  Returns 0 if they are
2224 equal, -1 or -2 if the first string is less than the second string, +1 or +2
2225 if the first string is greater than the second string.
2226
2227 -1 or +1 is returned if the shorter string was identical to the start of the
2228 longer string.  -2 or +2 is returned if
2229 there was a difference between characters
2230 within the strings.
2231
2232 =cut
2233 */
2234
2235 int
2236 Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
2237 {
2238     const U8 *const bend = b + blen;
2239     const U8 *const uend = u + ulen;
2240
2241     PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
2242
2243     while (b < bend && u < uend) {
2244         U8 c = *u++;
2245         if (!UTF8_IS_INVARIANT(c)) {
2246             if (UTF8_IS_DOWNGRADEABLE_START(c)) {
2247                 if (u < uend) {
2248                     U8 c1 = *u++;
2249                     if (UTF8_IS_CONTINUATION(c1)) {
2250                         c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
2251                     } else {
2252                         /* diag_listed_as: Malformed UTF-8 character%s */
2253                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
2254                               "%s %s%s",
2255                               unexpected_non_continuation_text(u - 2, 2, 1, 2),
2256                               PL_op ? " in " : "",
2257                               PL_op ? OP_DESC(PL_op) : "");
2258                         return -2;
2259                     }
2260                 } else {
2261                     if (PL_op)
2262                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
2263                                          "%s in %s", unees, OP_DESC(PL_op));
2264                     else
2265                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
2266                     return -2; /* Really want to return undef :-)  */
2267                 }
2268             } else {
2269                 return -2;
2270             }
2271         }
2272         if (*b != c) {
2273             return *b < c ? -2 : +2;
2274         }
2275         ++b;
2276     }
2277
2278     if (b == bend && u == uend)
2279         return 0;
2280
2281     return b < bend ? +1 : -1;
2282 }
2283
2284 /*
2285 =for apidoc utf8_to_bytes
2286
2287 Converts a string C<"s"> of length C<*lenp> from UTF-8 into native byte encoding.
2288 Unlike L</bytes_to_utf8>, this over-writes the original string, and
2289 updates C<*lenp> to contain the new length.
2290 Returns zero on failure (leaving C<"s"> unchanged) setting C<*lenp> to -1.
2291
2292 Upon successful return, the number of variants in the string can be computed by
2293 having saved the value of C<*lenp> before the call, and subtracting the
2294 after-call value of C<*lenp> from it.
2295
2296 If you need a copy of the string, see L</bytes_from_utf8>.
2297
2298 =cut
2299 */
2300
2301 U8 *
2302 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp)
2303 {
2304     U8 * first_variant;
2305
2306     PERL_ARGS_ASSERT_UTF8_TO_BYTES;
2307     PERL_UNUSED_CONTEXT;
2308
2309     /* This is a no-op if no variants at all in the input */
2310     if (is_utf8_invariant_string_loc(s, *lenp, (const U8 **) &first_variant)) {
2311         return s;
2312     }
2313
2314     /* Nothing before 'first_variant' needs to be changed, so start the real
2315      * work there */
2316
2317     U8 * const save = s;
2318     U8 * const send = s + *lenp;
2319     U8 * d;
2320
2321 #ifndef EBCDIC      /* The below relies on the bit patterns of UTF-8 */
2322
2323     /* There is some start-up/tear-down overhead with this, so no real gain
2324      * unless the string is long enough.  The current value is just a
2325      * guess. */
2326     if (*lenp > 5 * PERL_WORDSIZE) {
2327
2328         /* First, go through the string a word at-a-time to verify that it is
2329          * downgradable.  If it contains any start byte besides C2 and C3, then
2330          * it isn't. */
2331
2332         const PERL_UINTMAX_T C0_mask = PERL_COUNT_MULTIPLIER * 0xC0;
2333         const PERL_UINTMAX_T C2_mask = PERL_COUNT_MULTIPLIER * 0xC2;
2334         const PERL_UINTMAX_T FE_mask = PERL_COUNT_MULTIPLIER * 0xFE;
2335
2336         /* Points to the first byte >=s which is positioned at a word boundary.
2337          * If s is on a word boundary, it is s, otherwise it is the first byte
2338          * of the next word. */
2339         U8 * partial_word_end = s + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
2340                                 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK);
2341
2342         /* Here there is at least a full word beyond the first word boundary.
2343          * Process up to that boundary. */
2344         while (s < partial_word_end) {
2345             if (! UTF8_IS_INVARIANT(*s)) {
2346                 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
2347                     *lenp = ((STRLEN) -1);
2348                     return NULL;
2349                 }
2350                 s++;
2351             }
2352             s++;
2353         }
2354
2355         /* Adjust back down any overshoot */
2356         s = partial_word_end;
2357
2358         /* Process per-word */
2359         do {
2360
2361             PERL_UINTMAX_T C2_C3_start_bytes;
2362
2363             /* First find the bytes that are start bytes.  ANDing with
2364              * C0C0...C0 causes any start byte to become C0; any other byte
2365              * becomes something else.  Then XORing with C0 causes any start
2366              * byte to become 0; all other bytes non-zero. */
2367             PERL_UINTMAX_T start_bytes
2368                           = ((* (PERL_UINTMAX_T *) s) & C0_mask) ^ C0_mask;
2369
2370             /* These shifts causes the most significant bit to be set to 1 for
2371              * any bytes in the word that aren't completely 0.  Hence after
2372              * these, only the start bytes have 0 in their msb */
2373             start_bytes |= start_bytes << 1;
2374             start_bytes |= start_bytes << 2;
2375             start_bytes |= start_bytes << 4;
2376
2377             /* When we complement, then AND with 8080...80, the start bytes
2378              * will have 1 in their msb, and all other bits are 0 */
2379             start_bytes = ~ start_bytes & PERL_VARIANTS_WORD_MASK;
2380
2381             /* Now repeat the procedure, but look for bytes that match only
2382              * C2-C3. */
2383             C2_C3_start_bytes = ((* (PERL_UINTMAX_T *) s) & FE_mask)
2384                                                                 ^ C2_mask;
2385             C2_C3_start_bytes |= C2_C3_start_bytes << 1;
2386             C2_C3_start_bytes |= C2_C3_start_bytes << 2;
2387             C2_C3_start_bytes |= C2_C3_start_bytes << 4;
2388             C2_C3_start_bytes = ~ C2_C3_start_bytes
2389                                 & PERL_VARIANTS_WORD_MASK;
2390
2391             /* Here, start_bytes has a 1 in the msb of each byte that has a
2392              *                                              start_byte; And
2393              * C2_C3_start_bytes has a 1 in the msb of each byte that has a
2394              *                                       start_byte of C2 or C3
2395              * If they're not equal, there are start bytes that aren't C2
2396              * nor C3, hence this is not downgradable */
2397             if (start_bytes != C2_C3_start_bytes) {
2398                 *lenp = ((STRLEN) -1);
2399                 return NULL;
2400             }
2401
2402             s += PERL_WORDSIZE;
2403         } while (s + PERL_WORDSIZE <= send);
2404
2405         /* If the final byte was a start byte, it means that the character
2406          * straddles two words, so back off one to start looking below at the
2407          * first byte of the character  */
2408         if (s > first_variant && UTF8_IS_START(*(s-1))) {
2409             s--;
2410         }
2411     }
2412
2413 #endif
2414
2415     /* Do the straggler bytes beyond the final word boundary (or all bytes
2416      * in the case of EBCDIC) */
2417     while (s < send) {
2418         if (! UTF8_IS_INVARIANT(*s)) {
2419             if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
2420                 *lenp = ((STRLEN) -1);
2421                 return NULL;
2422             }
2423             s++;
2424         }
2425         s++;
2426     }
2427
2428     /* Here, we passed the tests above.  For the EBCDIC case, everything
2429      * was well-formed and can be downgraded to non-UTF8.  For non-EBCDIC,
2430      * it means only that all start bytes were C2 or C3, hence any
2431      * well-formed sequences are downgradable.  But we didn't test, for
2432      * example, that there weren't two C2's in a row.  That means that in
2433      * the loop below, we have to be sure things are well-formed.  Because
2434      * this is very very likely, and we don't care about having speedy
2435      * handling of malformed input, the loop proceeds as if well formed,
2436      * and should a malformed one come along, it undoes what it already has
2437      * done */
2438
2439     d = s = first_variant;
2440
2441     while (s < send) {
2442         U8 * s1;
2443
2444         if (UVCHR_IS_INVARIANT(*s)) {
2445             *d++ = *s++;
2446             continue;
2447         }
2448
2449         /* Here it is two-byte encoded. */
2450         if (   LIKELY(UTF8_IS_DOWNGRADEABLE_START(*s))
2451             && LIKELY(UTF8_IS_CONTINUATION((s[1]))))
2452         {
2453             U8 first_byte = *s++;
2454             *d++ = EIGHT_BIT_UTF8_TO_NATIVE(first_byte, *s);
2455             s++;
2456             continue;
2457         }
2458
2459         /* Here, it is malformed.  This shouldn't happen on EBCDIC, and on
2460          * ASCII platforms, we know that the only start bytes in the text
2461          * are C2 and C3, and the code above has made sure that it doesn't
2462          * end with a start byte.  That means the only malformations that
2463          * are possible are a start byte without a continuation (either
2464          * followed by another start byte or an invariant) or an unexpected
2465          * continuation.
2466          *
2467          * We have to undo all we've done before, back down to the first
2468          * UTF-8 variant.  Note that each 2-byte variant we've done so far
2469          * (converted to single byte) slides things to the left one byte,
2470          * and so we have bytes that haven't been written over.
2471          *
2472          * Here, 'd' points to the next position to overwrite, and 's'
2473          * points to the first invalid byte.  That means 'd's contents
2474          * haven't been changed yet, nor has anything else beyond it in the
2475          * string.  In restoring to the original contents, we don't need to
2476          * do anything past (d-1).
2477          *
2478          * In particular, the bytes from 'd' to 's' have not been changed.
2479          * This loop uses a new variable 's1' (to avoid confusing 'source'
2480          * and 'destination') set to 'd',  and moves 's' and 's1' in lock
2481          * step back so that afterwards, 's1' points to the first changed
2482          * byte that will be the source for the first byte (or bytes) at
2483          * 's' that need to be changed back.  Note that s1 can expand to
2484          * two bytes */
2485         s1 = d;
2486         while (s >= d) {
2487             s--;
2488             if (! UVCHR_IS_INVARIANT(*s1)) {
2489                 s--;
2490             }
2491             s1--;
2492         }
2493
2494         /* Do the changing back */
2495         while (s1 >= first_variant) {
2496             if (UVCHR_IS_INVARIANT(*s1)) {
2497                 *s-- = *s1--;
2498             }
2499             else {
2500                 *s-- = UTF8_EIGHT_BIT_LO(*s1);
2501                 *s-- = UTF8_EIGHT_BIT_HI(*s1);
2502                 s1--;
2503             }
2504         }
2505
2506         *lenp = ((STRLEN) -1);
2507         return NULL;
2508     }
2509
2510     /* Success! */
2511     *d = '\0';
2512     *lenp = d - save;
2513
2514     return save;
2515 }
2516
2517 /*
2518 =for apidoc bytes_from_utf8
2519
2520 Converts a potentially UTF-8 encoded string C<s> of length C<*lenp> into native
2521 byte encoding.  On input, the boolean C<*is_utf8p> gives whether or not C<s> is
2522 actually encoded in UTF-8.
2523
2524 Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, this is non-destructive of
2525 the input string.
2526
2527 Do nothing if C<*is_utf8p> is 0, or if there are code points in the string
2528 not expressible in native byte encoding.  In these cases, C<*is_utf8p> and
2529 C<*lenp> are unchanged, and the return value is the original C<s>.
2530
2531 Otherwise, C<*is_utf8p> is set to 0, and the return value is a pointer to a
2532 newly created string containing a downgraded copy of C<s>, and whose length is
2533 returned in C<*lenp>, updated.  The new string is C<NUL>-terminated.  The
2534 caller is responsible for arranging for the memory used by this string to get
2535 freed.
2536
2537 Upon successful return, the number of variants in the string can be computed by
2538 having saved the value of C<*lenp> before the call, and subtracting the
2539 after-call value of C<*lenp> from it.
2540
2541 =cut
2542
2543 There is a macro that avoids this function call, but this is retained for
2544 anyone who calls it with the Perl_ prefix */
2545
2546 U8 *
2547 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p)
2548 {
2549     PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
2550     PERL_UNUSED_CONTEXT;
2551
2552     return bytes_from_utf8_loc(s, lenp, is_utf8p, NULL);
2553 }
2554
2555 /*
2556 =for apidoc bytes_from_utf8_loc
2557
2558 Like C<L<perlapi/bytes_from_utf8>()>, but takes an extra parameter, a pointer
2559 to where to store the location of the first character in C<"s"> that cannot be
2560 converted to non-UTF8.
2561
2562 If that parameter is C<NULL>, this function behaves identically to
2563 C<bytes_from_utf8>.
2564
2565 Otherwise if C<*is_utf8p> is 0 on input, the function behaves identically to
2566 C<bytes_from_utf8>, except it also sets C<*first_non_downgradable> to C<NULL>.
2567
2568 Otherwise, the function returns a newly created C<NUL>-terminated string
2569 containing the non-UTF8 equivalent of the convertible first portion of
2570 C<"s">.  C<*lenp> is set to its length, not including the terminating C<NUL>.
2571 If the entire input string was converted, C<*is_utf8p> is set to a FALSE value,
2572 and C<*first_non_downgradable> is set to C<NULL>.
2573
2574 Otherwise, C<*first_non_downgradable> is set to point to the first byte of the
2575 first character in the original string that wasn't converted.  C<*is_utf8p> is
2576 unchanged.  Note that the new string may have length 0.
2577
2578 Another way to look at it is, if C<*first_non_downgradable> is non-C<NULL> and
2579 C<*is_utf8p> is TRUE, this function starts at the beginning of C<"s"> and
2580 converts as many characters in it as possible stopping at the first one it
2581 finds that can't be converted to non-UTF-8.  C<*first_non_downgradable> is
2582 set to point to that.  The function returns the portion that could be converted
2583 in a newly created C<NUL>-terminated string, and C<*lenp> is set to its length,
2584 not including the terminating C<NUL>.  If the very first character in the
2585 original could not be converted, C<*lenp> will be 0, and the new string will
2586 contain just a single C<NUL>.  If the entire input string was converted,
2587 C<*is_utf8p> is set to FALSE and C<*first_non_downgradable> is set to C<NULL>.
2588
2589 Upon successful return, the number of variants in the converted portion of the
2590 string can be computed by having saved the value of C<*lenp> before the call,
2591 and subtracting the after-call value of C<*lenp> from it.
2592
2593 =cut
2594
2595
2596 */
2597
2598 U8 *
2599 Perl_bytes_from_utf8_loc(const U8 *s, STRLEN *lenp, bool *is_utf8p, const U8** first_unconverted)
2600 {
2601     U8 *d;
2602     const U8 *original = s;
2603     U8 *converted_start;
2604     const U8 *send = s + *lenp;
2605
2606     PERL_ARGS_ASSERT_BYTES_FROM_UTF8_LOC;
2607
2608     if (! *is_utf8p) {
2609         if (first_unconverted) {
2610             *first_unconverted = NULL;
2611         }
2612
2613         return (U8 *) original;
2614     }
2615
2616     Newx(d, (*lenp) + 1, U8);
2617
2618     converted_start = d;
2619     while (s < send) {
2620         U8 c = *s++;
2621         if (! UTF8_IS_INVARIANT(c)) {
2622
2623             /* Then it is multi-byte encoded.  If the code point is above 0xFF,
2624              * have to stop now */
2625             if (UNLIKELY (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s - 1, send))) {
2626                 if (first_unconverted) {
2627                     *first_unconverted = s - 1;
2628                     goto finish_and_return;
2629                 }
2630                 else {
2631                     Safefree(converted_start);
2632                     return (U8 *) original;
2633                 }
2634             }
2635
2636             c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
2637             s++;
2638         }
2639         *d++ = c;
2640     }
2641
2642     /* Here, converted the whole of the input */
2643     *is_utf8p = FALSE;
2644     if (first_unconverted) {
2645         *first_unconverted = NULL;
2646     }
2647
2648   finish_and_return:
2649     *d = '\0';
2650     *lenp = d - converted_start;
2651
2652     /* Trim unused space */
2653     Renew(converted_start, *lenp + 1, U8);
2654
2655     return converted_start;
2656 }
2657
2658 /*
2659 =for apidoc bytes_to_utf8
2660
2661 Converts a string C<s> of length C<*lenp> bytes from the native encoding into
2662 UTF-8.
2663 Returns a pointer to the newly-created string, and sets C<*lenp> to
2664 reflect the new length in bytes.  The caller is responsible for arranging for
2665 the memory used by this string to get freed.
2666
2667 Upon successful return, the number of variants in the string can be computed by
2668 having saved the value of C<*lenp> before the call, and subtracting it from the
2669 after-call value of C<*lenp>.
2670
2671 A C<NUL> character will be written after the end of the string.
2672
2673 If you want to convert to UTF-8 from encodings other than
2674 the native (Latin1 or EBCDIC),
2675 see L</sv_recode_to_utf8>().
2676
2677 =cut
2678 */
2679
2680 U8*
2681 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
2682 {
2683     const U8 * const send = s + (*lenp);
2684     U8 *d;
2685     U8 *dst;
2686
2687     PERL_ARGS_ASSERT_BYTES_TO_UTF8;
2688     PERL_UNUSED_CONTEXT;
2689
2690     /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
2691     Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8);
2692     dst = d;
2693
2694     while (s < send) {
2695         append_utf8_from_native_byte(*s, &d);
2696         s++;
2697     }
2698
2699     *d = '\0';
2700     *lenp = d-dst;
2701
2702     return dst;
2703 }
2704
2705 /*
2706  * Convert native UTF-16 to UTF-8. Called via the more public functions
2707  * utf16_to_utf8() for big-endian and utf16_to_utf8_reversed() for
2708  * little-endian,
2709  *
2710  * 'p' is the UTF-16 input string, passed as a pointer to U8.
2711  * 'bytelen' is its length (must be even)
2712  * 'd' is the pointer to the destination buffer.  The caller must ensure that
2713  *     the space is large enough.  The maximum expansion factor is 2 times
2714  *     'bytelen'.  1.5 if never going to run on an EBCDIC box.
2715  * '*newlen' will contain the number of bytes this function filled of 'd'.
2716  * 'high_byte' is 0 if UTF-16BE; 1 if UTF-16LE
2717  * 'low_byte' is 1  if UTF-16BE; 0 if UTF-16LE
2718  *
2719  * The expansion factor is because UTF-16 requires 2 bytes for every code point
2720  * below 0x10000; otherwise 4 bytes.  UTF-8 requires 1-3 bytes for every code
2721  * point below 0x1000; otherwise 4 bytes.  UTF-EBCDIC requires 1-4 bytes for
2722  * every code point below 0x1000; otherwise 4-5 bytes.
2723  *
2724  * The worst case is where every code point is below U+10000, hence requiring 2
2725  * UTF-16 bytes, but is U+0800 or higher on ASCII platforms, requiring 3 UTF-8
2726  * bytes; or >= U+4000 on EBCDIC requiring 4 UTF-8 bytes.
2727  *
2728  * Do not use in-place. */
2729
2730 U8*
2731 Perl_utf16_to_utf8_base(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen,
2732                               const bool high_byte, /* Which of next two bytes is
2733                                                   high order */
2734                               const bool low_byte)
2735 {
2736     U8* pend;
2737     U8* dstart = d;
2738
2739     PERL_ARGS_ASSERT_UTF16_TO_UTF8_BASE;
2740
2741     if (bytelen & 1)
2742         Perl_croak(aTHX_ "panic: utf16_to_utf8%s: odd bytelen %" UVuf,
2743                 ((high_byte == 0) ? "" : "_reversed"), (UV)bytelen);
2744     pend = p + bytelen;
2745
2746     while (p < pend) {
2747
2748         /* Next 16 bits is what we want.  (The bool is cast to U8 because on
2749          * platforms where a bool is implemented as a signed char, a compiler
2750          * warning may be generated) */
2751         U32 uv = (p[(U8) high_byte] << 8) + p[(U8) low_byte];
2752         p += 2;
2753
2754         /* If it's a surrogate, we find the uv that the surrogate pair encodes.
2755          * */
2756         if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
2757
2758 #define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
2759 #define LAST_HIGH_SURROGATE  0xDBFF
2760 #define FIRST_LOW_SURROGATE  0xDC00
2761 #define LAST_LOW_SURROGATE   UNICODE_SURROGATE_LAST
2762 #define FIRST_IN_PLANE1      0x10000
2763
2764             if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
2765                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
2766             }
2767             else {
2768                 U32 low_surrogate = (p[(U8) high_byte] << 8) + p[(U8) low_byte];
2769                 if (UNLIKELY(! inRANGE(low_surrogate, FIRST_LOW_SURROGATE,
2770                                                        LAST_LOW_SURROGATE)))
2771                 {
2772                     Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
2773                 }
2774
2775                 p += 2;
2776
2777                 /* Here uv is the high surrogate.  Combine with low surrogate
2778                  * just computed to form the actual U32 code point.
2779                  *
2780                  * From https://unicode.org/faq/utf_bom.html#utf16-4 */
2781                 uv = FIRST_IN_PLANE1 + (uv << 10) - (FIRST_HIGH_SURROGATE << 10)
2782                                      + low_surrogate - FIRST_LOW_SURROGATE;
2783             }
2784         }
2785
2786         /* Here, 'uv' is the real U32 we want to find the UTF-8 of */
2787         d = uvchr_to_utf8(d, uv);
2788     }
2789
2790     *newlen = d - dstart;
2791     return d;
2792 }
2793
2794 U8*
2795 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
2796 {
2797     PERL_ARGS_ASSERT_UTF16_TO_UTF8;
2798
2799     return utf16_to_utf8(p, d, bytelen, newlen);
2800 }
2801
2802 U8*
2803 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
2804 {
2805     PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
2806
2807     return utf16_to_utf8_reversed(p, d, bytelen, newlen);
2808 }
2809
2810 /*
2811  * Convert UTF-8 to native UTF-16. Called via the macros utf8_to_utf16() for
2812  * big-endian and utf8_to_utf16_reversed() for little-endian,
2813  *
2814  * 's' is the UTF-8 input string, passed as a pointer to U8.
2815  * 'bytelen' is its length
2816  * 'd' is the pointer to the destination buffer, currently passed as U8 *.  The
2817  *     caller must ensure that the space is large enough.  The maximum
2818  *     expansion factor is 2 times 'bytelen'.  This happens when the input is
2819  *     entirely single-byte ASCII, expanding to two-byte UTF-16.
2820  * '*newlen' will contain the number of bytes this function filled of 'd'.
2821  * 'high_byte' is 0 if UTF-16BE; 1 if UTF-16LE
2822  * 'low_byte'  is 1 if UTF-16BE; 0 if UTF-16LE
2823  *
2824  * Do not use in-place. */
2825 U8*
2826 Perl_utf8_to_utf16_base(pTHX_ U8* s, U8* d, Size_t bytelen, Size_t *newlen,
2827                               const bool high_byte, /* Which of next two bytes
2828                                                        is high order */
2829                               const bool low_byte)
2830 {
2831     U8* send;
2832     U8* dstart = d;
2833
2834     PERL_ARGS_ASSERT_UTF8_TO_UTF16_BASE;
2835
2836     send = s + bytelen;
2837
2838     while (s < send) {
2839         STRLEN retlen;
2840         UV uv = utf8n_to_uvchr(s, send - s, &retlen,
2841                                /* No surrogates nor above-Unicode */
2842                                UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
2843
2844         /* The modern method is to keep going with malformed input,
2845          * substituting the REPLACEMENT CHARACTER */
2846         if (UNLIKELY(uv == 0 && *s != '\0')) {
2847             uv = UNICODE_REPLACEMENT;
2848         }
2849
2850         if (uv >= FIRST_IN_PLANE1) {    /* Requires a surrogate pair */
2851
2852             /* From https://unicode.org/faq/utf_bom.html#utf16-4 */
2853             U32 high_surrogate = (uv >> 10) - (FIRST_IN_PLANE1 >> 10)
2854                                + FIRST_HIGH_SURROGATE;
2855
2856             /* (The bool is cast to U8 because on platforms where a bool is
2857              * implemented as a signed char, a compiler warning may be
2858              * generated) */
2859             d[(U8) high_byte] = high_surrogate >> 8;
2860             d[(U8) low_byte]  = high_surrogate & nBIT_MASK(8);
2861             d += 2;
2862
2863             /* The low surrogate is the lower 10 bits plus the offset */
2864             uv &= nBIT_MASK(10);
2865             uv += FIRST_LOW_SURROGATE;
2866
2867             /* Drop down to output the low surrogate like it were a
2868              * non-surrogate */
2869         }
2870
2871         d[(U8) high_byte] = uv >> 8;
2872         d[(U8) low_byte] = uv & nBIT_MASK(8);
2873         d += 2;
2874
2875         s += retlen;
2876     }
2877
2878     *newlen = d - dstart;
2879     return d;
2880 }
2881
2882 bool
2883 Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
2884 {
2885     return _invlist_contains_cp(PL_XPosix_ptrs[classnum], c);
2886 }
2887
2888 bool
2889 Perl__is_uni_perl_idcont(pTHX_ UV c)
2890 {
2891     return _invlist_contains_cp(PL_utf8_perl_idcont, c);
2892 }
2893
2894 bool
2895 Perl__is_uni_perl_idstart(pTHX_ UV c)
2896 {
2897     return _invlist_contains_cp(PL_utf8_perl_idstart, c);
2898 }
2899
2900 UV
2901 Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp,
2902                                   const char S_or_s)
2903 {
2904     /* We have the latin1-range values compiled into the core, so just use
2905      * those, converting the result to UTF-8.  The only difference between upper
2906      * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
2907      * either "SS" or "Ss".  Which one to use is passed into the routine in
2908      * 'S_or_s' to avoid a test */
2909
2910     UV converted = toUPPER_LATIN1_MOD(c);
2911
2912     PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
2913
2914     assert(S_or_s == 'S' || S_or_s == 's');
2915
2916     if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
2917                                              characters in this range */
2918         *p = (U8) converted;
2919         *lenp = 1;
2920         return converted;
2921     }
2922
2923     /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
2924      * which it maps to one of them, so as to only have to have one check for
2925      * it in the main case */
2926     if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
2927         switch (c) {
2928             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
2929                 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
2930                 break;
2931             case MICRO_SIGN:
2932                 converted = GREEK_CAPITAL_LETTER_MU;
2933                 break;
2934 #if    UNICODE_MAJOR_VERSION > 2                                        \
2935    || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1           \
2936                                   && UNICODE_DOT_DOT_VERSION >= 8)
2937             case LATIN_SMALL_LETTER_SHARP_S:
2938                 *(p)++ = 'S';
2939                 *p = S_or_s;
2940                 *lenp = 2;
2941                 return 'S';
2942 #endif
2943             default:
2944                 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect"
2945                                  " '%c' to map to '%c'",
2946                                  c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
2947                 NOT_REACHED; /* NOTREACHED */
2948         }
2949     }
2950
2951     *(p)++ = UTF8_TWO_BYTE_HI(converted);
2952     *p = UTF8_TWO_BYTE_LO(converted);
2953     *lenp = 2;
2954
2955     return converted;
2956 }
2957
2958 /* If compiled on an early Unicode version, there may not be auxiliary tables
2959  * */
2960 #ifndef HAS_UC_AUX_TABLES
2961 #  define UC_AUX_TABLE_ptrs     NULL
2962 #  define UC_AUX_TABLE_lengths  NULL
2963 #endif
2964 #ifndef HAS_TC_AUX_TABLES
2965 #  define TC_AUX_TABLE_ptrs     NULL
2966 #  define TC_AUX_TABLE_lengths  NULL
2967 #endif
2968 #ifndef HAS_LC_AUX_TABLES
2969 #  define LC_AUX_TABLE_ptrs     NULL
2970 #  define LC_AUX_TABLE_lengths  NULL
2971 #endif
2972 #ifndef HAS_CF_AUX_TABLES
2973 #  define CF_AUX_TABLE_ptrs     NULL
2974 #  define CF_AUX_TABLE_lengths  NULL
2975 #endif
2976
2977 /* Call the function to convert a UTF-8 encoded character to the specified case.
2978  * Note that there may be more than one character in the result.
2979  * 's' is a pointer to the first byte of the input character
2980  * 'd' will be set to the first byte of the string of changed characters.  It
2981  *      needs to have space for UTF8_MAXBYTES_CASE+1 bytes
2982  * 'lenp' will be set to the length in bytes of the string of changed characters
2983  *
2984  * The functions return the ordinal of the first character in the string of
2985  * 'd' */
2986 #define CALL_UPPER_CASE(uv, s, d, lenp)                                     \
2987                 _to_utf8_case(uv, s, d, lenp, PL_utf8_toupper,              \
2988                                               Uppercase_Mapping_invmap,     \
2989                                               UC_AUX_TABLE_ptrs,            \
2990                                               UC_AUX_TABLE_lengths,         \
2991                                               "uppercase")
2992 #define CALL_TITLE_CASE(uv, s, d, lenp)                                     \
2993                 _to_utf8_case(uv, s, d, lenp, PL_utf8_totitle,              \
2994                                               Titlecase_Mapping_invmap,     \
2995                                               TC_AUX_TABLE_ptrs,            \
2996                                               TC_AUX_TABLE_lengths,         \
2997                                               "titlecase")
2998 #define CALL_LOWER_CASE(uv, s, d, lenp)                                     \
2999                 _to_utf8_case(uv, s, d, lenp, PL_utf8_tolower,              \
3000                                               Lowercase_Mapping_invmap,     \
3001                                               LC_AUX_TABLE_ptrs,            \
3002                                               LC_AUX_TABLE_lengths,         \
3003                                               "lowercase")
3004
3005
3006 /* This additionally has the input parameter 'specials', which if non-zero will
3007  * cause this to use the specials hash for folding (meaning get full case
3008  * folding); otherwise, when zero, this implies a simple case fold */
3009 #define CALL_FOLD_CASE(uv, s, d, lenp, specials)                            \
3010         (specials)                                                          \
3011         ?  _to_utf8_case(uv, s, d, lenp, PL_utf8_tofold,                    \
3012                                           Case_Folding_invmap,              \
3013                                           CF_AUX_TABLE_ptrs,                \
3014                                           CF_AUX_TABLE_lengths,             \
3015                                           "foldcase")                       \
3016         : _to_utf8_case(uv, s, d, lenp, PL_utf8_tosimplefold,               \
3017                                          Simple_Case_Folding_invmap,        \
3018                                          NULL, NULL,                        \
3019                                          "foldcase")
3020
3021 UV
3022 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
3023 {
3024     /* Convert the Unicode character whose ordinal is <c> to its uppercase
3025      * version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
3026      * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
3027      * the changed version may be longer than the original character.
3028      *
3029      * The ordinal of the first character of the changed version is returned
3030      * (but note, as explained above, that there may be more.) */
3031
3032     PERL_ARGS_ASSERT_TO_UNI_UPPER;
3033
3034     if (c < 256) {
3035         return _to_upper_title_latin1((U8) c, p, lenp, 'S');
3036     }
3037
3038     return CALL_UPPER_CASE(c, NULL, p, lenp);
3039 }
3040
3041 UV
3042 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
3043 {
3044     PERL_ARGS_ASSERT_TO_UNI_TITLE;
3045
3046     if (c < 256) {
3047         return _to_upper_title_latin1((U8) c, p, lenp, 's');
3048     }
3049
3050     return CALL_TITLE_CASE(c, NULL, p, lenp);
3051 }
3052
3053 STATIC U8
3054 S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp, const char dummy)
3055 {
3056     /* We have the latin1-range values compiled into the core, so just use
3057      * those, converting the result to UTF-8.  Since the result is always just
3058      * one character, we allow <p> to be NULL */
3059
3060     U8 converted = toLOWER_LATIN1(c);
3061
3062     PERL_UNUSED_ARG(dummy);
3063
3064     if (p != NULL) {
3065         if (NATIVE_BYTE_IS_INVARIANT(converted)) {
3066             *p = converted;
3067             *lenp = 1;
3068         }
3069         else {
3070             /* Result is known to always be < 256, so can use the EIGHT_BIT
3071              * macros */
3072             *p = UTF8_EIGHT_BIT_HI(converted);
3073             *(p+1) = UTF8_EIGHT_BIT_LO(converted);
3074             *lenp = 2;
3075         }
3076     }
3077     return converted;
3078 }
3079
3080 UV
3081 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
3082 {
3083     PERL_ARGS_ASSERT_TO_UNI_LOWER;
3084
3085     if (c < 256) {
3086         return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ );
3087     }
3088
3089     return CALL_LOWER_CASE(c, NULL, p, lenp);
3090 }
3091
3092 UV
3093 Perl__to_fold_latin1(const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
3094 {
3095     /* Corresponds to to_lower_latin1(); <flags> bits meanings:
3096      *      FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
3097      *      FOLD_FLAGS_FULL  iff full folding is to be used;
3098      *
3099      *  Not to be used for locale folds
3100      */
3101
3102     UV converted;
3103
3104     PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
3105
3106     assert (! (flags & FOLD_FLAGS_LOCALE));
3107
3108     if (UNLIKELY(c == MICRO_SIGN)) {
3109         converted = GREEK_SMALL_LETTER_MU;
3110     }
3111 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
3112    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
3113                                       || UNICODE_DOT_DOT_VERSION > 0)
3114     else if (   (flags & FOLD_FLAGS_FULL)
3115              && UNLIKELY(c == LATIN_SMALL_LETTER_SHARP_S))
3116     {
3117         /* If can't cross 127/128 boundary, can't return "ss"; instead return
3118          * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
3119          * under those circumstances. */
3120         if (flags & FOLD_FLAGS_NOMIX_ASCII) {
3121             *lenp = 2 * STRLENs(LATIN_SMALL_LETTER_LONG_S_UTF8);
3122             Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
3123                  p, *lenp, U8);
3124             return LATIN_SMALL_LETTER_LONG_S;
3125         }
3126         else {
3127             *(p)++ = 's';
3128             *p = 's';
3129             *lenp = 2;
3130             return 's';
3131         }
3132     }
3133 #endif
3134     else { /* In this range the fold of all other characters is their lower
3135               case */
3136         converted = toLOWER_LATIN1(c);
3137     }
3138
3139     if (UVCHR_IS_INVARIANT(converted)) {
3140         *p = (U8) converted;
3141         *lenp = 1;
3142     }
3143     else {
3144         *(p)++ = UTF8_TWO_BYTE_HI(converted);
3145         *p = UTF8_TWO_BYTE_LO(converted);
3146         *lenp = 2;
3147     }
3148
3149     return converted;
3150 }
3151
3152 UV
3153 Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
3154 {
3155
3156     /* Not currently externally documented, and subject to change
3157      *  <flags> bits meanings:
3158      *      FOLD_FLAGS_FULL  iff full folding is to be used;
3159      *      FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
3160      *                        locale are to be used.
3161      *      FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
3162      */
3163
3164     PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
3165
3166     if (flags & FOLD_FLAGS_LOCALE) {
3167         /* Treat a non-Turkic UTF-8 locale as not being in locale at all,
3168          * except for potentially warning */
3169         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
3170         if (IN_UTF8_CTYPE_LOCALE && ! PL_in_utf8_turkic_locale) {
3171             flags &= ~FOLD_FLAGS_LOCALE;
3172         }
3173         else {
3174             goto needs_full_generality;
3175         }
3176     }
3177
3178     if (c < 256) {
3179         return _to_fold_latin1((U8) c, p, lenp,
3180                             flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
3181     }
3182
3183     /* Here, above 255.  If no special needs, just use the macro */
3184     if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
3185         return CALL_FOLD_CASE(c, NULL, p, lenp, flags & FOLD_FLAGS_FULL);
3186     }
3187     else {  /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with
3188                the special flags. */
3189         U8 utf8_c[UTF8_MAXBYTES + 1];
3190
3191       needs_full_generality:
3192         uvchr_to_utf8(utf8_c, c);
3193         return _toFOLD_utf8_flags(utf8_c, utf8_c + C_ARRAY_LENGTH(utf8_c),
3194                                   p, lenp, flags);
3195     }
3196 }
3197
3198 PERL_STATIC_INLINE bool
3199 S_is_utf8_common(pTHX_ const U8 *const p, const U8 * const e,
3200                        SV* const invlist)
3201 {
3202     /* returns a boolean giving whether or not the UTF8-encoded character that
3203      * starts at <p>, and extending no further than <e - 1> is in the inversion
3204      * list <invlist>. */
3205
3206     UV cp = utf8n_to_uvchr(p, e - p, NULL, 0);
3207
3208     PERL_ARGS_ASSERT_IS_UTF8_COMMON;
3209
3210     if (cp == 0 && (p >= e || *p != '\0')) {
3211         _force_out_malformed_utf8_message(p, e, 0, 1);
3212         NOT_REACHED; /* NOTREACHED */
3213     }
3214
3215     assert(invlist);
3216     return _invlist_contains_cp(invlist, cp);
3217 }
3218
3219 #if 0   /* Not currently used, but may be needed in the future */
3220 PERLVAR(I, seen_deprecated_macro, HV *)
3221
3222 STATIC void
3223 S_warn_on_first_deprecated_use(pTHX_ const char * const name,
3224                                      const char * const alternative,
3225                                      const bool use_locale,
3226                                      const char * const file,
3227                                      const unsigned line)
3228 {
3229     const char * key;
3230
3231     PERL_ARGS_ASSERT_WARN_ON_FIRST_DEPRECATED_USE;
3232
3233     if (ckWARN_d(WARN_DEPRECATED)) {
3234
3235         key = Perl_form(aTHX_ "%s;%d;%s;%d", name, use_locale, file, line);
3236         if (! hv_fetch(PL_seen_deprecated_macro, key, strlen(key), 0)) {
3237             if (! PL_seen_deprecated_macro) {
3238                 PL_seen_deprecated_macro = newHV();
3239             }
3240             if (! hv_store(PL_seen_deprecated_macro, key,
3241                            strlen(key), &PL_sv_undef, 0))
3242             {
3243                 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3244             }
3245
3246             if (instr(file, "mathoms.c")) {
3247                 Perl_warner(aTHX_ WARN_DEPRECATED,
3248                             "In %s, line %d, starting in Perl v5.32, %s()"
3249                             " will be removed.  Avoid this message by"
3250                             " converting to use %s().\n",
3251                             file, line, name, alternative);
3252             }
3253             else {
3254                 Perl_warner(aTHX_ WARN_DEPRECATED,
3255                             "In %s, line %d, starting in Perl v5.32, %s() will"
3256                             " require an additional parameter.  Avoid this"
3257                             " message by converting to use %s().\n",
3258                             file, line, name, alternative);
3259             }
3260         }
3261     }
3262 }
3263 #endif
3264
3265 bool
3266 Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p, const U8 * const e)
3267 {
3268     PERL_ARGS_ASSERT__IS_UTF8_FOO;
3269
3270     return is_utf8_common(p, e, PL_XPosix_ptrs[classnum]);
3271 }
3272
3273 bool
3274 Perl__is_utf8_perl_idstart(pTHX_ const U8 *p, const U8 * const e)
3275 {
3276     PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
3277
3278     return is_utf8_common(p, e, PL_utf8_perl_idstart);
3279 }
3280
3281 bool
3282 Perl__is_utf8_perl_idcont(pTHX_ const U8 *p, const U8 * const e)
3283 {
3284     PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT;
3285
3286     return is_utf8_common(p, e, PL_utf8_perl_idcont);
3287 }
3288
3289 STATIC UV
3290 S_to_case_cp_list(pTHX_
3291                   const UV original,
3292                   const U32 ** const remaining_list,
3293                   Size_t * remaining_count,
3294                   SV *invlist, const I32 * const invmap,
3295                   const U32 * const * const aux_tables,
3296                   const U8 * const aux_table_lengths,
3297                   const char * const normal)
3298 {
3299     SSize_t index;
3300     I32 base;
3301
3302     /* Calculate the changed case of code point 'original'.  The first code
3303      * point of the changed case is returned.
3304      *
3305      * If 'remaining_count' is not NULL, *remaining_count will be set to how
3306      * many *other* code points are in the changed case.  If non-zero and
3307      * 'remaining_list' is also not NULL, *remaining_list will be set to point
3308      * to a non-modifiable array containing the second and potentially third
3309      * code points in the changed case.  (Unicode guarantees a maximum of 3.)
3310      * Note that this means that *remaining_list is undefined unless there are
3311      * multiple code points, and the caller has chosen to find out how many by
3312      * making 'remaining_count' not NULL.
3313      *
3314      * 'normal' is a string to use to name the new case in any generated
3315      * messages, as a fallback if the operation being used is not available.
3316      *
3317      * The casing to use is given by the data structures in the remaining
3318      * arguments.
3319      */
3320
3321     PERL_ARGS_ASSERT_TO_CASE_CP_LIST;
3322
3323     /* 'index' is guaranteed to be non-negative, as this is an inversion map
3324      * that covers all possible inputs.  See [perl #133365] */
3325     index = _invlist_search(invlist, original);
3326     base = invmap[index];
3327
3328     /* Most likely, the case change will contain just a single code point */
3329     if (remaining_count) {
3330         *remaining_count = 0;
3331     }
3332
3333     if (LIKELY(base == 0)) {    /* 0 => original was unchanged by casing */
3334
3335         /* At this bottom level routine is where we warn about illegal code
3336          * points */
3337         if (isUNICODE_POSSIBLY_PROBLEMATIC(original)) {
3338             if (UNLIKELY(UNICODE_IS_SURROGATE(original))) {
3339                 if (ckWARN_d(WARN_SURROGATE)) {
3340                     const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
3341                     Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
3342                         "Operation \"%s\" returns its argument for"
3343                         " UTF-16 surrogate U+%04" UVXf, desc, original);
3344                 }
3345             }
3346             else if (UNLIKELY(UNICODE_IS_SUPER(original))) {
3347                 if (UNLIKELY(original > MAX_LEGAL_CP)) {
3348                     Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, original));
3349                 }
3350                 if (ckWARN_d(WARN_NON_UNICODE)) {
3351                     const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
3352                     Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
3353                         "Operation \"%s\" returns its argument for"
3354                         " non-Unicode code point 0x%04" UVXf, desc, original);
3355                 }
3356             }
3357
3358             /* Note that non-characters are perfectly legal, so no warning
3359              * should be given. */
3360         }
3361
3362         return original;
3363     }
3364
3365     if (LIKELY(base > 0)) {  /* means original mapped to a single code point,
3366                                 different from itself */
3367         return base + original - invlist_array(invlist)[index];
3368     }
3369
3370     /* Here 'base' is negative.  That means the mapping is 1-to-many, and
3371      * requires an auxiliary table look up.  abs(base) gives the index into a
3372      * list of such tables which points to the proper aux table.  And a
3373      * parallel list gives the length of each corresponding aux table.  Skip
3374      * the first entry in the *remaining returns, as it is returned by the
3375      * function. */
3376     base = -base;
3377     if (remaining_count) {
3378         *remaining_count = (Size_t) (aux_table_lengths[base] - 1);
3379
3380         if (remaining_list) {
3381             *remaining_list  = aux_tables[base] + 1;
3382         }
3383     }
3384
3385     return (UV) aux_tables[base][0];
3386 }
3387
3388 STATIC UV
3389 S__to_utf8_case(pTHX_ const UV original, const U8 *p,
3390                       U8* ustrp, STRLEN *lenp,
3391                       SV *invlist, const I32 * const invmap,
3392                       const U32 * const * const aux_tables,
3393                       const U8 * const aux_table_lengths,
3394                       const char * const normal)
3395 {
3396     /* Change the case of code point 'original'.  If 'p' is non-NULL, it points to
3397      * the beginning of the (assumed to be valid) UTF-8 representation of
3398      * 'original'.  'normal' is a string to use to name the new case in any
3399      * generated messages, as a fallback if the operation being used is not
3400      * available.  The new case is given by the data structures in the
3401      * remaining arguments.
3402      *
3403      * On return 'ustrp' points to '*lenp' UTF-8 encoded bytes representing the
3404      * entire changed case string, and the return value is the first code point
3405      * in that string
3406      *
3407      * Note that the <ustrp> needs to be at least UTF8_MAXBYTES_CASE+1 bytes
3408      * since the changed version may be longer than the original character. */
3409
3410     const U32 * remaining_list;
3411     Size_t remaining_count;
3412     UV first = to_case_cp_list(original,
3413                                &remaining_list, &remaining_count,
3414                                invlist, invmap,
3415                                aux_tables, aux_table_lengths,
3416                                normal);
3417
3418     PERL_ARGS_ASSERT__TO_UTF8_CASE;
3419
3420     /* If the code point maps to itself and we already have its representation,
3421      * copy it instead of recalculating */
3422     if (original == first && p) {
3423         *lenp = UTF8SKIP(p);
3424
3425         if (p != ustrp) {   /* Don't copy onto itself */
3426             Copy(p, ustrp, *lenp, U8);
3427         }
3428     }
3429     else {
3430         U8 * d = ustrp;
3431         Size_t i;
3432
3433         d = uvchr_to_utf8(d, first);
3434
3435         for (i = 0; i < remaining_count; i++) {
3436             d = uvchr_to_utf8(d, remaining_list[i]);
3437         }
3438
3439         *d = '\0';
3440         *lenp = d - ustrp;
3441     }
3442
3443     return first;
3444 }
3445
3446 Size_t
3447 Perl__inverse_folds(pTHX_ const UV cp, U32 * first_folds_to,
3448                           const U32 ** remaining_folds_to)
3449 {
3450     /* Returns the count of the number of code points that fold to the input
3451      * 'cp' (besides itself).
3452      *
3453      * If the return is 0, there is nothing else that folds to it, and
3454      * '*first_folds_to' is set to 0, and '*remaining_folds_to' is set to NULL.
3455      *
3456      * If the return is 1, '*first_folds_to' is set to the single code point,
3457      * and '*remaining_folds_to' is set to NULL.
3458      *
3459      * Otherwise, '*first_folds_to' is set to a code point, and
3460      * '*remaining_fold_to' is set to an array that contains the others.  The
3461      * length of this array is the returned count minus 1.
3462      *
3463      * The reason for this convolution is to avoid having to deal with
3464      * allocating and freeing memory.  The lists are already constructed, so
3465      * the return can point to them, but single code points aren't, so would
3466      * need to be constructed if we didn't employ something like this API
3467      *
3468      * The code points returned by this function are all legal Unicode, which
3469      * occupy at most 21 bits, and so a U32 is sufficient, and the lists are
3470      * constructed with this size (to save space and memory), and we return
3471      * pointers, so they must be this size */
3472
3473     /* 'index' is guaranteed to be non-negative, as this is an inversion map
3474      * that covers all possible inputs.  See [perl #133365] */
3475     SSize_t index = _invlist_search(PL_utf8_foldclosures, cp);
3476     I32 base = _Perl_IVCF_invmap[index];
3477
3478     PERL_ARGS_ASSERT__INVERSE_FOLDS;
3479
3480     if (base == 0) {            /* No fold */
3481         *first_folds_to = 0;
3482         *remaining_folds_to = NULL;
3483         return 0;
3484     }
3485
3486 #ifndef HAS_IVCF_AUX_TABLES     /* This Unicode version only has 1-1 folds */
3487
3488     assert(base > 0);
3489
3490 #else
3491
3492     if (UNLIKELY(base < 0)) {   /* Folds to more than one character */
3493
3494         /* The data structure is set up so that the absolute value of 'base' is
3495          * an index into a table of pointers to arrays, with the array
3496          * corresponding to the index being the list of code points that fold
3497          * to 'cp', and the parallel array containing the length of the list
3498          * array */
3499         *first_folds_to = IVCF_AUX_TABLE_ptrs[-base][0];
3500         *remaining_folds_to = IVCF_AUX_TABLE_ptrs[-base] + 1;
3501                                                 /* +1 excludes first_folds_to */
3502         return IVCF_AUX_TABLE_lengths[-base];
3503     }
3504
3505 #endif
3506
3507     /* Only the single code point.  This works like 'fc(G) = G - A + a' */
3508     *first_folds_to = (U32) (base + cp
3509                                   - invlist_array(PL_utf8_foldclosures)[index]);
3510     *remaining_folds_to = NULL;
3511     return 1;
3512 }
3513
3514 STATIC UV
3515 S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result,
3516                                        U8* const ustrp, STRLEN *lenp)
3517 {
3518     /* This is called when changing the case of a UTF-8-encoded character above
3519      * the Latin1 range, and the operation is in a non-UTF-8 locale.  If the
3520      * result contains a character that crosses the 255/256 boundary, disallow
3521      * the change, and return the original code point.  See L<perlfunc/lc> for
3522      * why;
3523      *
3524      * p        points to the original string whose case was changed; assumed
3525      *          by this routine to be well-formed
3526      * result   the code point of the first character in the changed-case string
3527      * ustrp    points to the changed-case string (<result> represents its
3528      *          first char)
3529      * lenp     points to the length of <ustrp> */
3530
3531     UV original;    /* To store the first code point of <p> */
3532
3533     PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
3534
3535     assert(UTF8_IS_ABOVE_LATIN1(*p));
3536
3537     /* We know immediately if the first character in the string crosses the
3538      * boundary, so can skip testing */
3539     if (result > 255) {
3540
3541         /* Look at every character in the result; if any cross the
3542         * boundary, the whole thing is disallowed */
3543         U8* s = ustrp + UTF8SKIP(ustrp);
3544         U8* e = ustrp + *lenp;
3545         while (s < e) {
3546             if (! UTF8_IS_ABOVE_LATIN1(*s)) {
3547                 goto bad_crossing;
3548             }
3549             s += UTF8SKIP(s);
3550         }
3551
3552         /* Here, no characters crossed, result is ok as-is, but we warn. */
3553         _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
3554         return result;
3555     }
3556
3557   bad_crossing:
3558
3559     /* Failed, have to return the original */
3560     original = valid_utf8_to_uvchr(p, lenp);
3561
3562     /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3563     Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3564                            "Can't do %s(\"\\x{%" UVXf "}\") on non-UTF-8"
3565                            " locale; resolved to \"\\x{%" UVXf "}\".",
3566                            OP_DESC(PL_op),
3567                            original,
3568                            original);
3569     Copy(p, ustrp, *lenp, char);
3570     return original;
3571 }
3572
3573 STATIC UV
3574 S_turkic_fc(pTHX_ const U8 * const p, const U8 * const e,
3575                         U8 * ustrp, STRLEN *lenp)
3576 {
3577     /* Returns 0 if the foldcase of the input UTF-8 encoded sequence from
3578      * p0..e-1 according to Turkic rules is the same as for non-Turkic.
3579      * Otherwise, it returns the first code point of the Turkic foldcased
3580      * sequence, and the entire sequence will be stored in *ustrp.  ustrp will
3581      * contain *lenp bytes
3582      *
3583      * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER
3584      * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER
3585      * DOTLESS I */
3586
3587     PERL_ARGS_ASSERT_TURKIC_FC;
3588     assert(e > p);
3589
3590     if (UNLIKELY(*p == 'I')) {
3591         *lenp = 2;
3592         ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_SMALL_LETTER_DOTLESS_I);
3593         ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_SMALL_LETTER_DOTLESS_I);
3594         return LATIN_SMALL_LETTER_DOTLESS_I;
3595     }
3596
3597     if (UNLIKELY(memBEGINs(p, e - p,
3598                            LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)))
3599     {
3600         *lenp = 1;
3601         *ustrp = 'i';
3602         return 'i';
3603     }
3604
3605     return 0;
3606 }
3607
3608 STATIC UV
3609 S_turkic_lc(pTHX_ const U8 * const p0, const U8 * const e,
3610                         U8 * ustrp, STRLEN *lenp)
3611 {
3612     /* Returns 0 if the lowercase of the input UTF-8 encoded sequence from
3613      * p0..e-1 according to Turkic rules is the same as for non-Turkic.
3614      * Otherwise, it returns the first code point of the Turkic lowercased
3615      * sequence, and the entire sequence will be stored in *ustrp.  ustrp will
3616      * contain *lenp bytes */
3617
3618     PERL_ARGS_ASSERT_TURKIC_LC;
3619     assert(e > p0);
3620
3621     /* A 'I' requires context as to what to do */
3622     if (UNLIKELY(*p0 == 'I')) {
3623         const U8 * p = p0 + 1;
3624
3625         /* According to the Unicode SpecialCasing.txt file, a capital 'I'
3626          * modified by a dot above lowercases to 'i' even in turkic locales. */
3627         while (p < e) {
3628             UV cp;
3629
3630             if (memBEGINs(p, e - p, COMBINING_DOT_ABOVE_UTF8)) {
3631                 ustrp[0] = 'i';
3632                 *lenp = 1;
3633                 return 'i';
3634             }
3635
3636             /* For the dot above to modify the 'I', it must be part of a
3637              * combining sequence immediately following the 'I', and no other
3638              * modifier with a ccc of 230 may intervene */
3639             cp = utf8_to_uvchr_buf(p, e, NULL);
3640             if (! _invlist_contains_cp(PL_CCC_non0_non230, cp)) {
3641                 break;
3642             }
3643
3644             /* Here the combining sequence continues */
3645             p += UTF8SKIP(p);
3646         }
3647     }
3648
3649     /* In all other cases the lc is the same as the fold */
3650     return turkic_fc(p0, e, ustrp, lenp);
3651 }
3652
3653 STATIC UV
3654 S_turkic_uc(pTHX_ const U8 * const p, const U8 * const e,
3655                         U8 * ustrp, STRLEN *lenp)
3656 {
3657     /* Returns 0 if the upper or title-case of the input UTF-8 encoded sequence
3658      * from p0..e-1 according to Turkic rules is the same as for non-Turkic.
3659      * Otherwise, it returns the first code point of the Turkic upper or
3660      * title-cased sequence, and the entire sequence will be stored in *ustrp.
3661      * ustrp will contain *lenp bytes
3662      *
3663      * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER
3664      * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER
3665      * DOTLESS I */
3666
3667     PERL_ARGS_ASSERT_TURKIC_UC;
3668     assert(e > p);
3669
3670     if (*p == 'i') {
3671         *lenp = 2;
3672         ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
3673         ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
3674         return LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
3675     }
3676
3677     if (memBEGINs(p, e - p, LATIN_SMALL_LETTER_DOTLESS_I_UTF8)) {
3678         *lenp = 1;
3679         *ustrp = 'I';
3680         return 'I';
3681     }
3682
3683     return 0;
3684 }
3685
3686 /* The process for changing the case is essentially the same for the four case
3687  * change types, except there are complications for folding.  Otherwise the
3688  * difference is only which case to change to.  To make sure that they all do
3689  * the same thing, the bodies of the functions are extracted out into the
3690  * following two macros.  The functions are written with the same variable
3691  * names, and these are known and used inside these macros.  It would be
3692  * better, of course, to have inline functions to do it, but since different
3693  * macros are called, depending on which case is being changed to, this is not
3694  * feasible in C (to khw's knowledge).  Two macros are created so that the fold
3695  * function can start with the common start macro, then finish with its special
3696  * handling; while the other three cases can just use the common end macro.
3697  *
3698  * The algorithm is to use the proper (passed in) macro or function to change
3699  * the case for code points that are below 256.  The macro is used if using
3700  * locale rules for the case change; the function if not.  If the code point is
3701  * above 255, it is computed from the input UTF-8, and another macro is called
3702  * to do the conversion.  If necessary, the output is converted to UTF-8.  If
3703  * using a locale, we have to check that the change did not cross the 255/256
3704  * boundary, see check_locale_boundary_crossing() for further details.
3705  *
3706  * The macros are split with the correct case change for the below-256 case
3707  * stored into 'result', and in the middle of an else clause for the above-255
3708  * case.  At that point in the 'else', 'result' is not the final result, but is
3709  * the input code point calculated from the UTF-8.  The fold code needs to
3710  * realize all this and take it from there.
3711  *
3712  * To deal with Turkic locales, the function specified by the parameter
3713  * 'turkic' is called when appropriate.
3714  *
3715  * If you read the two macros as sequential, it's easier to understand what's
3716  * going on. */
3717 #define CASE_CHANGE_BODY_START(locale_flags, libc_change_function, L1_func,  \
3718                                L1_func_extra_param, turkic)                  \
3719                                                                              \
3720     if (flags & (locale_flags)) {                                            \
3721         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;                                  \
3722         if (IN_UTF8_CTYPE_LOCALE) {                                          \
3723             if (UNLIKELY(PL_in_utf8_turkic_locale)) {                        \
3724                 UV ret = turkic(p, e, ustrp, lenp);                          \
3725                 if (ret) return ret;                                         \
3726             }                                                                \
3727                                                                              \
3728             /* Otherwise, treat a UTF-8 locale as not being in locale at     \
3729              * all */                                                        \
3730             flags &= ~(locale_flags);                                        \
3731         }                                                                    \
3732     }                                                                        \
3733                                                                              \
3734     if (UTF8_IS_INVARIANT(*p)) {                                             \
3735         if (flags & (locale_flags)) {                                        \
3736             result = libc_change_function(*p);                               \
3737         }                                                                    \
3738         else {                                                               \
3739             return L1_func(*p, ustrp, lenp, L1_func_extra_param);            \
3740         }                                                                    \
3741     }                                                                        \
3742     else if UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e) {                          \
3743         U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));                         \
3744         if (flags & (locale_flags)) {                                        \
3745             result = libc_change_function(c);                                \
3746         }                                                                    \
3747         else {                                                               \
3748             return L1_func(c, ustrp, lenp,  L1_func_extra_param);            \
3749         }                                                                    \
3750     }                                                                        \
3751     else {  /* malformed UTF-8 or ord above 255 */                           \
3752         STRLEN len_result;                                                   \
3753         result = utf8n_to_uvchr(p, e - p, &len_result, UTF8_CHECK_ONLY);     \
3754         if (len_result == (STRLEN) -1) {                                     \
3755             _force_out_malformed_utf8_message(p, e, 0, 1 /* Die */ );        \
3756         }
3757
3758 #define CASE_CHANGE_BODY_END(locale_flags, change_macro)                     \
3759         result = change_macro(result, p, ustrp, lenp);                       \
3760                                                                              \
3761         if (flags & (locale_flags)) {                                        \
3762             result = check_locale_boundary_crossing(p, result, ustrp, lenp); \
3763         }                                                                    \
3764         return result;                                                       \
3765     }                                                                        \
3766                                                                              \
3767     /* Here, used locale rules.  Convert back to UTF-8 */                    \
3768     if (UTF8_IS_INVARIANT(result)) {                                         \
3769         *ustrp = (U8) result;                                                \
3770         *lenp = 1;                                                           \
3771     }                                                                        \
3772     else {                                                                   \
3773         *ustrp = UTF8_EIGHT_BIT_HI((U8) result);                             \
3774         *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);                       \
3775         *lenp = 2;                                                           \
3776     }                                                                        \
3777                                                                              \
3778     return result;
3779
3780 /* Not currently externally documented, and subject to change:
3781  * <flags> is set iff the rules from the current underlying locale are to
3782  *         be used. */
3783
3784 UV
3785 Perl__to_utf8_upper_flags(pTHX_ const U8 *p,
3786                                 const U8 *e,
3787                                 U8* ustrp,
3788                                 STRLEN *lenp,
3789                                 bool flags)
3790 {
3791     UV result;
3792
3793     PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
3794
3795     /* ~0 makes anything non-zero in 'flags' mean we are using locale rules */
3796     /* 2nd char of uc(U+DF) is 'S' */
3797     CASE_CHANGE_BODY_START(~0, toupper, _to_upper_title_latin1, 'S',
3798                                                                     turkic_uc);
3799     CASE_CHANGE_BODY_END  (~0, CALL_UPPER_CASE);
3800 }
3801
3802 /* Not currently externally documented, and subject to change:
3803  * <flags> is set iff the rules from the current underlying locale are to be
3804  *         used.  Since titlecase is not defined in POSIX, for other than a
3805  *         UTF-8 locale, uppercase is used instead for code points < 256.
3806  */
3807
3808 UV
3809 Perl__to_utf8_title_flags(pTHX_ const U8 *p,
3810                                 const U8 *e,
3811                                 U8* ustrp,
3812                                 STRLEN *lenp,
3813                                 bool flags)
3814 {
3815     UV result;
3816
3817     PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
3818
3819     /* 2nd char of ucfirst(U+DF) is 's' */
3820     CASE_CHANGE_BODY_START(~0, toupper, _to_upper_title_latin1, 's',
3821                                                                     turkic_uc);
3822     CASE_CHANGE_BODY_END  (~0, CALL_TITLE_CASE);
3823 }
3824
3825 /* Not currently externally documented, and subject to change:
3826  * <flags> is set iff the rules from the current underlying locale are to
3827  *         be used.
3828  */
3829
3830 UV
3831 Perl__to_utf8_lower_flags(pTHX_ const U8 *p,
3832                                 const U8 *e,
3833                                 U8* ustrp,
3834                                 STRLEN *lenp,
3835                                 bool flags)
3836 {
3837     UV result;
3838
3839     PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
3840
3841     CASE_CHANGE_BODY_START(~0, tolower, to_lower_latin1, 0 /* 0 is dummy */,
3842                                                                     turkic_lc);
3843     CASE_CHANGE_BODY_END  (~0, CALL_LOWER_CASE)
3844 }
3845
3846 /* Not currently externally documented, and subject to change,
3847  * in <flags>
3848  *      bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
3849  *                            locale are to be used.
3850  *      bit FOLD_FLAGS_FULL   is set iff full case folds are to be used;
3851  *                            otherwise simple folds
3852  *      bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
3853  *                            prohibited
3854  */
3855
3856 UV
3857 Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
3858                                const U8 *e,
3859                                U8* ustrp,
3860                                STRLEN *lenp,
3861                                U8 flags)
3862 {
3863     UV result;
3864
3865     PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
3866
3867     /* These are mutually exclusive */
3868     assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
3869
3870     assert(p != ustrp); /* Otherwise overwrites */
3871
3872     CASE_CHANGE_BODY_START(FOLD_FLAGS_LOCALE, tolower, _to_fold_latin1,
3873                  ((flags) & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)),
3874                                                                     turkic_fc);
3875
3876         result = CALL_FOLD_CASE(result, p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
3877
3878         if (flags & FOLD_FLAGS_LOCALE) {
3879
3880 #           define LONG_S_T      LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
3881 #         ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
3882 #           define CAP_SHARP_S   LATIN_CAPITAL_LETTER_SHARP_S_UTF8
3883
3884             /* Special case these two characters, as what normally gets
3885              * returned under locale doesn't work */
3886             if (memBEGINs((char *) p, e - p, CAP_SHARP_S))
3887             {
3888                 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3889                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3890                               "Can't do fc(\"\\x{1E9E}\") on non-UTF-8 locale; "
3891                               "resolved to \"\\x{17F}\\x{17F}\".");
3892                 goto return_long_s;
3893             }
3894             else
3895 #endif
3896                  if (memBEGINs((char *) p, e - p, LONG_S_T))
3897             {
3898                 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3899                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3900                               "Can't do fc(\"\\x{FB05}\") on non-UTF-8 locale; "
3901                               "resolved to \"\\x{FB06}\".");
3902                 goto return_ligature_st;
3903             }
3904
3905 #if    UNICODE_MAJOR_VERSION   == 3         \
3906     && UNICODE_DOT_VERSION     == 0         \
3907     && UNICODE_DOT_DOT_VERSION == 1
3908 #           define DOTTED_I   LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8
3909
3910             /* And special case this on this Unicode version only, for the same
3911              * reaons the other two are special cased.  They would cross the
3912              * 255/256 boundary which is forbidden under /l, and so the code
3913              * wouldn't catch that they are equivalent (which they are only in
3914              * this release) */
3915             else if (memBEGINs((char *) p, e - p, DOTTED_I)) {
3916                 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3917                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3918                               "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
3919                               "resolved to \"\\x{0131}\".");
3920                 goto return_dotless_i;
3921             }
3922 #endif
3923
3924             return check_locale_boundary_crossing(p, result, ustrp, lenp);
3925         }
3926         else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
3927             return result;
3928         }
3929         else {
3930             /* This is called when changing the case of a UTF-8-encoded
3931              * character above the ASCII range, and the result should not
3932              * contain an ASCII character. */
3933
3934             UV original;    /* To store the first code point of <p> */
3935
3936             /* Look at every character in the result; if any cross the
3937             * boundary, the whole thing is disallowed */
3938             U8* s = ustrp;
3939             U8* send = ustrp + *lenp;
3940             while (s < send) {
3941                 if (isASCII(*s)) {
3942                     /* Crossed, have to return the original */
3943                     original = valid_utf8_to_uvchr(p, lenp);
3944
3945                     /* But in these instances, there is an alternative we can
3946                      * return that is valid */
3947                     if (original == LATIN_SMALL_LETTER_SHARP_S
3948 #ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
3949                         || original == LATIN_CAPITAL_LETTER_SHARP_S
3950 #endif
3951                     ) {
3952                         goto return_long_s;
3953                     }
3954                     else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
3955                         goto return_ligature_st;
3956                     }
3957 #if    UNICODE_MAJOR_VERSION   == 3         \
3958     && UNICODE_DOT_VERSION     == 0         \
3959     && UNICODE_DOT_DOT_VERSION == 1
3960
3961                     else if (original == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
3962                         goto return_dotless_i;
3963                     }
3964 #endif
3965                     Copy(p, ustrp, *lenp, char);
3966                     return original;
3967                 }
3968                 s += UTF8SKIP(s);
3969             }
3970
3971             /* Here, no characters crossed, result is ok as-is */
3972             return result;
3973         }
3974     }
3975
3976     /* Here, used locale rules.  Convert back to UTF-8 */
3977     if (UTF8_IS_INVARIANT(result)) {
3978         *ustrp = (U8) result;
3979         *lenp = 1;
3980     }
3981     else {
3982         *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
3983         *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
3984         *lenp = 2;
3985     }
3986
3987     return result;
3988
3989   return_long_s:
3990     /* Certain folds to 'ss' are prohibited by the options, but they do allow
3991      * folds to a string of two of these characters.  By returning this
3992      * instead, then, e.g.,
3993      *      fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
3994      * works. */
3995
3996     *lenp = 2 * STRLENs(LATIN_SMALL_LETTER_LONG_S_UTF8);
3997     Copy(LATIN_SMALL_LETTER_LONG_S_UTF8   LATIN_SMALL_LETTER_LONG_S_UTF8,
3998         ustrp, *lenp, U8);
3999     return LATIN_SMALL_LETTER_LONG_S;
4000
4001   return_ligature_st:
4002     /* Two folds to 'st' are prohibited by the options; instead we pick one and
4003      * have the other one fold to it */
4004
4005     *lenp = STRLENs(LATIN_SMALL_LIGATURE_ST_UTF8);
4006     Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
4007     return LATIN_SMALL_LIGATURE_ST;
4008
4009 #if    UNICODE_MAJOR_VERSION   == 3         \
4010     && UNICODE_DOT_VERSION     == 0         \
4011     && UNICODE_DOT_DOT_VERSION == 1
4012
4013   return_dotless_i:
4014     *lenp = STRLENs(LATIN_SMALL_LETTER_DOTLESS_I_UTF8);
4015     Copy(LATIN_SMALL_LETTER_DOTLESS_I_UTF8, ustrp, *lenp, U8);
4016     return LATIN_SMALL_LETTER_DOTLESS_I;
4017
4018 #endif
4019
4020 }
4021
4022 bool
4023 Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
4024 {
4025     /* May change: warns if surrogates, non-character code points, or
4026      * non-Unicode code points are in 's' which has length 'len' bytes.
4027      * Returns TRUE if none found; FALSE otherwise.  The only other validity
4028      * check is to make sure that this won't exceed the string's length nor
4029      * overflow */
4030
4031     const U8* const e = s + len;
4032     bool ok = TRUE;
4033
4034     PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
4035
4036     while (s < e) {
4037         if (UTF8SKIP(s) > len) {
4038             Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
4039                            "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
4040             return FALSE;
4041         }
4042         if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) {
4043             if (UNLIKELY(UTF8_IS_SUPER(s, e))) {
4044                 if (   ckWARN_d(WARN_NON_UNICODE)
4045                     || UNLIKELY(0 < does_utf8_overflow(s, s + len,
4046                                                0 /* Don't consider overlongs */
4047                                                )))
4048                 {
4049                     /* A side effect of this function will be to warn */
4050                     (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_SUPER);
4051                     ok = FALSE;
4052                 }
4053             }
4054             else if (UNLIKELY(UTF8_IS_SURROGATE(s, e))) {
4055                 if (ckWARN_d(WARN_SURROGATE)) {
4056                     /* This has a different warning than the one the called
4057                      * function would output, so can't just call it, unlike we
4058                      * do for the non-chars and above-unicodes */
4059                     UV uv = utf8_to_uvchr_buf(s, e, NULL);
4060                     Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
4061                         "Unicode surrogate U+%04" UVXf " is illegal in UTF-8",
4062                                              uv);
4063                     ok = FALSE;
4064                 }
4065             }
4066             else if (   UNLIKELY(UTF8_IS_NONCHAR(s, e))
4067                      && (ckWARN_d(WARN_NONCHAR)))
4068             {
4069                 /* A side effect of this function will be to warn */
4070                 (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_NONCHAR);
4071                 ok = FALSE;
4072             }
4073         }
4074         s += UTF8SKIP(s);
4075     }
4076
4077     return ok;
4078 }
4079
4080 /*
4081 =for apidoc pv_uni_display
4082
4083 Build to the scalar C<dsv> a displayable version of the UTF-8 encoded string
4084 C<spv>, length C<len>, the displayable version being at most C<pvlim> bytes
4085 long (if longer, the rest is truncated and C<"..."> will be appended).
4086
4087 The C<flags> argument can have C<UNI_DISPLAY_ISPRINT> set to display
4088 C<isPRINT()>able characters as themselves, C<UNI_DISPLAY_BACKSLASH>
4089 to display the C<\\[nrfta\\]> as the backslashed versions (like C<"\n">)
4090 (C<UNI_DISPLAY_BACKSLASH> is preferred over C<UNI_DISPLAY_ISPRINT> for C<"\\">).
4091 C<UNI_DISPLAY_QQ> (and its alias C<UNI_DISPLAY_REGEX>) have both
4092 C<UNI_DISPLAY_BACKSLASH> and C<UNI_DISPLAY_ISPRINT> turned on.
4093
4094 Additionally, there is now C<UNI_DISPLAY_BACKSPACE> which allows C<\b> for a
4095 backspace, but only when C<UNI_DISPLAY_BACKSLASH> also is set.
4096
4097 The pointer to the PV of the C<dsv> is returned.
4098
4099 See also L</sv_uni_display>.
4100
4101 =for apidoc Amnh||UNI_DISPLAY_BACKSLASH
4102 =for apidoc Amnh||UNI_DISPLAY_BACKSPACE
4103 =for apidoc Amnh||UNI_DISPLAY_ISPRINT
4104 =for apidoc Amnh||UNI_DISPLAY_QQ
4105 =for apidoc Amnh||UNI_DISPLAY_REGEX
4106 =cut
4107 */
4108 char *
4109 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim,
4110                           UV flags)
4111 {
4112     int truncated = 0;
4113     const char *s, *e;
4114
4115     PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
4116
4117     SvPVCLEAR(dsv);
4118     SvUTF8_off(dsv);
4119     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
4120          UV u;
4121          bool ok = 0;
4122
4123          if (pvlim && SvCUR(dsv) >= pvlim) {
4124               truncated++;
4125               break;
4126          }
4127          u = utf8_to_uvchr_buf((U8*)s, (U8*)e, 0);
4128          if (u < 256) {
4129              const U8 c = (U8) u;
4130              if (flags & UNI_DISPLAY_BACKSLASH) {
4131                  if (    isMNEMONIC_CNTRL(c)
4132                      && (   c != '\b'
4133                          || (flags & UNI_DISPLAY_BACKSPACE)))
4134                  {
4135                     const char * mnemonic = cntrl_to_mnemonic(c);
4136                     sv_catpvn(dsv, mnemonic, strlen(mnemonic));
4137                     ok = 1;
4138                  }
4139                  else if (c == '\\') {
4140                     sv_catpvs(dsv, "\\\\");
4141                     ok = 1;
4142                  }
4143              }
4144              /* isPRINT() is the locale-blind version. */
4145              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
4146                  const char string = c;
4147                  sv_catpvn(dsv, &string, 1);
4148                  ok = 1;
4149              }
4150          }
4151          if (!ok)
4152              Perl_sv_catpvf(aTHX_ dsv, "\\x{%" UVxf "}", u);
4153     }
4154     if (truncated)
4155          sv_catpvs(dsv, "...");
4156
4157     return SvPVX(dsv);
4158 }
4159
4160 /*
4161 =for apidoc sv_uni_display
4162
4163 Build to the scalar C<dsv> a displayable version of the scalar C<sv>,
4164 the displayable version being at most C<pvlim> bytes long
4165 (if longer, the rest is truncated and "..." will be appended).
4166
4167 The C<flags> argument is as in L</pv_uni_display>().
4168
4169 The pointer to the PV of the C<dsv> is returned.
4170
4171 =cut
4172 */
4173 char *
4174 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
4175 {
4176     const char * const ptr =
4177         isREGEXP(ssv) ? RX_WRAPPED((REGEXP*)ssv) : SvPVX_const(ssv);
4178
4179     PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
4180
4181     return Perl_pv_uni_display(aTHX_ dsv, (const U8*)ptr,
4182                                 SvCUR(ssv), pvlim, flags);
4183 }
4184
4185 /*
4186 =for apidoc foldEQ_utf8
4187
4188 Returns true if the leading portions of the strings C<s1> and C<s2> (either or
4189 both of which may be in UTF-8) are the same case-insensitively; false
4190 otherwise.  How far into the strings to compare is determined by other input
4191 parameters.
4192
4193 If C<u1> is true, the string C<s1> is assumed to be in UTF-8-encoded Unicode;
4194 otherwise it is assumed to be in native 8-bit encoding.  Correspondingly for
4195 C<u2> with respect to C<s2>.
4196
4197 If the byte length C<l1> is non-zero, it says how far into C<s1> to check for
4198 fold equality.  In other words, C<s1>+C<l1> will be used as a goal to reach.
4199 The scan will not be considered to be a match unless the goal is reached, and
4200 scanning won't continue past that goal.  Correspondingly for C<l2> with respect
4201 to C<s2>.
4202
4203 If C<pe1> is non-C<NULL> and the pointer it points to is not C<NULL>, that
4204 pointer is considered an end pointer to the position 1 byte past the maximum
4205 point in C<s1> beyond which scanning will not continue under any circumstances.
4206 (This routine assumes that UTF-8 encoded input strings are not malformed;
4207 malformed input can cause it to read past C<pe1>).  This means that if both
4208 C<l1> and C<pe1> are specified, and C<pe1> is less than C<s1>+C<l1>, the match
4209 will never be successful because it can never
4210 get as far as its goal (and in fact is asserted against).  Correspondingly for
4211 C<pe2> with respect to C<s2>.
4212
4213 At least one of C<s1> and C<s2> must have a goal (at least one of C<l1> and
4214 C<l2> must be non-zero), and if both do, both have to be
4215 reached for a successful match.   Also, if the fold of a character is multiple
4216 characters, all of them must be matched (see tr21 reference below for
4217 'folding').
4218
4219 Upon a successful match, if C<pe1> is non-C<NULL>,
4220 it will be set to point to the beginning of the I<next> character of C<s1>
4221 beyond what was matched.  Correspondingly for C<pe2> and C<s2>.
4222
4223 For case-insensitiveness, the "casefolding" of Unicode is used
4224 instead of upper/lowercasing both the characters, see
4225 L<https://www.unicode.org/reports/tr21/> (Case Mappings).
4226
4227 =for apidoc Cmnh||FOLDEQ_UTF8_NOMIX_ASCII
4228 =for apidoc Cmnh||FOLDEQ_LOCALE
4229 =for apidoc Cmnh||FOLDEQ_S1_ALREADY_FOLDED
4230 =for apidoc Cmnh||FOLDEQ_S1_FOLDS_SANE
4231 =for apidoc Cmnh||FOLDEQ_S2_ALREADY_FOLDED
4232 =for apidoc Cmnh||FOLDEQ_S2_FOLDS_SANE
4233
4234 =cut */
4235
4236 /* A flags parameter has been added which may change, and hence isn't
4237  * externally documented.  Currently it is:
4238  *  0 for as-documented above
4239  *  FOLDEQ_UTF8_NOMIX_ASCII meaning that if a non-ASCII character folds to an
4240                             ASCII one, to not match
4241  *  FOLDEQ_LOCALE           is set iff the rules from the current underlying
4242  *                          locale are to be used.
4243  *  FOLDEQ_S1_ALREADY_FOLDED  s1 has already been folded before calling this
4244  *                          routine.  This allows that step to be skipped.
4245  *                          Currently, this requires s1 to be encoded as UTF-8
4246  *                          (u1 must be true), which is asserted for.
4247  *  FOLDEQ_S1_FOLDS_SANE    With either NOMIX_ASCII or LOCALE, no folds may
4248  *                          cross certain boundaries.  Hence, the caller should
4249  *                          let this function do the folding instead of
4250  *                          pre-folding.  This code contains an assertion to
4251  *                          that effect.  However, if the caller knows what
4252  *                          it's doing, it can pass this flag to indicate that,
4253  *                          and the assertion is skipped.
4254  *  FOLDEQ_S2_ALREADY_FOLDED  Similar to FOLDEQ_S1_ALREADY_FOLDED, but applies
4255  *                          to s2, and s2 doesn't have to be UTF-8 encoded.
4256  *                          This introduces an asymmetry to save a few branches
4257  *                          in a loop.  Currently, this is not a problem, as
4258  *                          never are both inputs pre-folded.  Simply call this
4259  *                          function with the pre-folded one as the second
4260  *                          string.
4261  *  FOLDEQ_S2_FOLDS_SANE
4262  */
4263
4264 I32
4265 Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1,
4266                              const char *s2, char **pe2, UV l2, bool u2,
4267                              U32 flags)
4268 {
4269     const U8 *p1  = (const U8*)s1; /* Point to current char */
4270     const U8 *p2  = (const U8*)s2;
4271     const U8 *g1 = NULL;       /* goal for s1 */
4272     const U8 *g2 = NULL;
4273     const U8 *e1 = NULL;       /* Don't scan s1 past this */
4274     U8 *f1 = NULL;             /* Point to current folded */
4275     const U8 *e2 = NULL;
4276     U8 *f2 = NULL;
4277     STRLEN n1 = 0, n2 = 0;              /* Number of bytes in current char */
4278     U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
4279     U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
4280     U8 flags_for_folder = FOLD_FLAGS_FULL;
4281
4282     PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
4283
4284     assert( ! (             (flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE))
4285                && ((        (flags &  FOLDEQ_S1_ALREADY_FOLDED)
4286                         && !(flags &  FOLDEQ_S1_FOLDS_SANE))
4287                     || (    (flags &  FOLDEQ_S2_ALREADY_FOLDED)
4288                         && !(flags &  FOLDEQ_S2_FOLDS_SANE)))));
4289     /* The algorithm is to trial the folds without regard to the flags on
4290      * the first line of the above assert(), and then see if the result
4291      * violates them.  This means that the inputs can't be pre-folded to a
4292      * violating result, hence the assert.  This could be changed, with the
4293      * addition of extra tests here for the already-folded case, which would
4294      * slow it down.  That cost is more than any possible gain for when these
4295      * flags are specified, as the flags indicate /il or /iaa matching which
4296      * is less common than /iu, and I (khw) also believe that real-world /il
4297      * and /iaa matches are most likely to involve code points 0-255, and this
4298      * function only under rare conditions gets called for 0-255. */
4299
4300     if (flags & FOLDEQ_LOCALE) {
4301         if (IN_UTF8_CTYPE_LOCALE) {
4302             if (UNLIKELY(PL_in_utf8_turkic_locale)) {
4303                 flags_for_folder |= FOLD_FLAGS_LOCALE;
4304             }
4305             else {
4306                 flags &= ~FOLDEQ_LOCALE;
4307             }
4308         }
4309         else {
4310             flags_for_folder |= FOLD_FLAGS_LOCALE;
4311         }
4312     }
4313     if (flags & FOLDEQ_UTF8_NOMIX_ASCII) {
4314         flags_for_folder |= FOLD_FLAGS_NOMIX_ASCII;
4315     }
4316
4317     if (pe1) {
4318         e1 = *(U8**)pe1;
4319     }
4320
4321     if (l1) {
4322         g1 = (const U8*)s1 + l1;
4323     }
4324
4325     if (pe2) {
4326         e2 = *(U8**)pe2;
4327     }
4328
4329     if (l2) {
4330         g2 = (const U8*)s2 + l2;
4331     }
4332
4333     /* Must have at least one goal */
4334     assert(g1 || g2);
4335
4336     if (g1) {
4337
4338         /* Will never match if goal is out-of-bounds */
4339         assert(! e1  || e1 >= g1);
4340
4341         /* Here, there isn't an end pointer, or it is beyond the goal.  We
4342         * only go as far as the goal */
4343         e1 = g1;
4344     }
4345     else {
4346         assert(e1);    /* Must have an end for looking at s1 */
4347     }
4348
4349     /* Same for goal for s2 */
4350     if (g2) {
4351         assert(! e2  || e2 >= g2);
4352         e2 = g2;
4353     }
4354     else {
4355         assert(e2);
4356     }
4357
4358     /* If both operands are already folded, we could just do a memEQ on the
4359      * whole strings at once, but it would be better if the caller realized
4360      * this and didn't even call us */
4361
4362     /* Look through both strings, a character at a time */
4363     while (p1 < e1 && p2 < e2) {
4364
4365         /* If at the beginning of a new character in s1, get its fold to use
4366          * and the length of the fold. */
4367         if (n1 == 0) {
4368             if (flags & FOLDEQ_S1_ALREADY_FOLDED) {
4369                 f1 = (U8 *) p1;
4370                 assert(u1);
4371                 n1 = UTF8SKIP(f1);
4372             }
4373             else {
4374                 if (isASCII(*p1) && ! (flags & FOLDEQ_LOCALE)) {
4375
4376                     /* We have to forbid mixing ASCII with non-ASCII if the
4377                      * flags so indicate.  And, we can short circuit having to
4378                      * call the general functions for this common ASCII case,
4379                      * all of whose non-locale folds are also ASCII, and hence
4380                      * UTF-8 invariants, so the UTF8ness of the strings is not
4381                      * relevant. */
4382                     if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) {
4383                         return 0;
4384                     }
4385                     n1 = 1;
4386                     *foldbuf1 = toFOLD(*p1);
4387                 }
4388                 else if (u1) {
4389                     _toFOLD_utf8_flags(p1, e1, foldbuf1, &n1, flags_for_folder);
4390                 }
4391                 else {  /* Not UTF-8, get UTF-8 fold */
4392                     _to_uni_fold_flags(*p1, foldbuf1, &n1, flags_for_folder);
4393                 }
4394                 f1 = foldbuf1;
4395             }
4396         }
4397
4398         if (n2 == 0) {    /* Same for s2 */
4399             if (flags & FOLDEQ_S2_ALREADY_FOLDED) {
4400
4401                 /* Point to the already-folded character.  But for non-UTF-8
4402                  * variants, convert to UTF-8 for the algorithm below */
4403                 if (UTF8_IS_INVARIANT(*p2)) {
4404                     f2 = (U8 *) p2;
4405                     n2 = 1;
4406                 }
4407                 else if (u2) {
4408                     f2 = (U8 *) p2;
4409                     n2 = UTF8SKIP(f2);
4410                 }
4411                 else {
4412                     foldbuf2[0] = UTF8_EIGHT_BIT_HI(*p2);
4413                     foldbuf2[1] = UTF8_EIGHT_BIT_LO(*p2);
4414                     f2 = foldbuf2;
4415                     n2 = 2;
4416                 }
4417             }
4418             else {
4419                 if (isASCII(*p2) && ! (flags & FOLDEQ_LOCALE)) {
4420                     if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p1)) {
4421                         return 0;
4422                     }
4423                     n2 = 1;
4424                     *foldbuf2 = toFOLD(*p2);
4425                 }
4426                 else if (u2) {
4427                     _toFOLD_utf8_flags(p2, e2, foldbuf2, &n2, flags_for_folder);
4428                 }
4429                 else {
4430                     _to_uni_fold_flags(*p2, foldbuf2, &n2, flags_for_folder);
4431                 }
4432                 f2 = foldbuf2;
4433             }
4434         }
4435
4436         /* Here f1 and f2 point to the beginning of the strings to compare.
4437          * These strings are the folds of the next character from each input
4438          * string, stored in UTF-8. */
4439
4440         /* While there is more to look for in both folds, see if they
4441         * continue to match */
4442         while (n1 && n2) {
4443             U8 fold_length = UTF8SKIP(f1);
4444             if (fold_length != UTF8SKIP(f2)
4445                 || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE
4446                                                        function call for single
4447                                                        byte */
4448                 || memNE((char*)f1, (char*)f2, fold_length))
4449             {
4450                 return 0; /* mismatch */
4451             }
4452
4453             /* Here, they matched, advance past them */
4454             n1 -= fold_length;
4455             f1 += fold_length;
4456             n2 -= fold_length;
4457             f2 += fold_length;
4458         }
4459
4460         /* When reach the end of any fold, advance the input past it */
4461         if (n1 == 0) {
4462             p1 += u1 ? UTF8SKIP(p1) : 1;
4463         }
4464         if (n2 == 0) {
4465             p2 += u2 ? UTF8SKIP(p2) : 1;
4466         }
4467     } /* End of loop through both strings */
4468
4469     /* A match is defined by each scan that specified an explicit length
4470     * reaching its final goal, and the other not having matched a partial
4471     * character (which can happen when the fold of a character is more than one
4472     * character). */
4473     if (! ((g1 == 0 || p1 == g1) && (g2 == 0 || p2 == g2)) || n1 || n2) {
4474         return 0;
4475     }
4476
4477     /* Successful match.  Set output pointers */
4478     if (pe1) {
4479         *pe1 = (char*)p1;
4480     }
4481     if (pe2) {
4482         *pe2 = (char*)p2;
4483     }
4484     return 1;
4485 }
4486
4487 /*
4488  * ex: set ts=8 sts=4 sw=4 et:
4489  */