utf8.c

   1 /*    utf8.c
   2  *
   3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
   4  *    by Larry Wall and others
   5  *
   6  *    You may distribute under the terms of either the GNU General Public
   7  *    License or the Artistic License, as specified in the README file.
   8  *
   9  */
  10
  11 /*
  12  * 'What a fix!' said Sam.  'That's the one place in all the lands we've ever
  13  *  heard of that we don't want to see any closer; and that's the one place
  14  *  we're trying to get to!  And that's just where we can't get, nohow.'
  15  *
  16  *     [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
  17  *
  18  * 'Well do I understand your speech,' he answered in the same language;
  19  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
  20  *  as is the custom in the West, if you wish to be answered?'
  21  *                           --Gandalf, addressing Théoden's door wardens
  22  *
  23  *     [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
  24  *
  25  * ...the travellers perceived that the floor was paved with stones of many
  26  * hues; branching runes and strange devices intertwined beneath their feet.
  27  *
  28  *     [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
  29  */
  30
  31 #include "EXTERN.h"
  32 #define PERL_IN_UTF8_C
  33 #include "perl.h"
  34 #include "invlist_inline.h"
  35
  36 static const char malformed_text[] = "Malformed UTF-8 character";
  37 static const char unees[] =
  38                         "Malformed UTF-8 character (unexpected end of string)";
  39
  40 /* Be sure to synchronize this message with the similar one in regcomp.c */
  41 static const char cp_above_legal_max[] =
  42                         "Use of code point 0x%" UVXf " is not allowed; the"
  43                         " permissible max is 0x%" UVXf;
  44
  45 /*
  46 =head1 Unicode Support
  47 These are various utility functions for manipulating UTF8-encoded
  48 strings.  For the uninitiated, this is a method of representing arbitrary
  49 Unicode characters as a variable number of bytes, in such a way that
  50 characters in the ASCII range are unmodified, and a zero byte never appears
  51 within non-zero characters.
  52
  53 =cut
  54 */
  55
  56 /* helper for Perl__force_out_malformed_utf8_message(). Like
  57  * SAVECOMPILEWARNINGS(), but works with PL_curcop rather than
  58  * PL_compiling */
  59
  60 static void
  61 S_restore_cop_warnings(pTHX_ void *p)
  62 {
  63     if (!specialWARN(PL_curcop->cop_warnings))
  64         PerlMemShared_free(PL_curcop->cop_warnings);
  65     PL_curcop->cop_warnings = (STRLEN*)p;
  66 }
  67
  68
  69 void
  70 Perl__force_out_malformed_utf8_message(pTHX_
  71             const U8 *const p,      /* First byte in UTF-8 sequence */
  72             const U8 * const e,     /* Final byte in sequence (may include
  73                                        multiple chars */
  74             const U32 flags,        /* Flags to pass to utf8n_to_uvchr(),
  75                                        usually 0, or some DISALLOW flags */
  76             const bool die_here)    /* If TRUE, this function does not return */
  77 {
  78     /* This core-only function is to be called when a malformed UTF-8 character
  79      * is found, in order to output the detailed information about the
  80      * malformation before dieing.  The reason it exists is for the occasions
  81      * when such a malformation is fatal, but warnings might be turned off, so
  82      * that normally they would not be actually output.  This ensures that they
  83      * do get output.  Because a sequence may be malformed in more than one
  84      * way, multiple messages may be generated, so we can't make them fatal, as
  85      * that would cause the first one to die.
  86      *
  87      * Instead we pretend -W was passed to perl, then die afterwards.  The
  88      * flexibility is here to return to the caller so they can finish up and
  89      * die themselves */
  90     U32 errors;
  91
  92     PERL_ARGS_ASSERT__FORCE_OUT_MALFORMED_UTF8_MESSAGE;
  93
  94     ENTER;
  95     SAVEI8(PL_dowarn);
  96     SAVESPTR(PL_curcop);
  97
  98     PL_dowarn = G_WARN_ALL_ON|G_WARN_ON;
  99     if (PL_curcop) {
 100         /* this is like SAVECOMPILEWARNINGS() except with PL_curcop rather
 101          * than PL_compiling */
 102         SAVEDESTRUCTOR_X(S_restore_cop_warnings,
 103                 (void*)PL_curcop->cop_warnings);
 104         PL_curcop->cop_warnings = pWARN_ALL;
 105     }
 106
 107     (void) utf8n_to_uvchr_error(p, e - p, NULL, flags & ~UTF8_CHECK_ONLY, &errors);
 108
 109     LEAVE;
 110
 111     if (! errors) {
 112         Perl_croak(aTHX_ "panic: _force_out_malformed_utf8_message should"
 113                          " be called only when there are errors found");
 114     }
 115
 116     if (die_here) {
 117         Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
 118     }
 119 }
 120
 121 STATIC HV *
 122 S_new_msg_hv(pTHX_ const char * const message, /* The message text */
 123                    U32 categories,  /* Packed warning categories */
 124                    U32 flag)        /* Flag associated with this message */
 125 {
 126     /* Creates, populates, and returns an HV* that describes an error message
 127      * for the translators between UTF8 and code point */
 128
 129     SV* msg_sv = newSVpv(message, 0);
 130     SV* category_sv = newSVuv(categories);
 131     SV* flag_bit_sv = newSVuv(flag);
 132
 133     HV* msg_hv = newHV();
 134
 135     PERL_ARGS_ASSERT_NEW_MSG_HV;
 136
 137     (void) hv_stores(msg_hv, "text", msg_sv);
 138     (void) hv_stores(msg_hv, "warn_categories",  category_sv);
 139     (void) hv_stores(msg_hv, "flag_bit", flag_bit_sv);
 140
 141     return msg_hv;
 142 }
 143
 144 /*
 145 =for apidoc uvoffuni_to_utf8_flags
 146
 147 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
 148 Instead, B<Almost all code should use L<perlapi/uvchr_to_utf8> or
 149 L<perlapi/uvchr_to_utf8_flags>>.
 150
 151 This function is like them, but the input is a strict Unicode
 152 (as opposed to native) code point.  Only in very rare circumstances should code
 153 not be using the native code point.
 154
 155 For details, see the description for L<perlapi/uvchr_to_utf8_flags>.
 156
 157 =cut
 158 */
 159
 160 U8 *
 161 Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
 162 {
 163     PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
 164
 165     return uvoffuni_to_utf8_flags_msgs(d, uv, flags, NULL);
 166 }
 167
 168 /* All these formats take a single UV code point argument */
 169 const char surrogate_cp_format[] = "UTF-16 surrogate U+%04" UVXf;
 170 const char nonchar_cp_format[]   = "Unicode non-character U+%04" UVXf
 171                                    " is not recommended for open interchange";
 172 const char super_cp_format[]     = "Code point 0x%" UVXf " is not Unicode,"
 173                                    " may not be portable";
 174 const char perl_extended_cp_format[] = "Code point 0x%" UVXf " is not"        \
 175                                        " Unicode, requires a Perl extension," \
 176                                        " and so is not portable";
 177
 178 #define HANDLE_UNICODE_SURROGATE(uv, flags, msgs)                   \
 179     STMT_START {                                                    \
 180         if (flags & UNICODE_WARN_SURROGATE) {                       \
 181             U32 category = packWARN(WARN_SURROGATE);                \
 182             const char * format = surrogate_cp_format;              \
 183             if (msgs) {                                             \
 184                 *msgs = new_msg_hv(Perl_form(aTHX_ format, uv),     \
 185                                    category,                        \
 186                                    UNICODE_GOT_SURROGATE);          \
 187             }                                                       \
 188             else {                                                  \
 189                 Perl_ck_warner_d(aTHX_ category, format, uv);       \
 190             }                                                       \
 191         }                                                           \
 192         if (flags & UNICODE_DISALLOW_SURROGATE) {                   \
 193             return NULL;                                            \
 194         }                                                           \
 195     } STMT_END;
 196
 197 #define HANDLE_UNICODE_NONCHAR(uv, flags, msgs)                     \
 198     STMT_START {                                                    \
 199         if (flags & UNICODE_WARN_NONCHAR) {                         \
 200             U32 category = packWARN(WARN_NONCHAR);                  \
 201             const char * format = nonchar_cp_format;                \
 202             if (msgs) {                                             \
 203                 *msgs = new_msg_hv(Perl_form(aTHX_ format, uv),     \
 204                                    category,                        \
 205                                    UNICODE_GOT_NONCHAR);            \
 206             }                                                       \
 207             else {                                                  \
 208                 Perl_ck_warner_d(aTHX_ category, format, uv);       \
 209             }                                                       \
 210         }                                                           \
 211         if (flags & UNICODE_DISALLOW_NONCHAR) {                     \
 212             return NULL;                                            \
 213         }                                                           \
 214     } STMT_END;
 215
 216 /*  Use shorter names internally in this file */
 217 #define SHIFT   UTF_ACCUMULATION_SHIFT
 218 #undef  MARK
 219 #define MARK    UTF_CONTINUATION_MARK
 220 #define MASK    UTF_CONTINUATION_MASK
 221
 222 /*
 223 =for apidoc uvchr_to_utf8_flags_msgs
 224
 225 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
 226
 227 Most code should use C<L</uvchr_to_utf8_flags>()> rather than call this directly.
 228
 229 This function is for code that wants any warning and/or error messages to be
 230 returned to the caller rather than be displayed.  All messages that would have
 231 been displayed if all lexical warnings are enabled will be returned.
 232
 233 It is just like C<L</uvchr_to_utf8_flags>> but it takes an extra parameter
 234 placed after all the others, C<msgs>.  If this parameter is 0, this function
 235 behaves identically to C<L</uvchr_to_utf8_flags>>.  Otherwise, C<msgs> should
 236 be a pointer to an C<HV *> variable, in which this function creates a new HV to
 237 contain any appropriate messages.  The hash has three key-value pairs, as
 238 follows:
 239
 240 =over 4
 241
 242 =item C<text>
 243
 244 The text of the message as a C<SVpv>.
 245
 246 =item C<warn_categories>
 247
 248 The warning category (or categories) packed into a C<SVuv>.
 249
 250 =item C<flag_bit>
 251
 252 A single flag bit associated with this message, in a C<SVuv>.
 253 The bit corresponds to some bit in the C<*errors> return value,
 254 such as C<UNICODE_GOT_SURROGATE>.
 255
 256 =back
 257
 258 It's important to note that specifying this parameter as non-null will cause
 259 any warnings this function would otherwise generate to be suppressed, and
 260 instead be placed in C<*msgs>.  The caller can check the lexical warnings state
 261 (or not) when choosing what to do with the returned messages.
 262
 263 The caller, of course, is responsible for freeing any returned HV.
 264
 265 =cut
 266 */
 267
 268 /* Undocumented; we don't want people using this.  Instead they should use
 269  * uvchr_to_utf8_flags_msgs() */
 270 U8 *
 271 Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV uv, const UV flags, HV** msgs)
 272 {
 273     PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS_MSGS;
 274
 275     if (msgs) {
 276         *msgs = NULL;
 277     }
 278
 279     if (OFFUNI_IS_INVARIANT(uv)) {
 280         *d++ = LATIN1_TO_NATIVE(uv);
 281         return d;
 282     }
 283
 284     if (uv <= MAX_UTF8_TWO_BYTE) {
 285         *d++ = I8_TO_NATIVE_UTF8(( uv >> SHIFT) | UTF_START_MARK(2));
 286         *d++ = I8_TO_NATIVE_UTF8(( uv           & MASK) |   MARK);
 287         return d;
 288     }
 289
 290     /* Not 2-byte; test for and handle 3-byte result.   In the test immediately
 291      * below, the 16 is for start bytes E0-EF (which are all the possible ones
 292      * for 3 byte characters).  The 2 is for 2 continuation bytes; these each
 293      * contribute SHIFT bits.  This yields 0x4000 on EBCDIC platforms, 0x1_0000
 294      * on ASCII; so 3 bytes covers the range 0x400-0x3FFF on EBCDIC;
 295      * 0x800-0xFFFF on ASCII */
 296     if (uv < (16 * (1U << (2 * SHIFT)))) {
 297         *d++ = I8_TO_NATIVE_UTF8(( uv >> ((3 - 1) * SHIFT)) | UTF_START_MARK(3));
 298         *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) |   MARK);
 299         *d++ = I8_TO_NATIVE_UTF8(( uv  /* (1 - 1) */        & MASK) |   MARK);
 300
 301 #ifndef EBCDIC  /* These problematic code points are 4 bytes on EBCDIC, so
 302                    aren't tested here */
 303         /* The most likely code points in this range are below the surrogates.
 304          * Do an extra test to quickly exclude those. */
 305         if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST)) {
 306             if (UNLIKELY(   UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)
 307                          || UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
 308             {
 309                 HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
 310             }
 311             else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
 312                 HANDLE_UNICODE_SURROGATE(uv, flags, msgs);
 313             }
 314         }
 315 #endif
 316         return d;
 317     }
 318
 319     /* Not 3-byte; that means the code point is at least 0x1_0000 on ASCII
 320      * platforms, and 0x4000 on EBCDIC.  There are problematic cases that can
 321      * happen starting with 4-byte characters on ASCII platforms.  We unify the
 322      * code for these with EBCDIC, even though some of them require 5-bytes on
 323      * those, because khw believes the code saving is worth the very slight
 324      * performance hit on these high EBCDIC code points. */
 325
 326     if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
 327         if (UNLIKELY(      uv > MAX_LEGAL_CP
 328                      && ! (flags & UNICODE_ALLOW_ABOVE_IV_MAX)))
 329         {
 330             Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_LEGAL_CP);
 331         }
 332         if (       (flags & UNICODE_WARN_SUPER)
 333             || (   (flags & UNICODE_WARN_PERL_EXTENDED)
 334                 && UNICODE_IS_PERL_EXTENDED(uv)))
 335         {
 336             const char * format = super_cp_format;
 337             U32 category = packWARN(WARN_NON_UNICODE);
 338             U32 flag = UNICODE_GOT_SUPER;
 339
 340             /* Choose the more dire applicable warning */
 341             if (UNICODE_IS_PERL_EXTENDED(uv)) {
 342                 format = perl_extended_cp_format;
 343                 if (flags & (UNICODE_WARN_PERL_EXTENDED
 344                             |UNICODE_DISALLOW_PERL_EXTENDED))
 345                 {
 346                     flag = UNICODE_GOT_PERL_EXTENDED;
 347                 }
 348             }
 349
 350             if (msgs) {
 351                 *msgs = new_msg_hv(Perl_form(aTHX_ format, uv),
 352                                    category, flag);
 353             }
 354             else {
 355                 Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE), format, uv);
 356             }
 357         }
 358         if (       (flags & UNICODE_DISALLOW_SUPER)
 359             || (   (flags & UNICODE_DISALLOW_PERL_EXTENDED)
 360                 &&  UNICODE_IS_PERL_EXTENDED(uv)))
 361         {
 362             return NULL;
 363         }
 364     }
 365     else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) {
 366         HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
 367     }
 368
 369     /* Test for and handle 4-byte result.   In the test immediately below, the
 370      * 8 is for start bytes F0-F7 (which are all the possible ones for 4 byte
 371      * characters).  The 3 is for 3 continuation bytes; these each contribute
 372      * SHIFT bits.  This yields 0x4_0000 on EBCDIC platforms, 0x20_0000 on
 373      * ASCII, so 4 bytes covers the range 0x4000-0x3_FFFF on EBCDIC;
 374      * 0x1_0000-0x1F_FFFF on ASCII */
 375     if (uv < (8 * (1U << (3 * SHIFT)))) {
 376         *d++ = I8_TO_NATIVE_UTF8(( uv >> ((4 - 1) * SHIFT)) | UTF_START_MARK(4));
 377         *d++ = I8_TO_NATIVE_UTF8(((uv >> ((3 - 1) * SHIFT)) & MASK) |   MARK);
 378         *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) |   MARK);
 379         *d++ = I8_TO_NATIVE_UTF8(( uv  /* (1 - 1) */        & MASK) |   MARK);
 380
 381 #ifdef EBCDIC   /* These were handled on ASCII platforms in the code for 3-byte
 382                    characters.  The end-plane non-characters for EBCDIC were
 383                    handled just above */
 384         if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv))) {
 385             HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
 386         }
 387         else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
 388             HANDLE_UNICODE_SURROGATE(uv, flags, msgs);
 389         }
 390 #endif
 391
 392         return d;
 393     }
 394
 395     /* Not 4-byte; that means the code point is at least 0x20_0000 on ASCII
 396      * platforms, and 0x4000 on EBCDIC.  At this point we switch to a loop
 397      * format.  The unrolled version above turns out to not save all that much
 398      * time, and at these high code points (well above the legal Unicode range
 399      * on ASCII platforms, and well above anything in common use in EBCDIC),
 400      * khw believes that less code outweighs slight performance gains. */
 401
 402     {
 403         STRLEN len  = OFFUNISKIP(uv);
 404         U8 *p = d+len-1;
 405         while (p > d) {
 406             *p-- = I8_TO_NATIVE_UTF8((uv & MASK) | MARK);
 407             uv >>= SHIFT;
 408         }
 409         *p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
 410         return d+len;
 411     }
 412 }
 413
 414 /*
 415 =for apidoc uvchr_to_utf8
 416
 417 Adds the UTF-8 representation of the native code point C<uv> to the end
 418 of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
 419 C<UTF8_MAXBYTES+1>) free bytes available.  The return value is the pointer to
 420 the byte after the end of the new character.  In other words,
 421
 422     d = uvchr_to_utf8(d, uv);
 423
 424 is the recommended wide native character-aware way of saying
 425
 426     *(d++) = uv;
 427
 428 This function accepts any code point from 0..C<IV_MAX> as input.
 429 C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
 430
 431 It is possible to forbid or warn on non-Unicode code points, or those that may
 432 be problematic by using L</uvchr_to_utf8_flags>.
 433
 434 =cut
 435 */
 436
 437 /* This is also a macro */
 438 PERL_CALLCONV U8*       Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
 439
 440 U8 *
 441 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
 442 {
 443     return uvchr_to_utf8(d, uv);
 444 }
 445
 446 /*
 447 =for apidoc uvchr_to_utf8_flags
 448
 449 Adds the UTF-8 representation of the native code point C<uv> to the end
 450 of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
 451 C<UTF8_MAXBYTES+1>) free bytes available.  The return value is the pointer to
 452 the byte after the end of the new character.  In other words,
 453
 454     d = uvchr_to_utf8_flags(d, uv, flags);
 455
 456 or, in most cases,
 457
 458     d = uvchr_to_utf8_flags(d, uv, 0);
 459
 460 This is the Unicode-aware way of saying
 461
 462     *(d++) = uv;
 463
 464 If C<flags> is 0, this function accepts any code point from 0..C<IV_MAX> as
 465 input.  C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
 466
 467 Specifying C<flags> can further restrict what is allowed and not warned on, as
 468 follows:
 469
 470 If C<uv> is a Unicode surrogate code point and C<UNICODE_WARN_SURROGATE> is set,
 471 the function will raise a warning, provided UTF8 warnings are enabled.  If
 472 instead C<UNICODE_DISALLOW_SURROGATE> is set, the function will fail and return
 473 NULL.  If both flags are set, the function will both warn and return NULL.
 474
 475 Similarly, the C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags
 476 affect how the function handles a Unicode non-character.
 477
 478 And likewise, the C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags
 479 affect the handling of code points that are above the Unicode maximum of
 480 0x10FFFF.  Languages other than Perl may not be able to accept files that
 481 contain these.
 482
 483 The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of
 484 the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all
 485 three DISALLOW flags.  C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> restricts the
 486 allowed inputs to the strict UTF-8 traditionally defined by Unicode.
 487 Similarly, C<UNICODE_WARN_ILLEGAL_C9_INTERCHANGE> and
 488 C<UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE> are shortcuts to select the
 489 above-Unicode and surrogate flags, but not the non-character ones, as
 490 defined in
 491 L<Unicode Corrigendum #9|https://www.unicode.org/versions/corrigendum9.html>.
 492 See L<perlunicode/Noncharacter code points>.
 493
 494 Extremely high code points were never specified in any standard, and require an
 495 extension to UTF-8 to express, which Perl does.  It is likely that programs
 496 written in something other than Perl would not be able to read files that
 497 contain these; nor would Perl understand files written by something that uses a
 498 different extension.  For these reasons, there is a separate set of flags that
 499 can warn and/or disallow these extremely high code points, even if other
 500 above-Unicode ones are accepted.  They are the C<UNICODE_WARN_PERL_EXTENDED>
 501 and C<UNICODE_DISALLOW_PERL_EXTENDED> flags.  For more information see
 502 L</C<UTF8_GOT_PERL_EXTENDED>>.  Of course C<UNICODE_DISALLOW_SUPER> will
 503 treat all above-Unicode code points, including these, as malformations.  (Note
 504 that the Unicode standard considers anything above 0x10FFFF to be illegal, but
 505 there are standards predating it that allow up to 0x7FFF_FFFF (2**31 - 1).)
 506
 507 A somewhat misleadingly named synonym for C<UNICODE_WARN_PERL_EXTENDED> is
 508 retained for backward compatibility: C<UNICODE_WARN_ABOVE_31_BIT>.  Similarly,
 509 C<UNICODE_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
 510 C<UNICODE_DISALLOW_PERL_EXTENDED>.  The names are misleading because on EBCDIC
 511 platforms, these flags can apply to code points that actually do fit in 31 bits.
 512 The new names accurately describe the situation in all cases.
 513
 514 =cut
 515 */
 516
 517 /* This is also a macro */
 518 PERL_CALLCONV U8*       Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
 519
 520 U8 *
 521 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
 522 {
 523     return uvchr_to_utf8_flags(d, uv, flags);
 524 }
 525
 526 #ifndef UV_IS_QUAD
 527
 528 STATIC int
 529 S_is_utf8_cp_above_31_bits(const U8 * const s,
 530                            const U8 * const e,
 531                            const bool consider_overlongs)
 532 {
 533     /* Returns TRUE if the first code point represented by the Perl-extended-
 534      * UTF-8-encoded string starting at 's', and looking no further than 'e -
 535      * 1' doesn't fit into 31 bytes.  That is, that if it is >= 2**31.
 536      *
 537      * The function handles the case where the input bytes do not include all
 538      * the ones necessary to represent a full character.  That is, they may be
 539      * the intial bytes of the representation of a code point, but possibly
 540      * the final ones necessary for the complete representation may be beyond
 541      * 'e - 1'.
 542      *
 543      * The function also can handle the case where the input is an overlong
 544      * sequence.  If 'consider_overlongs' is 0, the function assumes the
 545      * input is not overlong, without checking, and will return based on that
 546      * assumption.  If this parameter is 1, the function will go to the trouble
 547      * of figuring out if it actually evaluates to above or below 31 bits.
 548      *
 549      * The sequence is otherwise assumed to be well-formed, without checking.
 550      */
 551
 552     const STRLEN len = e - s;
 553     int is_overlong;
 554
 555     PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS;
 556
 557     assert(! UTF8_IS_INVARIANT(*s) && e > s);
 558
 559 #ifdef EBCDIC
 560
 561     PERL_UNUSED_ARG(consider_overlongs);
 562
 563     /* On the EBCDIC code pages we handle, only the native start byte 0xFE can
 564      * mean a 32-bit or larger code point (0xFF is an invariant).  0xFE can
 565      * also be the start byte for a 31-bit code point; we need at least 2
 566      * bytes, and maybe up through 8 bytes, to determine that.  (It can also be
 567      * the start byte for an overlong sequence, but for 30-bit or smaller code
 568      * points, so we don't have to worry about overlongs on EBCDIC.) */
 569     if (*s != 0xFE) {
 570         return 0;
 571     }
 572
 573     if (len == 1) {
 574         return -1;
 575     }
 576
 577 #else
 578
 579     /* On ASCII, FE and FF are the only start bytes that can evaluate to
 580      * needing more than 31 bits. */
 581     if (LIKELY(*s < 0xFE)) {
 582         return 0;
 583     }
 584
 585     /* What we have left are FE and FF.  Both of these require more than 31
 586      * bits unless they are for overlongs. */
 587     if (! consider_overlongs) {
 588         return 1;
 589     }
 590
 591     /* Here, we have FE or FF.  If the input isn't overlong, it evaluates to
 592      * above 31 bits.  But we need more than one byte to discern this, so if
 593      * passed just the start byte, it could be an overlong evaluating to
 594      * smaller */
 595     if (len == 1) {
 596         return -1;
 597     }
 598
 599     /* Having excluded len==1, and knowing that FE and FF are both valid start
 600      * bytes, we can call the function below to see if the sequence is
 601      * overlong.  (We don't need the full generality of the called function,
 602      * but for these huge code points, speed shouldn't be a consideration, and
 603      * the compiler does have enough information, since it's static to this
 604      * file, to optimize to just the needed parts.) */
 605     is_overlong = is_utf8_overlong_given_start_byte_ok(s, len);
 606
 607     /* If it isn't overlong, more than 31 bits are required. */
 608     if (is_overlong == 0) {
 609         return 1;
 610     }
 611
 612     /* If it is indeterminate if it is overlong, return that */
 613     if (is_overlong < 0) {
 614         return -1;
 615     }
 616
 617     /* Here is overlong.  Such a sequence starting with FE is below 31 bits, as
 618      * the max it can be is 2**31 - 1 */
 619     if (*s == 0xFE) {
 620         return 0;
 621     }
 622
 623 #endif
 624
 625     /* Here, ASCII and EBCDIC rejoin:
 626     *  On ASCII:   We have an overlong sequence starting with FF
 627     *  On EBCDIC:  We have a sequence starting with FE. */
 628
 629     {   /* For C89, use a block so the declaration can be close to its use */
 630
 631 #ifdef EBCDIC
 632
 633         /* U+7FFFFFFF (2 ** 31 - 1)
 634          *              [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10  11  12  13
 635          *   IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73
 636          *    IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72
 637          *   POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75
 638          *         I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF
 639          * U+80000000 (2 ** 31):
 640          *   IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
 641          *    IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
 642          *   POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
 643          *         I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0
 644          *
 645          * and since we know that *s = \xfe, any continuation sequcence
 646          * following it that is gt the below is above 31 bits
 647                                                 [0] [1] [2] [3] [4] [5] [6] */
 648         const U8 conts_for_highest_30_bit[] = "\x41\x41\x41\x41\x41\x41\x42";
 649
 650 #else
 651
 652         /* FF overlong for U+7FFFFFFF (2 ** 31 - 1)
 653          *      ASCII: \xFF\x80\x80\x80\x80\x80\x80\x81\xBF\xBF\xBF\xBF\xBF
 654          * FF overlong for U+80000000 (2 ** 31):
 655          *      ASCII: \xFF\x80\x80\x80\x80\x80\x80\x82\x80\x80\x80\x80\x80
 656          * and since we know that *s = \xff, any continuation sequcence
 657          * following it that is gt the below is above 30 bits
 658                                                 [0] [1] [2] [3] [4] [5] [6] */
 659         const U8 conts_for_highest_30_bit[] = "\x80\x80\x80\x80\x80\x80\x81";
 660
 661
 662 #endif
 663         const STRLEN conts_len = sizeof(conts_for_highest_30_bit) - 1;
 664         const STRLEN cmp_len = MIN(conts_len, len - 1);
 665
 666         /* Now compare the continuation bytes in s with the ones we have
 667          * compiled in that are for the largest 30 bit code point.  If we have
 668          * enough bytes available to determine the answer, or the bytes we do
 669          * have differ from them, we can compare the two to get a definitive
 670          * answer (Note that in UTF-EBCDIC, the two lowest possible
 671          * continuation bytes are \x41 and \x42.) */
 672         if (cmp_len >= conts_len || memNE(s + 1,
 673                                           conts_for_highest_30_bit,
 674                                           cmp_len))
 675         {
 676             return cBOOL(memGT(s + 1, conts_for_highest_30_bit, cmp_len));
 677         }
 678
 679         /* Here, all the bytes we have are the same as the highest 30-bit code
 680          * point, but we are missing so many bytes that we can't make the
 681          * determination */
 682         return -1;
 683     }
 684 }
 685
 686 #endif
 687
 688 PERL_STATIC_INLINE int
 689 S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
 690 {
 691     /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
 692      * 's' + 'len' - 1 is an overlong.  It returns 1 if it is an overlong; 0 if
 693      * it isn't, and -1 if there isn't enough information to tell.  This last
 694      * return value can happen if the sequence is incomplete, missing some
 695      * trailing bytes that would form a complete character.  If there are
 696      * enough bytes to make a definitive decision, this function does so.
 697      * Usually 2 bytes are sufficient.
 698      *
 699      * Overlongs can occur whenever the number of continuation bytes changes.
 700      * That means whenever the number of leading 1 bits in a start byte
 701      * increases from the next lower start byte.  That happens for start bytes
 702      * C0, E0, F0, F8, FC, FE, and FF.  On modern perls, the following illegal
 703      * start bytes have already been excluded, so don't need to be tested here;
 704      * ASCII platforms: C0, C1
 705      * EBCDIC platforms C0, C1, C2, C3, C4, E0
 706      */
 707
 708     const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);    /* Start byte, as I8 */
 709     const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);    /* Second byte, as I8 */
 710
 711     PERL_ARGS_ASSERT_IS_UTF8_OVERLONG_GIVEN_START_BYTE_OK;
 712     assert(len > 1 && UTF8_IS_START(*s));  /* s[1] was already read above */
 713
 714     /* Each platform has overlongs after the start bytes given above (expressed
 715      * in I8 for EBCDIC).  What constitutes an overlong varies by platform, but
 716      * the logic is the same, except the E0 overlong has already been excluded
 717      * on EBCDIC platforms.  The values below were found by manually
 718      * inspecting the UTF-8 patterns.  See the tables in utf8.h and
 719      * utfebcdic.h. */
 720
 721 #       ifdef EBCDIC
 722 #           define F0_ABOVE_OVERLONG 0xB0
 723 #           define F8_ABOVE_OVERLONG 0xA8
 724 #           define FC_ABOVE_OVERLONG 0xA4
 725 #           define FE_ABOVE_OVERLONG 0xA2
 726 #           define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
 727                                     /* I8(0xfe) is FF */
 728 #       else
 729
 730     if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
 731         return 1;
 732     }
 733
 734 #           define F0_ABOVE_OVERLONG 0x90
 735 #           define F8_ABOVE_OVERLONG 0x88
 736 #           define FC_ABOVE_OVERLONG 0x84
 737 #           define FE_ABOVE_OVERLONG 0x82
 738 #           define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
 739 #       endif
 740
 741
 742     if (   (s0 == 0xF0 && UNLIKELY(s1 < F0_ABOVE_OVERLONG))
 743         || (s0 == 0xF8 && UNLIKELY(s1 < F8_ABOVE_OVERLONG))
 744         || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
 745         || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
 746     {
 747         return 1;
 748     }
 749
 750     /* Check for the FF overlong; its 1, 0, or -1 result is returned as-is */
 751     return isFF_OVERLONG(s, len);
 752 }
 753
 754 PERL_STATIC_INLINE int
 755 S_isFF_OVERLONG(const U8 * const s, const STRLEN len)
 756 {
 757     /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
 758      * 's' + 'len' - 1 is an overlong beginning with \xFF.  It returns 1 if it
 759      * is; 0 if it isn't, and -1 if there isn't enough information to tell.
 760      * This last return value can happen if the sequence is incomplete, missing
 761      * some trailing bytes that would form a complete character.  If there are
 762      * enough bytes to make a definitive decision, this function does so. */
 763
 764     PERL_ARGS_ASSERT_ISFF_OVERLONG;
 765
 766     /* To be an FF overlong, all the available bytes must match the prefix */
 767     if (LIKELY(memNE(s, FF_OVERLONG_PREFIX,
 768                      MIN(len, sizeof(FF_OVERLONG_PREFIX) - 1))))
 769     {
 770         return 0;
 771     }
 772
 773     /* To be an FF overlong sequence, all the bytes in FF_OVERLONG_PREFIX must
 774      * be there; what comes after them doesn't matter.  See tables in utf8.h,
 775      * utfebcdic.h. */
 776     if (len >= sizeof(FF_OVERLONG_PREFIX) - 1) {
 777         return 1;
 778     }
 779
 780     /* The missing bytes could cause the result to go one way or the other, so
 781      * the result is indeterminate */
 782     return -1;
 783 }
 784
 785 #if defined(UV_IS_QUAD) /* These assume IV_MAX is 2**63-1 */
 786 #  ifdef EBCDIC     /* Actually is I8 */
 787 #   define HIGHEST_REPRESENTABLE_UTF8                                       \
 788                 "\xFF\xA7\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
 789 #  else
 790 #   define HIGHEST_REPRESENTABLE_UTF8                                       \
 791                 "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
 792 #  endif
 793 #endif
 794
 795 PERL_STATIC_INLINE int
 796 S_does_utf8_overflow(const U8 * const s,
 797                      const U8 * e,
 798                      const bool consider_overlongs)
 799 {
 800     /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
 801      * 'e' - 1 would overflow an IV on this platform; that is if it represents
 802      * a code point larger than the highest representable code point.  It
 803      * returns 1 if it does overflow; 0 if it doesn't, and -1 if there isn't
 804      * enough information to tell.  This last return value can happen if the
 805      * sequence is incomplete, missing some trailing bytes that would form a
 806      * complete character.  If there are enough bytes to make a definitive
 807      * decision, this function does so.
 808      *
 809      * If 'consider_overlongs' is TRUE, the function checks for the possibility
 810      * that the sequence is an overlong that doesn't overflow.  Otherwise, it
 811      * assumes the sequence is not an overlong.  This can give different
 812      * results only on ASCII 32-bit platforms.
 813      *
 814      * (For ASCII platforms, we could use memcmp() because we don't have to
 815      * convert each byte to I8, but it's very rare input indeed that would
 816      * approach overflow, so the loop below will likely only get executed once.)
 817      *
 818      * 'e' - 1 must not be beyond a full character. */
 819
 820
 821     PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
 822     assert(s <= e && s + UTF8SKIP(s) >= e);
 823
 824 #if ! defined(UV_IS_QUAD)
 825
 826     return is_utf8_cp_above_31_bits(s, e, consider_overlongs);
 827
 828 #else
 829
 830     PERL_UNUSED_ARG(consider_overlongs);
 831
 832     {
 833         const STRLEN len = e - s;
 834         const U8 *x;
 835         const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
 836
 837         for (x = s; x < e; x++, y++) {
 838
 839             if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) == *y)) {
 840                 continue;
 841             }
 842
 843             /* If this byte is larger than the corresponding highest UTF-8
 844              * byte, the sequence overflows; otherwise the byte is less than
 845              * it, and so the sequence doesn't overflow */
 846             return NATIVE_UTF8_TO_I8(*x) > *y;
 847
 848         }
 849
 850         /* Got to the end and all bytes are the same.  If the input is a whole
 851          * character, it doesn't overflow.  And if it is a partial character,
 852          * there's not enough information to tell */
 853         if (len < sizeof(HIGHEST_REPRESENTABLE_UTF8) - 1) {
 854             return -1;
 855         }
 856
 857         return 0;
 858     }
 859
 860 #endif
 861
 862 }
 863
 864 #if 0
 865
 866 /* These are the portions of the above function that deal with UV_MAX instead
 867  * of IV_MAX.  They are left here in case we want to combine them so that
 868  * internal uses can have larger code points.  The only logic difference is
 869  * that the 32-bit EBCDIC platform is treated like the 64-bit, and the 32-bit
 870  * ASCII has different logic.
 871  */
 872
 873 /* Anything larger than this will overflow the word if it were converted into a UV */
 874 #if defined(UV_IS_QUAD)
 875 #  ifdef EBCDIC     /* Actually is I8 */
 876 #   define HIGHEST_REPRESENTABLE_UTF8                                       \
 877                 "\xFF\xAF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
 878 #  else
 879 #   define HIGHEST_REPRESENTABLE_UTF8                                       \
 880                 "\xFF\x80\x8F\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
 881 #  endif
 882 #else   /* 32-bit */
 883 #  ifdef EBCDIC
 884 #   define HIGHEST_REPRESENTABLE_UTF8                                       \
 885                 "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3\xBF\xBF\xBF\xBF\xBF\xBF"
 886 #  else
 887 #   define HIGHEST_REPRESENTABLE_UTF8  "\xFE\x83\xBF\xBF\xBF\xBF\xBF"
 888 #  endif
 889 #endif
 890
 891 #if ! defined(UV_IS_QUAD) && ! defined(EBCDIC)
 892
 893     /* On 32 bit ASCII machines, many overlongs that start with FF don't
 894      * overflow */
 895     if (consider_overlongs && isFF_OVERLONG(s, len) > 0) {
 896
 897         /* To be such an overlong, the first bytes of 's' must match
 898          * FF_OVERLONG_PREFIX, which is "\xff\x80\x80\x80\x80\x80\x80".  If we
 899          * don't have any additional bytes available, the sequence, when
 900          * completed might or might not fit in 32 bits.  But if we have that
 901          * next byte, we can tell for sure.  If it is <= 0x83, then it does
 902          * fit. */
 903         if (len <= sizeof(FF_OVERLONG_PREFIX) - 1) {
 904             return -1;
 905         }
 906
 907         return s[sizeof(FF_OVERLONG_PREFIX) - 1] > 0x83;
 908     }
 909
 910 /* Starting with the #else, the rest of the function is identical except
 911  *      1.  we need to move the 'len' declaration to be global to the function
 912  *      2.  the endif move to just after the UNUSED_ARG.
 913  * An empty endif is given just below to satisfy the preprocessor
 914  */
 915 #endif
 916
 917 #endif
 918
 919 #undef F0_ABOVE_OVERLONG
 920 #undef F8_ABOVE_OVERLONG
 921 #undef FC_ABOVE_OVERLONG
 922 #undef FE_ABOVE_OVERLONG
 923 #undef FF_OVERLONG_PREFIX
 924
 925 STRLEN
 926 Perl_is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
 927 {
 928     STRLEN len;
 929     const U8 *x;
 930
 931     /* A helper function that should not be called directly.
 932      *
 933      * This function returns non-zero if the string beginning at 's' and
 934      * looking no further than 'e - 1' is well-formed Perl-extended-UTF-8 for a
 935      * code point; otherwise it returns 0.  The examination stops after the
 936      * first code point in 's' is validated, not looking at the rest of the
 937      * input.  If 'e' is such that there are not enough bytes to represent a
 938      * complete code point, this function will return non-zero anyway, if the
 939      * bytes it does have are well-formed UTF-8 as far as they go, and aren't
 940      * excluded by 'flags'.
 941      *
 942      * A non-zero return gives the number of bytes required to represent the
 943      * code point.  Be aware that if the input is for a partial character, the
 944      * return will be larger than 'e - s'.
 945      *
 946      * This function assumes that the code point represented is UTF-8 variant.
 947      * The caller should have excluded the possibility of it being invariant
 948      * before calling this function.
 949      *
 950      * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
 951      * accepted by L</utf8n_to_uvchr>.  If non-zero, this function will return
 952      * 0 if the code point represented is well-formed Perl-extended-UTF-8, but
 953      * disallowed by the flags.  If the input is only for a partial character,
 954      * the function will return non-zero if there is any sequence of
 955      * well-formed UTF-8 that, when appended to the input sequence, could
 956      * result in an allowed code point; otherwise it returns 0.  Non characters
 957      * cannot be determined based on partial character input.  But many of the
 958      * other excluded types can be determined with just the first one or two
 959      * bytes.
 960      *
 961      */
 962
 963     PERL_ARGS_ASSERT_IS_UTF8_CHAR_HELPER;
 964
 965     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
 966                           |UTF8_DISALLOW_PERL_EXTENDED)));
 967     assert(! UTF8_IS_INVARIANT(*s));
 968
 969     /* A variant char must begin with a start byte */
 970     if (UNLIKELY(! UTF8_IS_START(*s))) {
 971         return 0;
 972     }
 973
 974     /* Examine a maximum of a single whole code point */
 975     if (e - s > UTF8SKIP(s)) {
 976         e = s + UTF8SKIP(s);
 977     }
 978
 979     len = e - s;
 980
 981     if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) {
 982         const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
 983
 984         /* Here, we are disallowing some set of largish code points, and the
 985          * first byte indicates the sequence is for a code point that could be
 986          * in the excluded set.  We generally don't have to look beyond this or
 987          * the second byte to see if the sequence is actually for one of the
 988          * excluded classes.  The code below is derived from this table:
 989          *
 990          *              UTF-8            UTF-EBCDIC I8
 991          *   U+D800: \xED\xA0\x80      \xF1\xB6\xA0\xA0      First surrogate
 992          *   U+DFFF: \xED\xBF\xBF      \xF1\xB7\xBF\xBF      Final surrogate
 993          * U+110000: \xF4\x90\x80\x80  \xF9\xA2\xA0\xA0\xA0  First above Unicode
 994          *
 995          * Keep in mind that legal continuation bytes range between \x80..\xBF
 996          * for UTF-8, and \xA0..\xBF for I8.  Any byte above those isn't a
 997          * continuation byte.  Hence, we don't have to test the upper edge
 998          * because if any of those is encountered, the sequence is malformed,
 999          * and would fail elsewhere in this function.
1000          *
1001          * The code here likewise assumes that there aren't other
1002          * malformations; again the function should fail elsewhere because of
1003          * these.  For example, an overlong beginning with FC doesn't actually
1004          * have to be a super; it could actually represent a small code point,
1005          * even U+0000.  But, since overlongs (and other malformations) are
1006          * illegal, the function should return 0 in either case.
1007          */
1008
1009 #ifdef EBCDIC   /* On EBCDIC, these are actually I8 bytes */
1010 #  define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER  0xFA
1011 #  define IS_UTF8_2_BYTE_SUPER(s0, s1)           ((s0) == 0xF9 && (s1) >= 0xA2)
1012
1013 #  define IS_UTF8_2_BYTE_SURROGATE(s0, s1)       ((s0) == 0xF1              \
1014                                                        /* B6 and B7 */      \
1015                                               && ((s1) & 0xFE ) == 0xB6)
1016 #  define isUTF8_PERL_EXTENDED(s)   (*s == I8_TO_NATIVE_UTF8(0xFF))
1017 #else
1018 #  define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER  0xF5
1019 #  define IS_UTF8_2_BYTE_SUPER(s0, s1)           ((s0) == 0xF4 && (s1) >= 0x90)
1020 #  define IS_UTF8_2_BYTE_SURROGATE(s0, s1)       ((s0) == 0xED && (s1) >= 0xA0)
1021 #  define isUTF8_PERL_EXTENDED(s)   (*s >= 0xFE)
1022 #endif
1023
1024         if (  (flags & UTF8_DISALLOW_SUPER)
1025             && UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
1026         {
1027             return 0;           /* Above Unicode */
1028         }
1029
1030         if (   (flags & UTF8_DISALLOW_PERL_EXTENDED)
1031             &&  UNLIKELY(isUTF8_PERL_EXTENDED(s)))
1032         {
1033             return 0;           /* Perl extended UTF-8 */
1034         }
1035
1036         if (len > 1) {
1037             const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
1038
1039             if (   (flags & UTF8_DISALLOW_SUPER)
1040                 &&  UNLIKELY(IS_UTF8_2_BYTE_SUPER(s0, s1)))
1041             {
1042                 return 0;       /* Above Unicode */
1043             }
1044
1045             if (   (flags & UTF8_DISALLOW_SURROGATE)
1046                 &&  UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(s0, s1)))
1047             {
1048                 return 0;       /* Surrogate */
1049             }
1050
1051             if (  (flags & UTF8_DISALLOW_NONCHAR)
1052                 && UNLIKELY(UTF8_IS_NONCHAR(s, e)))
1053             {
1054                 return 0;       /* Noncharacter code point */
1055             }
1056         }
1057     }
1058
1059     /* Make sure that all that follows are continuation bytes */
1060     for (x = s + 1; x < e; x++) {
1061         if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) {
1062             return 0;
1063         }
1064     }
1065
1066     /* Here is syntactically valid.  Next, make sure this isn't the start of an
1067      * overlong.  (An indeterminate, negative, result is not a failure.) */
1068     if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len) > 0) {
1069         return 0;
1070     }
1071
1072     /* And finally, that the code point represented fits in a word on this
1073      * platform.  (Again, only a definite overflow causes failure.) */
1074     if (0 < does_utf8_overflow(s, e,
1075                                0 /* Don't consider overlongs */
1076                               ))
1077     {
1078         return 0;
1079     }
1080
1081     return UTF8SKIP(s);
1082 }
1083
1084 char *
1085 Perl__byte_dump_string(pTHX_ const U8 * const start, const STRLEN len, const bool format)
1086 {
1087     /* Returns a mortalized C string that is a displayable copy of the 'len'
1088      * bytes starting at 'start'.  'format' gives how to display each byte.
1089      * Currently, there are only two formats, so it is currently a bool:
1090      *      0   \xab
1091      *      1    ab         (that is, a space between each two-hex-digit byte)
1092      */
1093
1094     const STRLEN output_len = 4 * len + 1;  /* At most 4 output bytes per
1095                                                input byte, plus trailing NUL */
1096     const U8 * s = start;
1097     const U8 * const e = start + len;
1098     char * output;
1099     char * d;
1100
1101     PERL_ARGS_ASSERT__BYTE_DUMP_STRING;
1102
1103     Newx(output, output_len, char);
1104     SAVEFREEPV(output);
1105
1106     d = output;
1107     for (s = start; s < e; s++) {
1108         const unsigned high_nibble = (*s & 0xF0) >> 4;
1109         const unsigned low_nibble =  (*s & 0x0F);
1110
1111         if (format) {
1112             if (s > start) {    /* Separator only between bytes */
1113                 *d++ = ' ';
1114             }
1115         }
1116         else {
1117             *d++ = '\\';
1118             *d++ = 'x';
1119         }
1120
1121         if (high_nibble < 10) { /* Each nibble becomes a lowercase hex digit */
1122             *d++ = high_nibble + '0';
1123         }
1124         else {
1125             *d++ = high_nibble - 10 + 'a';
1126         }
1127
1128         if (low_nibble < 10) {
1129             *d++ = low_nibble + '0';
1130         }
1131         else {
1132             *d++ = low_nibble - 10 + 'a';
1133         }
1134     }
1135
1136     *d = '\0';
1137     return output;
1138 }
1139
1140 PERL_STATIC_INLINE char *
1141 S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
1142
1143                                          /* Max number of bytes to print */
1144                                          STRLEN print_len,
1145
1146                                          /* Which one is the non-continuation */
1147                                          const STRLEN non_cont_byte_pos,
1148
1149                                          /* How many bytes should there be? */
1150                                          const STRLEN expect_len)
1151 {
1152     /* Return the malformation warning text for an unexpected
1153      * non-continuation byte. */
1154
1155     const char * const where = (non_cont_byte_pos == 1)
1156                                ? "immediately"
1157                                : Perl_form(aTHX_ "%d bytes",
1158                                                  (int) non_cont_byte_pos);
1159     const U8 * x = s + non_cont_byte_pos;
1160     const U8 * e = s + print_len;
1161
1162     PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
1163
1164     /* We don't need to pass this parameter, but since it has already been
1165      * calculated, it's likely faster to pass it; verify under DEBUGGING */
1166     assert(expect_len == UTF8SKIP(s));
1167
1168     /* As a defensive coding measure, don't output anything past a NUL.  Such
1169      * bytes shouldn't be in the middle of a malformation, and could mark the
1170      * end of the allocated string, and what comes after is undefined */
1171     for (; x < e; x++) {
1172         if (*x == '\0') {
1173             x++;            /* Output this particular NUL */
1174             break;
1175         }
1176     }
1177
1178     return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
1179                            " %s after start byte 0x%02x; need %d bytes, got %d)",
1180                            malformed_text,
1181                            _byte_dump_string(s, x - s, 0),  /* \xab format */
1182                            *(s + non_cont_byte_pos),
1183                            where,
1184                            *s,
1185                            (int) expect_len,
1186                            (int) non_cont_byte_pos);
1187 }
1188
1189 /*
1190
1191 =for apidoc utf8n_to_uvchr
1192
1193 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
1194 Most code should use L</utf8_to_uvchr_buf>() rather than call this
1195 directly.
1196
1197 Bottom level UTF-8 decode routine.
1198 Returns the native code point value of the first character in the string C<s>,
1199 which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
1200 C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
1201 the length, in bytes, of that character.
1202
1203 The value of C<flags> determines the behavior when C<s> does not point to a
1204 well-formed UTF-8 character.  If C<flags> is 0, encountering a malformation
1205 causes zero to be returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>)
1206 is the next possible position in C<s> that could begin a non-malformed
1207 character.  Also, if UTF-8 warnings haven't been lexically disabled, a warning
1208 is raised.  Some UTF-8 input sequences may contain multiple malformations.
1209 This function tries to find every possible one in each call, so multiple
1210 warnings can be raised for the same sequence.
1211
1212 Various ALLOW flags can be set in C<flags> to allow (and not warn on)
1213 individual types of malformations, such as the sequence being overlong (that
1214 is, when there is a shorter sequence that can express the same code point;
1215 overlong sequences are expressly forbidden in the UTF-8 standard due to
1216 potential security issues).  Another malformation example is the first byte of
1217 a character not being a legal first byte.  See F<utf8.h> for the list of such
1218 flags.  Even if allowed, this function generally returns the Unicode
1219 REPLACEMENT CHARACTER when it encounters a malformation.  There are flags in
1220 F<utf8.h> to override this behavior for the overlong malformations, but don't
1221 do that except for very specialized purposes.
1222
1223 The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other
1224 flags) malformation is found.  If this flag is set, the routine assumes that
1225 the caller will raise a warning, and this function will silently just set
1226 C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
1227
1228 Note that this API requires disambiguation between successful decoding a C<NUL>
1229 character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as
1230 in both cases, 0 is returned, and, depending on the malformation, C<retlen> may
1231 be set to 1.  To disambiguate, upon a zero return, see if the first byte of
1232 C<s> is 0 as well.  If so, the input was a C<NUL>; if not, the input had an
1233 error.  Or you can use C<L</utf8n_to_uvchr_error>>.
1234
1235 Certain code points are considered problematic.  These are Unicode surrogates,
1236 Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
1237 By default these are considered regular code points, but certain situations
1238 warrant special handling for them, which can be specified using the C<flags>
1239 parameter.  If C<flags> contains C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all
1240 three classes are treated as malformations and handled as such.  The flags
1241 C<UTF8_DISALLOW_SURROGATE>, C<UTF8_DISALLOW_NONCHAR>, and
1242 C<UTF8_DISALLOW_SUPER> (meaning above the legal Unicode maximum) can be set to
1243 disallow these categories individually.  C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>
1244 restricts the allowed inputs to the strict UTF-8 traditionally defined by
1245 Unicode.  Use C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE> to use the strictness
1246 definition given by
1247 L<Unicode Corrigendum #9|https://www.unicode.org/versions/corrigendum9.html>.
1248 The difference between traditional strictness and C9 strictness is that the
1249 latter does not forbid non-character code points.  (They are still discouraged,
1250 however.)  For more discussion see L<perlunicode/Noncharacter code points>.
1251
1252 The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>,
1253 C<UTF8_WARN_ILLEGAL_C9_INTERCHANGE>, C<UTF8_WARN_SURROGATE>,
1254 C<UTF8_WARN_NONCHAR>, and C<UTF8_WARN_SUPER> will cause warning messages to be
1255 raised for their respective categories, but otherwise the code points are
1256 considered valid (not malformations).  To get a category to both be treated as
1257 a malformation and raise a warning, specify both the WARN and DISALLOW flags.
1258 (But note that warnings are not raised if lexically disabled nor if
1259 C<UTF8_CHECK_ONLY> is also specified.)
1260
1261 Extremely high code points were never specified in any standard, and require an
1262 extension to UTF-8 to express, which Perl does.  It is likely that programs
1263 written in something other than Perl would not be able to read files that
1264 contain these; nor would Perl understand files written by something that uses a
1265 different extension.  For these reasons, there is a separate set of flags that
1266 can warn and/or disallow these extremely high code points, even if other
1267 above-Unicode ones are accepted.  They are the C<UTF8_WARN_PERL_EXTENDED> and
1268 C<UTF8_DISALLOW_PERL_EXTENDED> flags.  For more information see
1269 L</C<UTF8_GOT_PERL_EXTENDED>>.  Of course C<UTF8_DISALLOW_SUPER> will treat all
1270 above-Unicode code points, including these, as malformations.
1271 (Note that the Unicode standard considers anything above 0x10FFFF to be
1272 illegal, but there are standards predating it that allow up to 0x7FFF_FFFF
1273 (2**31 - 1).)
1274
1275 A somewhat misleadingly named synonym for C<UTF8_WARN_PERL_EXTENDED> is
1276 retained for backward compatibility: C<UTF8_WARN_ABOVE_31_BIT>.  Similarly,
1277 C<UTF8_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
1278 C<UTF8_DISALLOW_PERL_EXTENDED>.  The names are misleading because these flags
1279 can apply to code points that actually do fit in 31 bits.  This happens on
1280 EBCDIC platforms, and sometimes when the L<overlong
1281 malformation|/C<UTF8_GOT_LONG>> is also present.  The new names accurately
1282 describe the situation in all cases.
1283
1284
1285 All other code points corresponding to Unicode characters, including private
1286 use and those yet to be assigned, are never considered malformed and never
1287 warn.
1288
1289 =cut
1290
1291 Also implemented as a macro in utf8.h
1292 */
1293
1294 UV
1295 Perl_utf8n_to_uvchr(const U8 *s,
1296                     STRLEN curlen,
1297                     STRLEN *retlen,
1298                     const U32 flags)
1299 {
1300     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
1301     /* Thin wrapper: a NULL 'errors' argument asks for no error details */
1302     return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL);
1303 }
1304
1305 /*
1306
1307 =for apidoc utf8n_to_uvchr_error
1308
1309 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
1310 Most code should use L</utf8_to_uvchr_buf>() rather than call this
1311 directly.
1312
1313 This function is for code that needs to know what the precise malformation(s)
1314 are when an error is found.  If you also need to know the generated warning
1315 messages, use L</utf8n_to_uvchr_msgs>() instead.
1316
1317 It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after
1318 all the others, C<errors>.  If this parameter is 0, this function behaves
1319 identically to C<L</utf8n_to_uvchr>>.  Otherwise, C<errors> should be a pointer
1320 to a C<U32> variable, which this function sets to indicate any errors found.
1321 Upon return, if C<*errors> is 0, there were no errors found.  Otherwise,
1322 C<*errors> is the bit-wise C<OR> of the bits described in the list below.  Some
1323 of these bits will be set if a malformation is found, even if the input
1324 C<flags> parameter indicates that the given malformation is allowed; those
1325 exceptions are noted:
1326
1327 =over 4
1328
1329 =item C<UTF8_GOT_PERL_EXTENDED>
1330
1331 The input sequence is not standard UTF-8, but a Perl extension.  This bit is
1332 set only if the input C<flags> parameter contains either the
1333 C<UTF8_DISALLOW_PERL_EXTENDED> or the C<UTF8_WARN_PERL_EXTENDED> flags.
1334
1335 Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
1336 and so some extension must be used to express them.  Perl uses a natural
1337 extension to UTF-8 to represent the ones up to 2**36-1, and invented a further
1338 extension to represent even higher ones, so that any code point that fits in a
1339 64-bit word can be represented.  Text using these extensions is not likely to
1340 be portable to non-Perl code.  We lump both of these extensions together and
1341 refer to them as Perl extended UTF-8.  There exist other extensions that people
1342 have invented, incompatible with Perl's.
1343
1344 On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
1345 extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
1346 than on ASCII.  Prior to that, code points 2**31 and higher were simply
1347 unrepresentable, and a different, incompatible method was used to represent
1348 code points between 2**30 and 2**31 - 1.
1349
1350 On both platforms, ASCII and EBCDIC, C<UTF8_GOT_PERL_EXTENDED> is set if
1351 Perl extended UTF-8 is used.
1352
1353 In earlier Perls, this bit was named C<UTF8_GOT_ABOVE_31_BIT>, which you still
1354 may use for backward compatibility.  That name is misleading, as this flag may
1355 be set when the code point actually does fit in 31 bits.  This happens on
1356 EBCDIC platforms, and sometimes when the L<overlong
1357 malformation|/C<UTF8_GOT_LONG>> is also present.  The new name accurately
1358 describes the situation in all cases.
1359
1360 =item C<UTF8_GOT_CONTINUATION>
1361
1362 The input sequence was malformed in that the first byte was a UTF-8
1363 continuation byte.
1364
1365 =item C<UTF8_GOT_EMPTY>
1366
1367 The input C<curlen> parameter was 0.
1368
1369 =item C<UTF8_GOT_LONG>
1370
1371 The input sequence was malformed in that there is some other sequence that
1372 evaluates to the same code point, but that sequence is shorter than this one.
1373
1374 Until Unicode 3.1, it was legal for programs to accept this malformation, but
1375 it was discovered that this created security issues.
1376
1377 =item C<UTF8_GOT_NONCHAR>
1378
1379 The code point represented by the input UTF-8 sequence is for a Unicode
1380 non-character code point.
1381 This bit is set only if the input C<flags> parameter contains either the
1382 C<UTF8_DISALLOW_NONCHAR> or the C<UTF8_WARN_NONCHAR> flags.
1383
1384 =item C<UTF8_GOT_NON_CONTINUATION>
1385
1386 The input sequence was malformed in that a non-continuation type byte was found
1387 in a position where only a continuation type one should be.  See also
1388 L</C<UTF8_GOT_SHORT>>.
1389
1390 =item C<UTF8_GOT_OVERFLOW>
1391
1392 The input sequence was malformed in that it is for a code point that is not
1393 representable in the number of bits available in an IV on the current platform.
1394
1395 =item C<UTF8_GOT_SHORT>
1396
1397 The input sequence was malformed in that C<curlen> is smaller than required for
1398 a complete sequence.  In other words, the input is for a partial character
1399 sequence.
1400
1401
1402 C<UTF8_GOT_SHORT> and C<UTF8_GOT_NON_CONTINUATION> both indicate a too short
1403 sequence.  The difference is that C<UTF8_GOT_NON_CONTINUATION> always indicates
1404 that there is an error, while C<UTF8_GOT_SHORT> means that an incomplete
1405 sequence was looked at.   If no other flags are present, it means that the
1406 sequence was valid as far as it went.  Depending on the application, this could
1407 mean one of three things:
1408
1409 =over
1410
1411 =item *
1412
1413 The C<curlen> length parameter passed in was too small, and the function was
1414 prevented from examining all the necessary bytes.
1415
1416 =item *
1417
1418 The buffer being looked at is based on reading data, and the data received so
1419 far stopped in the middle of a character, so that the next read will
1420 read the remainder of this character.  (It is up to the caller to deal with the
1421 split bytes somehow.)
1422
1423 =item *
1424
1425 This is a real error, and the partial sequence is all we're going to get.
1426
1427 =back
1428
1429 =item C<UTF8_GOT_SUPER>
1430
1431 The input sequence was malformed in that it is for a non-Unicode code point;
1432 that is, one above the legal Unicode maximum.
1433 This bit is set only if the input C<flags> parameter contains either the
1434 C<UTF8_DISALLOW_SUPER> or the C<UTF8_WARN_SUPER> flags.
1435
1436 =item C<UTF8_GOT_SURROGATE>
1437
1438 The input sequence was malformed in that it is for a Unicode UTF-16 surrogate
1439 code point.
1440 This bit is set only if the input C<flags> parameter contains either the
1441 C<UTF8_DISALLOW_SURROGATE> or the C<UTF8_WARN_SURROGATE> flags.
1442
1443 =back
1444
1445 To do your own error handling, call this function with the C<UTF8_CHECK_ONLY>
1446 flag to suppress any warnings, and then examine the C<*errors> return.
1447
1448 =cut
1449
1450 Also implemented as a macro in utf8.h
1451 */
1452
1453 UV
1454 Perl_utf8n_to_uvchr_error(const U8 *s,
1455                           STRLEN curlen,
1456                           STRLEN *retlen,
1457                           const U32 flags,
1458                           U32 * errors)
1459 {
1460     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
1461     /* NULL 'msgs': any warnings are raised normally rather than returned */
1462     return utf8n_to_uvchr_msgs(s, curlen, retlen, flags, errors, NULL);
1463 }
1464
1465 /*
1466
1467 =for apidoc utf8n_to_uvchr_msgs
1468
1469 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
1470 Most code should use L</utf8_to_uvchr_buf>() rather than call this
1471 directly.
1472
1473 This function is for code that needs to know what the precise malformation(s)
1474 are when an error is found, and wants the corresponding warning and/or error
1475 messages to be returned to the caller rather than be displayed.  All messages
1476 that would have been displayed if all lexical warnings are enabled will be
1477 returned.
1478
1479 It is just like C<L</utf8n_to_uvchr_error>> but it takes an extra parameter
1480 placed after all the others, C<msgs>.  If this parameter is 0, this function
1481 behaves identically to C<L</utf8n_to_uvchr_error>>.  Otherwise, C<msgs> should
1482 be a pointer to an C<AV *> variable, in which this function creates a new AV to
1483 contain any appropriate messages.  The elements of the array are ordered so
1484 that the first message that would have been displayed is in the 0th element,
1485 and so on.  Each element is a hash with three key-value pairs, as follows:
1486
1487 =over 4
1488
1489 =item C<text>
1490
1491 The text of the message as a C<SVpv>.
1492
1493 =item C<warn_categories>
1494
1495 The warning category (or categories) packed into a C<SVuv>.
1496
1497 =item C<flag>
1498
1499 A single flag bit associated with this message, in a C<SVuv>.
1500 The bit corresponds to some bit in the C<*errors> return value,
1501 such as C<UTF8_GOT_LONG>.
1502
1503 =back
1504
1505 It's important to note that specifying this parameter as non-null will cause
1506 any warnings this function would otherwise generate to be suppressed, and
1507 instead be placed in C<*msgs>.  The caller can check the lexical warnings state
1508 (or not) when choosing what to do with the returned messages.
1509
1510 If the flag C<UTF8_CHECK_ONLY> is passed, no warnings are generated, and hence
1511 no AV is created.
1512
1513 The caller, of course, is responsible for freeing any returned AV.
1514
1515 =cut
1516 */
1517
1518 UV
1519 Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
1520                                STRLEN curlen,
1521                                STRLEN *retlen,
1522                                const U32 flags,
1523                                U32 * errors,
1524                                AV ** msgs)
1525 {
1526     const U8 * const s0 = s;
1527     const U8 * send = s0 + curlen;
1528     U32 possible_problems;  /* A bit is set here for each potential problem
1529                                found as we go along */
1530     UV uv;
1531     STRLEN expectlen;     /* How long should this sequence be? */
1532     STRLEN avail_len;     /* When input is too short, gives what that is */
1533     U32 discard_errors;   /* Used to save branches when 'errors' is NULL; this
1534                              gets set and discarded */
1535
1536     /* The below are used only if there is both an overlong malformation and a
1537      * too short one.  Otherwise the first two are set to 's0' and 'send', and
1538      * the third not used at all */
1539     U8 * adjusted_s0;
1540     U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
1541                                             routine; see [perl #130921] */
1542     UV uv_so_far;
1543     dTHX;
1544
1545     PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_MSGS_HELPER;
1546
1547     /* Here, is one of: a) malformed; b) a problematic code point (surrogate,
1548      * non-unicode, or nonchar); or c) on ASCII platforms, one of the Hangul
1549      * syllables that the dfa doesn't properly handle.  Quickly dispose of the
1550      * final case. */
1551
1552 #ifndef EBCDIC
1553
1554     /* Each of the affected Hanguls starts with \xED */
1555
1556     if (is_HANGUL_ED_utf8_safe(s0, send)) {
1557         if (retlen) {
1558             *retlen = 3;
1559         }
1560         if (errors) {
1561             *errors = 0;
1562         }
1563         if (msgs) {
1564             *msgs = NULL;
1565         }
1566
1567         return ((0xED & UTF_START_MASK(3)) << (2 * UTF_ACCUMULATION_SHIFT))
1568              | ((s0[1] & UTF_CONTINUATION_MASK) << UTF_ACCUMULATION_SHIFT)
1569              |  (s0[2] & UTF_CONTINUATION_MASK);
1570     }
1571
1572 #endif
1573
1574     /* In conjunction with the exhaustive tests that can be enabled in
1575      * APItest/t/utf8_warn_base.pl, this can make sure the dfa does precisely
1576      * what it is intended to do, and that no flaws in it are masked by
1577      * dropping down and executing the code below
1578     assert(! isUTF8_CHAR(s0, send)
1579           || UTF8_IS_SURROGATE(s0, send)
1580           || UTF8_IS_SUPER(s0, send)
1581           || UTF8_IS_NONCHAR(s0,send));
1582     */
1583
1584     s = s0;
1585     uv = *s0;
1586     possible_problems = 0;
1587     expectlen = 0;
1588     avail_len = 0;
1589     discard_errors = 0;
1590     adjusted_s0 = (U8 *) s0;
1591     uv_so_far = 0;
1592
1593     if (errors) {
1594         *errors = 0;
1595     }
1596     else {
1597         errors = &discard_errors;
1598     }
1599
1600     /* The order of malformation tests here is important.  We should consume as
1601      * few bytes as possible in order to not skip any valid character.  This is
1602      * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
1603      * https://unicode.org/reports/tr36 for more discussion as to why.  For
1604      * example, once we've done a UTF8SKIP, we can tell the expected number of
1605      * bytes, and could fail right off the bat if the input parameters indicate
1606      * that there are too few available.  But it could be that just that first
1607      * byte is garbled, and the intended character occupies fewer bytes.  If we
1608      * blindly assumed that the first byte is correct, and skipped based on
1609      * that number, we could skip over a valid input character.  So instead, we
1610      * always examine the sequence byte-by-byte.
1611      *
1612      * We also should not consume too few bytes, otherwise someone could inject
1613      * things.  For example, an input could be deliberately designed to
1614      * overflow, and if this code bailed out immediately upon discovering that,
1615      * returning to the caller C<*retlen> pointing to the very next byte (one
1616      * which is actually part of the overflowing sequence), that could look
1617      * legitimate to the caller, which could discard the initial partial
1618      * sequence and process the rest, inappropriately.
1619      *
1620      * Some possible input sequences are malformed in more than one way.  This
1621      * function goes to lengths to try to find all of them.  This is necessary
1622      * for correctness, as the inputs may allow one malformation but not
1623      * another, and if we abandon searching for others after finding the
1624      * allowed one, we could allow in something that shouldn't have been.
1625      */
1626
1627     if (UNLIKELY(curlen == 0)) {
1628         possible_problems |= UTF8_GOT_EMPTY;
1629         curlen = 0;
1630         uv = UNICODE_REPLACEMENT;
1631         goto ready_to_handle_errors;
1632     }
1633
1634     expectlen = UTF8SKIP(s);
1635
1636     /* A well-formed UTF-8 character, as the vast majority of calls to this
1637      * function will be for, has this expected length.  For efficiency, set
1638      * things up here to return it.  It will be overridden only in those rare
1639      * cases where a malformation is found */
1640     if (retlen) {
1641         *retlen = expectlen;
1642     }
1643
1644     /* A continuation character can't start a valid sequence */
1645     if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
1646         possible_problems |= UTF8_GOT_CONTINUATION;
1647         curlen = 1;
1648         uv = UNICODE_REPLACEMENT;
1649         goto ready_to_handle_errors;
1650     }
1651
1652     /* Here is not a continuation byte, nor an invariant.  The only thing left
1653      * is a start byte (possibly for an overlong).  (We can't use UTF8_IS_START
1654      * because it excludes start bytes like \xC0 that always lead to
1655      * overlongs.) */
1656
1657     /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
1658      * that indicate the number of bytes in the character's whole UTF-8
1659      * sequence, leaving just the bits that are part of the value.  */
1660     uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
1661
1662     /* Setup the loop end point, making sure to not look past the end of the
1663      * input string, and flag it as too short if the size isn't big enough. */
1664     if (UNLIKELY(curlen < expectlen)) {
1665         possible_problems |= UTF8_GOT_SHORT;
1666         avail_len = curlen;
1667     }
1668     else {
1669         send = (U8*) s0 + expectlen;
1670     }
1671
1672     /* Now, loop through the remaining bytes in the character's sequence,
1673      * accumulating each into the working value as we go. */
1674     for (s = s0 + 1; s < send; s++) {
1675         if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
1676             uv = UTF8_ACCUMULATE(uv, *s);
1677             continue;
1678         }
1679
1680         /* Here, found a non-continuation before processing all expected bytes.
1681          * This byte indicates the beginning of a new character, so quit, even
1682          * if allowing this malformation. */
1683         possible_problems |= UTF8_GOT_NON_CONTINUATION;
1684         break;
1685     } /* End of loop through the character's bytes */
1686
1687     /* Save how many bytes were actually in the character */
1688     curlen = s - s0;
1689
1690     /* Note that there are two types of too-short malformation.  One is when
1691      * there is actual wrong data before the normal termination of the
1692      * sequence.  The other is that the sequence wasn't complete before the end
1693      * of the data we are allowed to look at, based on the input 'curlen'.
1694      * This means that we were passed data for a partial character, but it is
1695      * valid as far as we saw.  The first type is definitely invalid.  This
1696      * distinction could be important to a caller, so the two types are kept
1697      * separate.
1698      *
1699      * A convenience macro that matches either of the too-short conditions.  */
1700 #   define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
1701
1702     if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
1703         uv_so_far = uv;
1704         uv = UNICODE_REPLACEMENT;
1705     }
1706
1707     /* Check for overflow.  The algorithm requires us to not look past the end
1708      * of the current character, even if partial, so the upper limit is 's' */
1709     if (UNLIKELY(0 < does_utf8_overflow(s0, s,
1710                                          1 /* Do consider overlongs */
1711                                         )))
1712     {
1713         possible_problems |= UTF8_GOT_OVERFLOW;
1714         uv = UNICODE_REPLACEMENT;
1715     }
1716
1717     /* Check for overlong.  If no problems so far, 'uv' is the correct code
1718      * point value.  Simply see if it is expressible in fewer bytes.  Otherwise
1719      * we must look at the UTF-8 byte sequence itself to see if it is for an
1720      * overlong */
1721     if (     (   LIKELY(! possible_problems)
1722               && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
1723         || (       UNLIKELY(possible_problems)
1724             && (   UNLIKELY(! UTF8_IS_START(*s0))
1725                 || (   curlen > 1
1726                     && UNLIKELY(0 < is_utf8_overlong_given_start_byte_ok(s0,
1727                                                                 s - s0))))))
1728     {
1729         possible_problems |= UTF8_GOT_LONG;
1730
1731         if (   UNLIKELY(   possible_problems & UTF8_GOT_TOO_SHORT)
1732
1733                           /* The calculation in the 'true' branch of this 'if'
1734                            * below won't work if overflows, and isn't needed
1735                            * anyway.  Further below we handle all overflow
1736                            * cases */
1737             &&   LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)))
1738         {
1739             UV min_uv = uv_so_far;
1740             STRLEN i;
1741
1742             /* Here, the input is both overlong and is missing some trailing
1743              * bytes.  There is no single code point it could be for, but there
1744              * may be enough information present to determine if what we have
1745              * so far is for an unallowed code point, such as for a surrogate.
1746              * The code further below has the intelligence to determine this,
1747              * but just for non-overlong UTF-8 sequences.  What we do here is
1748              * calculate the smallest code point the input could represent if
1749              * there were no too short malformation.  Then we compute and save
1750              * the UTF-8 for that, which is what the code below looks at
1751              * instead of the raw input.  It turns out that the smallest such
1752              * code point is all we need. */
1753             for (i = curlen; i < expectlen; i++) {
1754                 min_uv = UTF8_ACCUMULATE(min_uv,
1755                                      I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK));
1756             }
1757
1758             adjusted_s0 = temp_char_buf;
1759             (void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
1760         }
1761     }
1762
1763     /* Here, we have found all the possible problems, except for when the input
1764      * is for a problematic code point not allowed by the input parameters. */
1765
1766                                 /* uv is valid for overlongs */
1767     if (   (   (      LIKELY(! (possible_problems & ~UTF8_GOT_LONG))
1768
1769                       /* isn't problematic if < this */
1770                    && uv >= UNICODE_SURROGATE_FIRST)
1771             || (   UNLIKELY(possible_problems)
1772
1773                           /* if overflow, we know without looking further
1774                            * precisely which of the problematic types it is,
1775                            * and we deal with those in the overflow handling
1776                            * code */
1777                 && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))
1778                 && (   isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)
1779                     || UNLIKELY(isUTF8_PERL_EXTENDED(s0)))))
1780         && ((flags & ( UTF8_DISALLOW_NONCHAR
1781                       |UTF8_DISALLOW_SURROGATE
1782                       |UTF8_DISALLOW_SUPER
1783                       |UTF8_DISALLOW_PERL_EXTENDED
1784                       |UTF8_WARN_NONCHAR
1785                       |UTF8_WARN_SURROGATE
1786                       |UTF8_WARN_SUPER
1787                       |UTF8_WARN_PERL_EXTENDED))))
1788     {
1789         /* If there were no malformations, or the only malformation is an
1790          * overlong, 'uv' is valid */
1791         if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
1792             if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
1793                 possible_problems |= UTF8_GOT_SURROGATE;
1794             }
1795             else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
1796                 possible_problems |= UTF8_GOT_SUPER;
1797             }
1798             else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
1799                 possible_problems |= UTF8_GOT_NONCHAR;
1800             }
1801         }
1802         else {  /* Otherwise, need to look at the source UTF-8, possibly
1803                    adjusted to be non-overlong */
1804
1805             if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
1806                                 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
1807             {
1808                 possible_problems |= UTF8_GOT_SUPER;
1809             }
1810             else if (curlen > 1) {
1811                 if (UNLIKELY(IS_UTF8_2_BYTE_SUPER(
1812                                       NATIVE_UTF8_TO_I8(*adjusted_s0),
1813                                       NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
1814                 {
1815                     possible_problems |= UTF8_GOT_SUPER;
1816                 }
1817                 else if (UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(
1818                                       NATIVE_UTF8_TO_I8(*adjusted_s0),
1819                                       NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
1820                 {
1821                     possible_problems |= UTF8_GOT_SURROGATE;
1822                 }
1823             }
1824
1825             /* We need a complete well-formed UTF-8 character to discern
1826              * non-characters, so can't look for them here */
1827         }
1828     }
1829
1830   ready_to_handle_errors:
1831
1832     /* At this point:
1833      * curlen               contains the number of bytes in the sequence that
1834      *                      this call should advance the input by.
1835      * avail_len            gives the available number of bytes passed in, but
1836      *                      only if this is less than the expected number of
1837      *                      bytes, based on the code point's start byte.
1838      * possible_problems    is 0 if there weren't any problems; otherwise a bit
1839      *                      is set in it for each potential problem found.
1840      * uv                   contains the code point the input sequence
1841      *                      represents; or if there is a problem that prevents
1842      *                      a well-defined value from being computed, it is
1843      *                      some substitute value, typically the REPLACEMENT
1844      *                      CHARACTER.
1845      * s0                   points to the first byte of the character
1846      * s                    points to just after where we left off processing
1847      *                      the character
1848      * send                 points to just after where that character should
1849      *                      end, based on how many bytes the start byte tells
1850      *                      us should be in it, but no further than s0 +
1851      *                      avail_len
1852      */
1853
1854     if (UNLIKELY(possible_problems)) {
1855         bool disallowed = FALSE;
1856         const U32 orig_problems = possible_problems;
1857
1858         if (msgs) {
1859             *msgs = NULL;
1860         }
1861
1862         while (possible_problems) { /* Handle each possible problem */
1863             UV pack_warn = 0;
1864             char * message = NULL;
1865             U32 this_flag_bit = 0;
1866
1867             /* Each 'if' clause handles one problem.  They are ordered so that
1868              * the first ones' messages will be displayed before the later
1869              * ones; this is kinda in decreasing severity order.  But the
1870              * overlong must come last, as it changes 'uv' looked at by the
1871              * others */
1872             if (possible_problems & UTF8_GOT_OVERFLOW) {
1873
1874                 /* Overflow means also got a super and are using Perl's
1875                  * extended UTF-8, but we handle all three cases here */
1876                 possible_problems
1877                   &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_PERL_EXTENDED);
1878                 *errors |= UTF8_GOT_OVERFLOW;
1879
1880                 /* But the API says we flag all errors found */
1881                 if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) {
1882                     *errors |= UTF8_GOT_SUPER;
1883                 }
1884                 if (flags
1885                         & (UTF8_WARN_PERL_EXTENDED|UTF8_DISALLOW_PERL_EXTENDED))
1886                 {
1887                     *errors |= UTF8_GOT_PERL_EXTENDED;
1888                 }
1889
1890                 /* Disallow if any of the three categories say to */
1891                 if ( ! (flags &   UTF8_ALLOW_OVERFLOW)
1892                     || (flags & ( UTF8_DISALLOW_SUPER
1893                                  |UTF8_DISALLOW_PERL_EXTENDED)))
1894                 {
1895                     disallowed = TRUE;
1896                 }
1897
1898                 /* Likewise, warn if any say to */
1899                 if (  ! (flags & UTF8_ALLOW_OVERFLOW)
1900                     ||  (flags & (UTF8_WARN_SUPER|UTF8_WARN_PERL_EXTENDED)))
1901                 {
1902
1903                     /* The warnings code explicitly says it doesn't handle the
1904                      * case of packWARN2 and two categories which have
1905                      * parent-child relationship.  Even if it works now to
1906                      * raise the warning if either is enabled, it wouldn't
1907                      * necessarily do so in the future.  We output (only) the
1908                      * most dire warning */
1909                     if (! (flags & UTF8_CHECK_ONLY)) {
1910                         if (msgs || ckWARN_d(WARN_UTF8)) {
1911                             pack_warn = packWARN(WARN_UTF8);
1912                         }
1913                         else if (msgs || ckWARN_d(WARN_NON_UNICODE)) {
1914                             pack_warn = packWARN(WARN_NON_UNICODE);
1915                         }
1916                         if (pack_warn) {
1917                             message = Perl_form(aTHX_ "%s: %s (overflows)",
1918                                             malformed_text,
1919                                             _byte_dump_string(s0, curlen, 0));
1920                             this_flag_bit = UTF8_GOT_OVERFLOW;
1921                         }
1922                     }
1923                 }
1924             }
1925             else if (possible_problems & UTF8_GOT_EMPTY) {
1926                 possible_problems &= ~UTF8_GOT_EMPTY;
1927                 *errors |= UTF8_GOT_EMPTY;
1928
1929                 if (! (flags & UTF8_ALLOW_EMPTY)) {
1930
1931                     /* This so-called malformation is now treated as a bug in
1932                      * the caller.  If you have nothing to decode, skip calling
1933                      * this function */
1934                     assert(0);
1935
1936                     disallowed = TRUE;
1937                     if (  (msgs
1938                         || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1939                     {
1940                         pack_warn = packWARN(WARN_UTF8);
1941                         message = Perl_form(aTHX_ "%s (empty string)",
1942                                                    malformed_text);
1943                         this_flag_bit = UTF8_GOT_EMPTY;
1944                     }
1945                 }
1946             }
1947             else if (possible_problems & UTF8_GOT_CONTINUATION) {
1948                 possible_problems &= ~UTF8_GOT_CONTINUATION;
1949                 *errors |= UTF8_GOT_CONTINUATION;
1950
1951                 if (! (flags & UTF8_ALLOW_CONTINUATION)) {
1952                     disallowed = TRUE;
1953                     if ((   msgs
1954                          || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1955                     {
1956                         pack_warn = packWARN(WARN_UTF8);
1957                         message = Perl_form(aTHX_
1958                                 "%s: %s (unexpected continuation byte 0x%02x,"
1959                                 " with no preceding start byte)",
1960                                 malformed_text,
1961                                 _byte_dump_string(s0, 1, 0), *s0);
1962                         this_flag_bit = UTF8_GOT_CONTINUATION;
1963                     }
1964                 }
1965             }
1966             else if (possible_problems & UTF8_GOT_SHORT) {
1967                 possible_problems &= ~UTF8_GOT_SHORT;
1968                 *errors |= UTF8_GOT_SHORT;
1969
1970                 if (! (flags & UTF8_ALLOW_SHORT)) {
1971                     disallowed = TRUE;
1972                     if ((   msgs
1973                          || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1974                     {
1975                         pack_warn = packWARN(WARN_UTF8);
1976                         message = Perl_form(aTHX_
1977                              "%s: %s (too short; %d byte%s available, need %d)",
1978                              malformed_text,
1979                              _byte_dump_string(s0, send - s0, 0),
1980                              (int)avail_len,
1981                              avail_len == 1 ? "" : "s",
1982                              (int)expectlen);
1983                         this_flag_bit = UTF8_GOT_SHORT;
1984                     }
1985                 }
1986
1987             }
1988             else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
1989                 possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
1990                 *errors |= UTF8_GOT_NON_CONTINUATION;
1991
1992                 if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
1993                     disallowed = TRUE;
1994                     if ((   msgs
1995                          || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1996                     {
1997
1998                         /* If we don't know for sure that the input length is
1999                          * valid, avoid as much as possible reading past the
2000                          * end of the buffer */
2001                         int printlen = (flags & _UTF8_NO_CONFIDENCE_IN_CURLEN)
2002                                        ? s - s0
2003                                        : send - s0;
2004                         pack_warn = packWARN(WARN_UTF8);
2005                         message = Perl_form(aTHX_ "%s",
2006                             unexpected_non_continuation_text(s0,
2007                                                             printlen,
2008                                                             s - s0,
2009                                                             (int) expectlen));
2010                         this_flag_bit = UTF8_GOT_NON_CONTINUATION;
2011                     }
2012                 }
2013             }
2014             else if (possible_problems & UTF8_GOT_SURROGATE) {
2015                 possible_problems &= ~UTF8_GOT_SURROGATE;
2016
2017                 if (flags & UTF8_WARN_SURROGATE) {
2018                     *errors |= UTF8_GOT_SURROGATE;
2019
2020                     if (   ! (flags & UTF8_CHECK_ONLY)
2021                         && (msgs || ckWARN_d(WARN_SURROGATE)))
2022                     {
2023                         pack_warn = packWARN(WARN_SURROGATE);
2024
2025                         /* These are the only errors that can occur with a
2026                         * surrogate when the 'uv' isn't valid */
2027                         if (orig_problems & UTF8_GOT_TOO_SHORT) {
2028                             message = Perl_form(aTHX_
2029                                     "UTF-16 surrogate (any UTF-8 sequence that"
2030                                     " starts with \"%s\" is for a surrogate)",
2031                                     _byte_dump_string(s0, curlen, 0));
2032                         }
2033                         else {
2034                             message = Perl_form(aTHX_ surrogate_cp_format, uv);
2035                         }
2036                         this_flag_bit = UTF8_GOT_SURROGATE;
2037                     }
2038                 }
2039
2040                 if (flags & UTF8_DISALLOW_SURROGATE) {
2041                     disallowed = TRUE;
2042                     *errors |= UTF8_GOT_SURROGATE;
2043                 }
2044             }
2045             else if (possible_problems & UTF8_GOT_SUPER) {
2046                 possible_problems &= ~UTF8_GOT_SUPER;
2047
2048                 if (flags & UTF8_WARN_SUPER) {
2049                     *errors |= UTF8_GOT_SUPER;
2050
2051                     if (   ! (flags & UTF8_CHECK_ONLY)
2052                         && (msgs || ckWARN_d(WARN_NON_UNICODE)))
2053                     {
2054                         pack_warn = packWARN(WARN_NON_UNICODE);
2055
2056                         if (orig_problems & UTF8_GOT_TOO_SHORT) {
2057                             message = Perl_form(aTHX_
2058                                     "Any UTF-8 sequence that starts with"
2059                                     " \"%s\" is for a non-Unicode code point,"
2060                                     " may not be portable",
2061                                     _byte_dump_string(s0, curlen, 0));
2062                         }
2063                         else {
2064                             message = Perl_form(aTHX_ super_cp_format, uv);
2065                         }
2066                         this_flag_bit = UTF8_GOT_SUPER;
2067                     }
2068                 }
2069
2070                 /* Test for Perl's extended UTF-8 after the regular SUPER ones,
2071                  * and before possibly bailing out, so that the more dire
2072                  * warning will override the regular one. */
2073                 if (UNLIKELY(isUTF8_PERL_EXTENDED(s0))) {
2074                     if (  ! (flags & UTF8_CHECK_ONLY)
2075                         &&  (flags & (UTF8_WARN_PERL_EXTENDED|UTF8_WARN_SUPER))
2076                         &&  (msgs || ckWARN_d(WARN_NON_UNICODE)))
2077                     {
2078                         pack_warn = packWARN(WARN_NON_UNICODE);
2079
2080                         /* If it is an overlong that evaluates to a code point
2081                          * that doesn't have to use the Perl extended UTF-8, it
2082                          * still used it, and so we output a message that
2083                          * doesn't refer to the code point.  The same is true
2084                          * if there was a SHORT malformation where the code
2085                          * point is not valid.  In that case, 'uv' will have
2086                          * been set to the REPLACEMENT CHAR, and the message
2087                          * below without the code point in it will be selected
2088                          * */
2089                         if (UNICODE_IS_PERL_EXTENDED(uv)) {
2090                             message = Perl_form(aTHX_
2091                                             perl_extended_cp_format, uv);
2092                         }
2093                         else {
2094                             message = Perl_form(aTHX_
2095                                         "Any UTF-8 sequence that starts with"
2096                                         " \"%s\" is a Perl extension, and"
2097                                         " so is not portable",
2098                                         _byte_dump_string(s0, curlen, 0));
2099                         }
2100                         this_flag_bit = UTF8_GOT_PERL_EXTENDED;
2101                     }
2102
2103                     if (flags & ( UTF8_WARN_PERL_EXTENDED
2104                                  |UTF8_DISALLOW_PERL_EXTENDED))
2105                     {
2106                         *errors |= UTF8_GOT_PERL_EXTENDED;
2107
2108                         if (flags & UTF8_DISALLOW_PERL_EXTENDED) {
2109                             disallowed = TRUE;
2110                         }
2111                     }
2112                 }
2113
2114                 if (flags & UTF8_DISALLOW_SUPER) {
2115                     *errors |= UTF8_GOT_SUPER;
2116                     disallowed = TRUE;
2117                 }
2118             }
2119             else if (possible_problems & UTF8_GOT_NONCHAR) {
2120                 possible_problems &= ~UTF8_GOT_NONCHAR;
2121
2122                 if (flags & UTF8_WARN_NONCHAR) {
2123                     *errors |= UTF8_GOT_NONCHAR;
2124
2125                     if (  ! (flags & UTF8_CHECK_ONLY)
2126                         && (msgs || ckWARN_d(WARN_NONCHAR)))
2127                     {
2128                         /* The code above should have guaranteed that we don't
2129                          * get here with errors other than overlong */
2130                         assert (! (orig_problems
2131                                         & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR)));
2132
2133                         pack_warn = packWARN(WARN_NONCHAR);
2134                         message = Perl_form(aTHX_ nonchar_cp_format, uv);
2135                         this_flag_bit = UTF8_GOT_NONCHAR;
2136                     }
2137                 }
2138
2139                 if (flags & UTF8_DISALLOW_NONCHAR) {
2140                     disallowed = TRUE;
2141                     *errors |= UTF8_GOT_NONCHAR;
2142                 }
2143             }
2144             else if (possible_problems & UTF8_GOT_LONG) {
2145                 possible_problems &= ~UTF8_GOT_LONG;
2146                 *errors |= UTF8_GOT_LONG;
2147
2148                 if (flags & UTF8_ALLOW_LONG) {
2149
2150                     /* We don't allow the actual overlong value, unless the
2151                      * special extra bit is also set */
2152                     if (! (flags & (   UTF8_ALLOW_LONG_AND_ITS_VALUE
2153                                     & ~UTF8_ALLOW_LONG)))
2154                     {
2155                         uv = UNICODE_REPLACEMENT;
2156                     }
2157                 }
2158                 else {
2159                     disallowed = TRUE;
2160
2161                     if ((   msgs
2162                          || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
2163                     {
2164                         pack_warn = packWARN(WARN_UTF8);
2165
2166                         /* These error types cause 'uv' to be something that
2167                          * isn't what was intended, so can't use it in the
2168                          * message.  The other error types either can't
2169                          * generate an overlong, or else the 'uv' is valid */
2170                         if (orig_problems &
2171                                         (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
2172                         {
2173                             message = Perl_form(aTHX_
2174                                     "%s: %s (any UTF-8 sequence that starts"
2175                                     " with \"%s\" is overlong which can and"
2176                                     " should be represented with a"
2177                                     " different, shorter sequence)",
2178                                     malformed_text,
2179                                     _byte_dump_string(s0, send - s0, 0),
2180                                     _byte_dump_string(s0, curlen, 0));
2181                         }
2182                         else {
2183                             U8 tmpbuf[UTF8_MAXBYTES+1];
2184                             const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
2185                                                                         uv, 0);
2186                             /* Don't use U+ for non-Unicode code points, which
2187                              * includes those in the Latin1 range */
2188                             const char * preface = (    uv > PERL_UNICODE_MAX
2189 #ifdef EBCDIC
2190                                                      || uv <= 0xFF
2191 #endif
2192                                                     )
2193                                                    ? "0x"
2194                                                    : "U+";
2195                             message = Perl_form(aTHX_
2196                                 "%s: %s (overlong; instead use %s to represent"
2197                                 " %s%0*" UVXf ")",
2198                                 malformed_text,
2199                                 _byte_dump_string(s0, send - s0, 0),
2200                                 _byte_dump_string(tmpbuf, e - tmpbuf, 0),
2201                                 preface,
2202                                 ((uv < 256) ? 2 : 4), /* Field width of 2 for
2203                                                          small code points */
2204                                 UNI_TO_NATIVE(uv));
2205                         }
2206                         this_flag_bit = UTF8_GOT_LONG;
2207                     }
2208                 }
2209             } /* End of looking through the possible flags */
2210
2211             /* Display the message (if any) for the problem being handled in
2212              * this iteration of the loop */
2213             if (message) {
2214                 if (msgs) {
2215                     assert(this_flag_bit);
2216
2217                     if (*msgs == NULL) {
2218                         *msgs = newAV();
2219                     }
2220
2221                     av_push(*msgs, newRV_noinc((SV*) new_msg_hv(message,
2222                                                                 pack_warn,
2223                                                                 this_flag_bit)));
2224                 }
2225                 else if (PL_op)
2226                     Perl_warner(aTHX_ pack_warn, "%s in %s", message,
2227                                                  OP_DESC(PL_op));
2228                 else
2229                     Perl_warner(aTHX_ pack_warn, "%s", message);
2230             }
2231         }   /* End of 'while (possible_problems)' */
2232
2233         /* Since there was a possible problem, the returned length may need to
2234          * be changed from the one stored at the beginning of this function.
2235          * Instead of trying to figure out if that's needed, just do it. */
2236         if (retlen) {
2237             *retlen = curlen;
2238         }
2239
2240         if (disallowed) {
2241             if (flags & UTF8_CHECK_ONLY && retlen) {
2242                 *retlen = ((STRLEN) -1);
2243             }
2244             return 0;
2245         }
2246     }
2247
2248     return UNI_TO_NATIVE(uv);
2249 }
2250
2251 /*
2252 =for apidoc utf8_to_uvchr_buf
2253
2254 Returns the native code point of the first character in the string C<s> which
2255 is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
2256 C<*retlen> will be set to the length, in bytes, of that character.
2257
2258 If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
2259 enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
2260 C<NULL>) to -1.  If those warnings are off, the computed value, if well-defined
2261 (or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
2262 C<*retlen> is set (if C<retlen> isn't C<NULL>) so that (S<C<s> + C<*retlen>>) is
2263 the next possible position in C<s> that could begin a non-malformed character.
2264 See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
2265 returned.
2266
2267 =cut
2268
2269 Also implemented as a macro in utf8.h
2270
2271 */
2272
2273
2274 UV
2275 Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
2276 {
2277     PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
2278
2279     return utf8_to_uvchr_buf_helper(s, send, retlen);
2280 }
2281
2282 /* This is marked as deprecated
2283  *
2284 =for apidoc utf8_to_uvuni_buf
2285
2286 Only in very rare circumstances should code need to be dealing in Unicode
2287 (as opposed to native) code points.  In those few cases, use
2288 C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|perlapi/utf8_to_uvchr_buf>> instead.
2289 If you are not absolutely sure this is one of those cases, then assume it isn't
2290 and use plain C<utf8_to_uvchr_buf> instead.
2291
2292 Returns the Unicode (not-native) code point of the first character in the
2293 string C<s> which
2294 is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
2295 C<*retlen> will be set to the length, in bytes, of that character.
2296
2297 If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
2298 enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
2299 NULL) to -1.  If those warnings are off, the computed value if well-defined (or
2300 the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
2301 is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
2302 next possible position in C<s> that could begin a non-malformed character.
2303 See L<perlapi/utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
2304 returned.
2305
2306 =cut
2307 */
2308
2309 UV
2310 Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
2311 {
2312     PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
2313
2314     assert(send > s);
2315
2316     return NATIVE_TO_UNI(utf8_to_uvchr_buf(s, send, retlen));
2317 }
2318
2319 /*
2320 =for apidoc utf8_length
2321
2322 Returns the number of characters in the sequence of UTF-8-encoded bytes starting
2323 at C<s> and ending at the byte just before C<e>.  If C<s> and C<e> point to the
2324 same place, it returns 0 with no warning raised.
2325
2326 If C<e E<lt> s> or if the scan would end up past C<e>, it raises a UTF8 warning
2327 and returns the number of valid characters.
2328
2329 =cut
2330 */
2331
2332 STRLEN
2333 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
2334 {
2335     STRLEN len = 0;
2336
2337     PERL_ARGS_ASSERT_UTF8_LENGTH;
2338
2339     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
2340      * the bitops (especially ~) can create illegal UTF-8.
2341      * In other words: in Perl UTF-8 is not just for Unicode. */
2342
2343     if (UNLIKELY(e < s))
2344         goto warn_and_return;
2345     while (s < e) {
2346         s += UTF8SKIP(s);
2347         len++;
2348     }
2349
2350     if (UNLIKELY(e != s)) {
2351         len--;
2352         warn_and_return:
2353         if (PL_op)
2354             Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
2355                              "%s in %s", unees, OP_DESC(PL_op));
2356         else
2357             Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
2358     }
2359
2360     return len;
2361 }
2362
2363 /*
2364 =for apidoc bytes_cmp_utf8
2365
2366 Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
2367 sequence of characters (stored as UTF-8)
2368 in C<u>, C<ulen>.  Returns 0 if they are
2369 equal, -1 or -2 if the first string is less than the second string, +1 or +2
2370 if the first string is greater than the second string.
2371
2372 -1 or +1 is returned if the shorter string was identical to the start of the
2373 longer string.  -2 or +2 is returned if
2374 there was a difference between characters
2375 within the strings.
2376
2377 =cut
2378 */
2379
2380 int
2381 Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
2382 {
2383     const U8 *const bend = b + blen;
2384     const U8 *const uend = u + ulen;
2385
2386     PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
2387
2388     while (b < bend && u < uend) {
2389         U8 c = *u++;
2390         if (!UTF8_IS_INVARIANT(c)) {
2391             if (UTF8_IS_DOWNGRADEABLE_START(c)) {
2392                 if (u < uend) {
2393                     U8 c1 = *u++;
2394                     if (UTF8_IS_CONTINUATION(c1)) {
2395                         c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
2396                     } else {
2397                         /* diag_listed_as: Malformed UTF-8 character%s */
2398                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
2399                               "%s %s%s",
2400                               unexpected_non_continuation_text(u - 2, 2, 1, 2),
2401                               PL_op ? " in " : "",
2402                               PL_op ? OP_DESC(PL_op) : "");
2403                         return -2;
2404                     }
2405                 } else {
2406                     if (PL_op)
2407                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
2408                                          "%s in %s", unees, OP_DESC(PL_op));
2409                     else
2410                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
2411                     return -2; /* Really want to return undef :-)  */
2412                 }
2413             } else {
2414                 return -2;
2415             }
2416         }
2417         if (*b != c) {
2418             return *b < c ? -2 : +2;
2419         }
2420         ++b;
2421     }
2422
2423     if (b == bend && u == uend)
2424         return 0;
2425
2426     return b < bend ? +1 : -1;
2427 }
2428
2429 /*
2430 =for apidoc utf8_to_bytes
2431
2432 Converts a string C<"s"> of length C<*lenp> from UTF-8 into native byte encoding.
2433 Unlike L</bytes_to_utf8>, this over-writes the original string, and
2434 updates C<*lenp> to contain the new length.
2435 Returns zero on failure (leaving C<"s"> unchanged) setting C<*lenp> to -1.
2436
2437 Upon successful return, the number of variants in the string can be computed by
2438 having saved the value of C<*lenp> before the call, and subtracting the
2439 after-call value of C<*lenp> from it.
2440
2441 If you need a copy of the string, see L</bytes_from_utf8>.
2442
2443 =cut
2444 */
2445
2446 U8 *
2447 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp)
2448 {
2449     U8 * first_variant;
2450
2451     PERL_ARGS_ASSERT_UTF8_TO_BYTES;
2452     PERL_UNUSED_CONTEXT;
2453
2454     /* This is a no-op if no variants at all in the input */
2455     if (is_utf8_invariant_string_loc(s, *lenp, (const U8 **) &first_variant)) {
2456         return s;
2457     }
2458
2459     {
2460         U8 * const save = s;
2461         U8 * const send = s + *lenp;
2462         U8 * d;
2463
2464         /* Nothing before the first variant needs to be changed, so start the real
2465          * work there */
2466         s = first_variant;
2467         while (s < send) {
2468             if (! UTF8_IS_INVARIANT(*s)) {
2469                 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
2470                     *lenp = ((STRLEN) -1);
2471                     return 0;
2472                 }
2473                 s++;
2474             }
2475             s++;
2476         }
2477
2478         /* Is downgradable, so do it */
2479         d = s = first_variant;
2480         while (s < send) {
2481             U8 c = *s++;
2482             if (! UVCHR_IS_INVARIANT(c)) {
2483                 /* Then it is two-byte encoded */
2484                 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
2485                 s++;
2486             }
2487             *d++ = c;
2488         }
2489         *d = '\0';
2490         *lenp = d - save;
2491
2492         return save;
2493     }
2494 }
2495
2496 /*
2497 =for apidoc bytes_from_utf8
2498
2499 Converts a potentially UTF-8 encoded string C<s> of length C<*lenp> into native
2500 byte encoding.  On input, the boolean C<*is_utf8p> gives whether or not C<s> is
2501 actually encoded in UTF-8.
2502
2503 Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, this is non-destructive of
2504 the input string.
2505
2506 Do nothing if C<*is_utf8p> is 0, or if there are code points in the string
2507 not expressible in native byte encoding.  In these cases, C<*is_utf8p> and
2508 C<*lenp> are unchanged, and the return value is the original C<s>.
2509
2510 Otherwise, C<*is_utf8p> is set to 0, and the return value is a pointer to a
2511 newly created string containing a downgraded copy of C<s>, and whose length is
2512 returned in C<*lenp>, updated.  The new string is C<NUL>-terminated.  The
2513 caller is responsible for arranging for the memory used by this string to get
2514 freed.
2515
2516 Upon successful return, the number of variants in the string can be computed by
2517 having saved the value of C<*lenp> before the call, and subtracting the
2518 after-call value of C<*lenp> from it.
2519
2520 =cut
2521
2522 There is a macro that avoids this function call, but this is retained for
2523 anyone who calls it with the Perl_ prefix */
2524
2525 U8 *
2526 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p)
2527 {
2528     PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
2529     PERL_UNUSED_CONTEXT;
2530
2531     return bytes_from_utf8_loc(s, lenp, is_utf8p, NULL);
2532 }
2533
2534 /*
2535 No = here because currently externally undocumented
2536 for apidoc bytes_from_utf8_loc
2537
2538 Like C<L</bytes_from_utf8>()>, but takes an extra parameter, a pointer to where
2539 to store the location of the first character in C<"s"> that cannot be
2540 converted to non-UTF8.
2541
2542 If that parameter is C<NULL>, this function behaves identically to
2543 C<bytes_from_utf8>.
2544
2545 Otherwise if C<*is_utf8p> is 0 on input, the function behaves identically to
2546 C<bytes_from_utf8>, except it also sets C<*first_unconverted> to C<NULL>.
2547
2548 Otherwise, the function returns a newly created C<NUL>-terminated string
2549 containing the non-UTF8 equivalent of the convertible first portion of
2550 C<"s">.  C<*lenp> is set to its length, not including the terminating C<NUL>.
2551 If the entire input string was converted, C<*is_utf8p> is set to a FALSE value,
2552 and C<*first_unconverted> is set to C<NULL>.
2553
2554 Otherwise, C<*first_unconverted> is set to point to the first byte of the
2555 first character in the original string that wasn't converted.  C<*is_utf8p> is
2556 unchanged.  Note that the new string may have length 0.
2557
2558 Another way to look at it is, if C<first_unconverted> is non-C<NULL> and
2559 C<*is_utf8p> is TRUE, this function starts at the beginning of C<"s"> and
2560 converts as many characters in it as possible stopping at the first one it
2561 finds that can't be converted to non-UTF-8.  C<*first_unconverted> is
2562 set to point to that.  The function returns the portion that could be converted
2563 in a newly created C<NUL>-terminated string, and C<*lenp> is set to its length,
2564 not including the terminating C<NUL>.  If the very first character in the
2565 original could not be converted, C<*lenp> will be 0, and the new string will
2566 contain just a single C<NUL>.  If the entire input string was converted,
2567 C<*is_utf8p> is set to FALSE and C<*first_unconverted> is set to C<NULL>.
2568
2569 Upon successful return, the number of variants in the converted portion of the
2570 string can be computed by having saved the value of C<*lenp> before the call,
2571 and subtracting the after-call value of C<*lenp> from it.
2572
2573 =cut
2574
2575
2576 */
2577
2578 U8 *
2579 Perl_bytes_from_utf8_loc(const U8 *s, STRLEN *lenp, bool *is_utf8p, const U8** first_unconverted)
2580 {
2581     U8 *d;
2582     const U8 *original = s;
2583     U8 *converted_start;
2584     const U8 *send = s + *lenp;
2585
2586     PERL_ARGS_ASSERT_BYTES_FROM_UTF8_LOC;
2587
2588     if (! *is_utf8p) {
2589         if (first_unconverted) {
2590             *first_unconverted = NULL;
2591         }
2592
2593         return (U8 *) original;
2594     }
2595
2596     Newx(d, (*lenp) + 1, U8);
2597
2598     converted_start = d;
2599     while (s < send) {
2600         U8 c = *s++;
2601         if (! UTF8_IS_INVARIANT(c)) {
2602
2603             /* Then it is multi-byte encoded.  If the code point is above 0xFF,
2604              * have to stop now */
2605             if (UNLIKELY (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s - 1, send))) {
2606                 if (first_unconverted) {
2607                     *first_unconverted = s - 1;
2608                     goto finish_and_return;
2609                 }
2610                 else {
2611                     Safefree(converted_start);
2612                     return (U8 *) original;
2613                 }
2614             }
2615
2616             c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
2617             s++;
2618         }
2619         *d++ = c;
2620     }
2621
2622     /* Here, converted the whole of the input */
2623     *is_utf8p = FALSE;
2624     if (first_unconverted) {
2625         *first_unconverted = NULL;
2626     }
2627
2628   finish_and_return:
2629     *d = '\0';
2630     *lenp = d - converted_start;
2631
2632     /* Trim unused space */
2633     Renew(converted_start, *lenp + 1, U8);
2634
2635     return converted_start;
2636 }
2637
2638 /*
2639 =for apidoc bytes_to_utf8
2640
2641 Converts a string C<s> of length C<*lenp> bytes from the native encoding into
2642 UTF-8.
2643 Returns a pointer to the newly-created string, and sets C<*lenp> to
2644 reflect the new length in bytes.  The caller is responsible for arranging for
2645 the memory used by this string to get freed.
2646
2647 Upon successful return, the number of variants in the string can be computed by
2648 having saved the value of C<*lenp> before the call, and subtracting it from the
2649 after-call value of C<*lenp>.
2650
2651 A C<NUL> character will be written after the end of the string.
2652
2653 If you want to convert to UTF-8 from encodings other than
2654 the native (Latin1 or EBCDIC),
2655 see L</sv_recode_to_utf8>().
2656
2657 =cut
2658 */
2659
2660 U8*
2661 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
2662 {
2663     const U8 * const send = s + (*lenp);
2664     U8 *d;
2665     U8 *dst;
2666
2667     PERL_ARGS_ASSERT_BYTES_TO_UTF8;
2668     PERL_UNUSED_CONTEXT;
2669
2670     /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
2671     Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8);
2672     dst = d;
2673
2674     while (s < send) {
2675         append_utf8_from_native_byte(*s, &d);
2676         s++;
2677     }
2678
2679     *d = '\0';
2680     *lenp = d-dst;
2681
2682     return dst;
2683 }
2684
2685 /*
2686  * Convert native (big-endian) UTF-16 to UTF-8.  For reversed (little-endian),
2687  * use utf16_to_utf8_reversed().
2688  *
2689  * UTF-16 requires 2 bytes for every code point below 0x10000; otherwise 4 bytes.
2690  * UTF-8 requires 1-3 bytes for every code point below 0x10000; otherwise 4 bytes.
2691  * UTF-EBCDIC requires 1-4 bytes for every code point below 0x10000; otherwise 4-5 bytes.
2692  *
2693  * These functions don't check for overflow.  The worst case is every code
2694  * point in the input is 2 bytes, and requires 4 bytes on output.  (If the code
2695  * is never going to run in EBCDIC, it is 2 bytes requiring 3 on output.)  Therefore the
2696  * destination must be pre-extended to 2 times the source length.
2697  *
2698  * Do not use in-place.  We optimize for native, for obvious reasons. */
2699
2700 U8*
2701 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
2702 {
2703     U8* pend;
2704     U8* dstart = d;
2705
2706     PERL_ARGS_ASSERT_UTF16_TO_UTF8;
2707
2708     if (bytelen & 1)
2709         Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %" UVuf,
2710                                                                (UV)bytelen);
2711
2712     pend = p + bytelen;
2713
2714     while (p < pend) {
2715         UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
2716         p += 2;
2717         if (OFFUNI_IS_INVARIANT(uv)) {
2718             *d++ = LATIN1_TO_NATIVE((U8) uv);
2719             continue;
2720         }
2721         if (uv <= MAX_UTF8_TWO_BYTE) {
2722             *d++ = UTF8_TWO_BYTE_HI(UNI_TO_NATIVE(uv));
2723             *d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv));
2724             continue;
2725         }
2726
2727 #define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
2728 #define LAST_HIGH_SURROGATE  0xDBFF
2729 #define FIRST_LOW_SURROGATE  0xDC00
2730 #define LAST_LOW_SURROGATE   UNICODE_SURROGATE_LAST
2731 #define FIRST_IN_PLANE1      0x10000
2732
2733         /* This assumes that most uses will be in the first Unicode plane, not
2734          * needing surrogates */
2735         if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST
2736                   && uv <= UNICODE_SURROGATE_LAST))
2737         {
2738             if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
2739                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
2740             }
2741             else {
2742                 UV low = (p[0] << 8) + p[1];
2743                 if (   UNLIKELY(low < FIRST_LOW_SURROGATE)
2744                     || UNLIKELY(low > LAST_LOW_SURROGATE))
2745                 {
2746                     Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
2747                 }
2748                 p += 2;
2749                 uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
2750                                 + (low - FIRST_LOW_SURROGATE) + FIRST_IN_PLANE1;
2751             }
2752         }
2753 #ifdef EBCDIC
2754         d = uvoffuni_to_utf8_flags(d, uv, 0);
2755 #else
2756         if (uv < FIRST_IN_PLANE1) {
2757             *d++ = (U8)(( uv >> 12)         | 0xe0);
2758             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
2759             *d++ = (U8)(( uv        & 0x3f) | 0x80);
2760             continue;
2761         }
2762         else {
2763             *d++ = (U8)(( uv >> 18)         | 0xf0);
2764             *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
2765             *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
2766             *d++ = (U8)(( uv        & 0x3f) | 0x80);
2767             continue;
2768         }
2769 #endif
2770     }
2771     *newlen = d - dstart;
2772     return d;
2773 }
2774
2775 /* Note: this one is slightly destructive of the source. */
2776
2777 U8*
2778 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
2779 {
2780     U8* s = (U8*)p;
2781     U8* const send = s + bytelen;
2782
2783     PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
2784
2785     if (bytelen & 1)
2786         Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %" UVuf,
2787                    (UV)bytelen);
2788
2789     while (s < send) {
2790         const U8 tmp = s[0];
2791         s[0] = s[1];
2792         s[1] = tmp;
2793         s += 2;
2794     }
2795     return utf16_to_utf8(p, d, bytelen, newlen);
2796 }
2797
2798 bool
2799 Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
2800 {
2801     dVAR;
2802     return _invlist_contains_cp(PL_XPosix_ptrs[classnum], c);
2803 }
2804
2805 bool
2806 Perl__is_uni_perl_idcont(pTHX_ UV c)
2807 {
2808     dVAR;
2809     return _invlist_contains_cp(PL_utf8_perl_idcont, c);
2810 }
2811
2812 bool
2813 Perl__is_uni_perl_idstart(pTHX_ UV c)
2814 {
2815     dVAR;
2816     return _invlist_contains_cp(PL_utf8_perl_idstart, c);
2817 }
2818
2819 UV
2820 Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp,
2821                                   const char S_or_s)
2822 {
2823     /* We have the latin1-range values compiled into the core, so just use
2824      * those, converting the result to UTF-8.  The only difference between upper
2825      * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
2826      * either "SS" or "Ss".  Which one to use is passed into the routine in
2827      * 'S_or_s' to avoid a test */
2828
2829     UV converted = toUPPER_LATIN1_MOD(c);
2830
2831     PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
2832
2833     assert(S_or_s == 'S' || S_or_s == 's');
2834
2835     if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
2836                                              characters in this range */
2837         *p = (U8) converted;
2838         *lenp = 1;
2839         return converted;
2840     }
2841
2842     /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
2843      * which it maps to one of them, so as to only have to have one check for
2844      * it in the main case */
2845     if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
2846         switch (c) {
2847             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
2848                 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
2849                 break;
2850             case MICRO_SIGN:
2851                 converted = GREEK_CAPITAL_LETTER_MU;
2852                 break;
2853 #if    UNICODE_MAJOR_VERSION > 2                                        \
2854    || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1           \
2855                                   && UNICODE_DOT_DOT_VERSION >= 8)
2856             case LATIN_SMALL_LETTER_SHARP_S:
2857                 *(p)++ = 'S';
2858                 *p = S_or_s;
2859                 *lenp = 2;
2860                 return 'S';
2861 #endif
2862             default:
2863                 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect"
2864                                  " '%c' to map to '%c'",
2865                                  c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
2866                 NOT_REACHED; /* NOTREACHED */
2867         }
2868     }
2869
2870     *(p)++ = UTF8_TWO_BYTE_HI(converted);
2871     *p = UTF8_TWO_BYTE_LO(converted);
2872     *lenp = 2;
2873
2874     return converted;
2875 }
2876
/* If compiled on an early Unicode version, there may not be auxiliary tables.
 * For each case-mapping kind (UC, TC, LC, CF) whose HAS_xx_AUX_TABLES symbol
 * is not defined, define the table-pointer and table-length names as NULL so
 * that code naming them still compiles. */
#ifndef HAS_UC_AUX_TABLES
#  define UC_AUX_TABLE_ptrs     NULL
#  define UC_AUX_TABLE_lengths  NULL
#endif
#ifndef HAS_TC_AUX_TABLES
#  define TC_AUX_TABLE_ptrs     NULL
#  define TC_AUX_TABLE_lengths  NULL
#endif
#ifndef HAS_LC_AUX_TABLES
#  define LC_AUX_TABLE_ptrs     NULL
#  define LC_AUX_TABLE_lengths  NULL
#endif
#ifndef HAS_CF_AUX_TABLES
#  define CF_AUX_TABLE_ptrs     NULL
#  define CF_AUX_TABLE_lengths  NULL
#endif
2899
/* Convenience wrappers around _to_utf8_case(), one per case-change kind.
 * Each supplies the inversion list, inversion map, auxiliary tables and
 * descriptive name that belong to its kind, so that callers only pass:
 *  'uv'    forwarded unchanged as the first argument of _to_utf8_case()
 *  's'     a pointer to the first byte of the input character
 *  'd'     where the string of changed characters is stored; it needs to
 *          have space for UTF8_MAXBYTES_CASE+1 bytes
 *  'lenp'  set to the length in bytes of the string of changed characters
 *
 * Note that there may be more than one character in the result.  The value
 * returned is the ordinal of the first character in the string in 'd'. */
#define CALL_UPPER_CASE(uv, s, d, lenp)                                     \
    _to_utf8_case(uv, s, d, lenp,                                           \
                  PL_utf8_toupper, Uppercase_Mapping_invmap,                \
                  UC_AUX_TABLE_ptrs, UC_AUX_TABLE_lengths,                  \
                  "uppercase")

#define CALL_TITLE_CASE(uv, s, d, lenp)                                     \
    _to_utf8_case(uv, s, d, lenp,                                           \
                  PL_utf8_totitle, Titlecase_Mapping_invmap,                \
                  TC_AUX_TABLE_ptrs, TC_AUX_TABLE_lengths,                  \
                  "titlecase")

#define CALL_LOWER_CASE(uv, s, d, lenp)                                     \
    _to_utf8_case(uv, s, d, lenp,                                           \
                  PL_utf8_tolower, Lowercase_Mapping_invmap,                \
                  LC_AUX_TABLE_ptrs, LC_AUX_TABLE_lengths,                  \
                  "lowercase")
2927
2928
/* Like the CALL_xxx_CASE macros, but for case folding.  This additionally has
 * the input parameter 'specials': when non-zero, full case folding is done,
 * using the full fold tables (including the auxiliary tables for folds that
 * expand to more than one character); when zero, simple case folding is done,
 * for which no auxiliary tables are passed.
 *
 * The whole expansion is parenthesized so that the conditional expression
 * stays intact no matter what expression the macro is embedded in. */
#define CALL_FOLD_CASE(uv, s, d, lenp, specials)                            \
    ((specials)                                                             \
     ? _to_utf8_case(uv, s, d, lenp, PL_utf8_tofold,                        \
                                     Case_Folding_invmap,                   \
                                     CF_AUX_TABLE_ptrs,                     \
                                     CF_AUX_TABLE_lengths,                  \
                                     "foldcase")                            \
     : _to_utf8_case(uv, s, d, lenp, PL_utf8_tosimplefold,                  \
                                     Simple_Case_Folding_invmap,            \
                                     NULL, NULL,                            \
                                     "foldcase"))
2943
2944 UV
2945 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
2946 {
2947     /* Convert the Unicode character whose ordinal is <c> to its uppercase
2948      * version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
2949      * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
2950      * the changed version may be longer than the original character.
2951      *
2952      * The ordinal of the first character of the changed version is returned
2953      * (but note, as explained above, that there may be more.) */
2954
2955     dVAR;
2956     PERL_ARGS_ASSERT_TO_UNI_UPPER;
2957
2958     if (c < 256) {
2959         return _to_upper_title_latin1((U8) c, p, lenp, 'S');
2960     }
2961
2962     return CALL_UPPER_CASE(c, NULL, p, lenp);
2963 }
2964
2965 UV
2966 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
2967 {
2968     dVAR;
2969     PERL_ARGS_ASSERT_TO_UNI_TITLE;
2970
2971     if (c < 256) {
2972         return _to_upper_title_latin1((U8) c, p, lenp, 's');
2973     }
2974
2975     return CALL_TITLE_CASE(c, NULL, p, lenp);
2976 }
2977
2978 STATIC U8
2979 S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp, const char dummy)
2980 {
2981     /* We have the latin1-range values compiled into the core, so just use
2982      * those, converting the result to UTF-8.  Since the result is always just
2983      * one character, we allow <p> to be NULL */
2984
2985     U8 converted = toLOWER_LATIN1(c);
2986
2987     PERL_UNUSED_ARG(dummy);
2988
2989     if (p != NULL) {
2990         if (NATIVE_BYTE_IS_INVARIANT(converted)) {
2991             *p = converted;
2992             *lenp = 1;
2993         }
2994         else {
2995             /* Result is known to always be < 256, so can use the EIGHT_BIT
2996              * macros */
2997             *p = UTF8_EIGHT_BIT_HI(converted);
2998             *(p+1) = UTF8_EIGHT_BIT_LO(converted);
2999             *lenp = 2;
3000         }
3001     }
3002     return converted;
3003 }
3004
3005 UV
3006 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
3007 {
3008     dVAR;
3009     PERL_ARGS_ASSERT_TO_UNI_LOWER;
3010
3011     if (c < 256) {
3012         return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ );
3013     }
3014
3015     return CALL_LOWER_CASE(c, NULL, p, lenp);
3016 }
3017
3018 UV
3019 Perl__to_fold_latin1(const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
3020 {
3021     /* Corresponds to to_lower_latin1(); <flags> bits meanings:
3022      *      FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
3023      *      FOLD_FLAGS_FULL  iff full folding is to be used;
3024      *
3025      *  Not to be used for locale folds
3026      */
3027
3028     UV converted;
3029
3030     PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
3031
3032     assert (! (flags & FOLD_FLAGS_LOCALE));
3033
3034     if (UNLIKELY(c == MICRO_SIGN)) {
3035         converted = GREEK_SMALL_LETTER_MU;
3036     }
3037 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
3038    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
3039                                       || UNICODE_DOT_DOT_VERSION > 0)
3040     else if (   (flags & FOLD_FLAGS_FULL)
3041              && UNLIKELY(c == LATIN_SMALL_LETTER_SHARP_S))
3042     {
3043         /* If can't cross 127/128 boundary, can't return "ss"; instead return
3044          * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
3045          * under those circumstances. */
3046         if (flags & FOLD_FLAGS_NOMIX_ASCII) {
3047             *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
3048             Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
3049                  p, *lenp, U8);
3050             return LATIN_SMALL_LETTER_LONG_S;
3051         }
3052         else {
3053             *(p)++ = 's';
3054             *p = 's';
3055             *lenp = 2;
3056             return 's';
3057         }
3058     }
3059 #endif
3060     else { /* In this range the fold of all other characters is their lower
3061               case */
3062         converted = toLOWER_LATIN1(c);
3063     }
3064
3065     if (UVCHR_IS_INVARIANT(converted)) {
3066         *p = (U8) converted;
3067         *lenp = 1;
3068     }
3069     else {
3070         *(p)++ = UTF8_TWO_BYTE_HI(converted);
3071         *p = UTF8_TWO_BYTE_LO(converted);
3072         *lenp = 2;
3073     }
3074
3075     return converted;
3076 }
3077
3078 UV
3079 Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
3080 {
3081
3082     /* Not currently externally documented, and subject to change
3083      *  <flags> bits meanings:
3084      *      FOLD_FLAGS_FULL  iff full folding is to be used;
3085      *      FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
3086      *                        locale are to be used.
3087      *      FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
3088      */
3089
3090     dVAR;
3091     PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
3092
3093     if (flags & FOLD_FLAGS_LOCALE) {
3094         /* Treat a non-Turkic UTF-8 locale as not being in locale at all,
3095          * except for potentially warning */
3096         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
3097         if (IN_UTF8_CTYPE_LOCALE && ! PL_in_utf8_turkic_locale) {
3098             flags &= ~FOLD_FLAGS_LOCALE;
3099         }
3100         else {
3101             goto needs_full_generality;
3102         }
3103     }
3104
3105     if (c < 256) {
3106         return _to_fold_latin1((U8) c, p, lenp,
3107                             flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
3108     }
3109
3110     /* Here, above 255.  If no special needs, just use the macro */
3111     if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
3112         return CALL_FOLD_CASE(c, NULL, p, lenp, flags & FOLD_FLAGS_FULL);
3113     }
3114     else {  /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with
3115                the special flags. */
3116         U8 utf8_c[UTF8_MAXBYTES + 1];
3117
3118       needs_full_generality:
3119         uvchr_to_utf8(utf8_c, c);
3120         return _toFOLD_utf8_flags(utf8_c, utf8_c + sizeof(utf8_c),
3121                                   p, lenp, flags);
3122     }
3123 }
3124
3125 PERL_STATIC_INLINE bool
3126 S_is_utf8_common(pTHX_ const U8 *const p, const U8 * const e,
3127                        SV* const invlist)
3128 {
3129     /* returns a boolean giving whether or not the UTF8-encoded character that
3130      * starts at <p>, and extending no further than <e - 1> is in the inversion
3131      * list <invlist>. */
3132
3133     UV cp = utf8n_to_uvchr(p, e - p, NULL, 0);
3134
3135     PERL_ARGS_ASSERT_IS_UTF8_COMMON;
3136
3137     if (cp == 0 && (p >= e || *p != '\0')) {
3138         _force_out_malformed_utf8_message(p, e, 0, 1);
3139         NOT_REACHED; /* NOTREACHED */
3140     }
3141
3142     assert(invlist);
3143     return _invlist_contains_cp(invlist, cp);
3144 }
3145
#if 0   /* Not currently used, but may be needed in the future */
PERLVAR(I, seen_deprecated_macro, HV *)

STATIC void
S_warn_on_first_deprecated_use(pTHX_ const char * const name,
                                     const char * const alternative,
                                     const bool use_locale,
                                     const char * const file,
                                     const unsigned line)
{
    /* If 'deprecated' warnings are enabled, raises one saying that the
     * function <name>, called from <file> at <line>, is going away or
     * changing, and that <alternative> should be used instead.  Each distinct
     * combination of <name>, <use_locale>, <file>, and <line> is remembered in
     * the hash PL_seen_deprecated_macro, so it is warned about only the first
     * time it is seen. */

    const char * key;

    PERL_ARGS_ASSERT_WARN_ON_FIRST_DEPRECATED_USE;

    if (! ckWARN_d(WARN_DEPRECATED)) {
        return;
    }

    /* Make sure the hash exists before looking anything up in it */
    if (! PL_seen_deprecated_macro) {
        PL_seen_deprecated_macro = newHV();
    }

    /* <line> is unsigned, so is formatted with %u */
    key = Perl_form(aTHX_ "%s;%d;%s;%u", name, use_locale, file, line);
    if (hv_fetch(PL_seen_deprecated_macro, key, strlen(key), 0)) {
        return;     /* Already warned about this one */
    }

    if (! hv_store(PL_seen_deprecated_macro, key,
                   strlen(key), &PL_sv_undef, 0))
    {
        Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
    }

    /* Callers in mathoms.c get different wording from all others */
    if (instr(file, "mathoms.c")) {
        Perl_warner(aTHX_ WARN_DEPRECATED,
                    "In %s, line %u, starting in Perl v5.32, %s()"
                    " will be removed.  Avoid this message by"
                    " converting to use %s().\n",
                    file, line, name, alternative);
    }
    else {
        Perl_warner(aTHX_ WARN_DEPRECATED,
                    "In %s, line %u, starting in Perl v5.32, %s() will"
                    " require an additional parameter.  Avoid this"
                    " message by converting to use %s().\n",
                    file, line, name, alternative);
    }
}
#endif
3191
3192 bool
3193 Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p, const U8 * const e)
3194 {
3195     dVAR;
3196     PERL_ARGS_ASSERT__IS_UTF8_FOO;
3197
3198     return is_utf8_common(p, e, PL_XPosix_ptrs[classnum]);
3199 }
3200
3201 bool
3202 Perl__is_utf8_perl_idstart(pTHX_ const U8 *p, const U8 * const e)
3203 {
3204     dVAR;
3205     PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
3206
3207     return is_utf8_common(p, e, PL_utf8_perl_idstart);
3208 }
3209
3210 bool
3211 Perl__is_utf8_perl_idcont(pTHX_ const U8 *p, const U8 * const e)
3212 {
3213     dVAR;
3214     PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT;
3215
3216     return is_utf8_common(p, e, PL_utf8_perl_idcont);
3217 }
3218
3219 STATIC UV
3220 S__to_utf8_case(pTHX_ const UV uv1, const U8 *p,
3221                       U8* ustrp, STRLEN *lenp,
3222                       SV *invlist, const int * const invmap,
3223                       const unsigned int * const * const aux_tables,
3224                       const U8 * const aux_table_lengths,
3225                       const char * const normal)
3226 {
3227     STRLEN len = 0;
3228
3229     /* Change the case of code point 'uv1' whose UTF-8 representation (assumed
3230      * by this routine to be valid) begins at 'p'.  'normal' is a string to use
3231      * to name the new case in any generated messages, as a fallback if the
3232      * operation being used is not available.  The new case is given by the
3233      * data structures in the remaining arguments.
3234      *
3235      * On return 'ustrp' points to '*lenp' UTF-8 encoded bytes representing the
3236      * entire changed case string, and the return value is the first code point
3237      * in that string */
3238
3239     PERL_ARGS_ASSERT__TO_UTF8_CASE;
3240
3241     /* For code points that don't change case, we already know that the output
3242      * of this function is the unchanged input, so we can skip doing look-ups
3243      * for them.  Unfortunately the case-changing code points are scattered
3244      * around.  But there are some long consecutive ranges where there are no
3245      * case changing code points.  By adding tests, we can eliminate the lookup
3246      * for all the ones in such ranges.  This is currently done here only for
3247      * just a few cases where the scripts are in common use in modern commerce
3248      * (and scripts adjacent to those which can be included without additional
3249      * tests). */
3250
3251     if (uv1 >= 0x0590) {
3252         /* This keeps from needing further processing the code points most
3253          * likely to be used in the following non-cased scripts: Hebrew,
3254          * Arabic, Syriac, Thaana, NKo, Samaritan, Mandaic, Devanagari,
3255          * Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada,
3256          * Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar */
3257         if (uv1 < 0x10A0) {
3258             goto cases_to_self;
3259         }
3260
3261         /* The following largish code point ranges also don't have case
3262          * changes, but khw didn't think they warranted extra tests to speed
3263          * them up (which would slightly slow down everything else above them):
3264          * 1100..139F   Hangul Jamo, Ethiopic
3265          * 1400..1CFF   Unified Canadian Aboriginal Syllabics, Ogham, Runic,
3266          *              Tagalog, Hanunoo, Buhid, Tagbanwa, Khmer, Mongolian,
3267          *              Limbu, Tai Le, New Tai Lue, Buginese, Tai Tham,
3268          *              Combining Diacritical Marks Extended, Balinese,
3269          *              Sundanese, Batak, Lepcha, Ol Chiki
3270          * 2000..206F   General Punctuation
3271          */
3272
3273         if (uv1 >= 0x2D30) {
3274
3275             /* This keeps the from needing further processing the code points
3276              * most likely to be used in the following non-cased major scripts:
3277              * CJK, Katakana, Hiragana, plus some less-likely scripts.
3278              *
3279              * (0x2D30 above might have to be changed to 2F00 in the unlikely
3280              * event that Unicode eventually allocates the unused block as of
3281              * v8.0 2FE0..2FEF to code points that are cased.  khw has verified
3282              * that the test suite will start having failures to alert you
3283              * should that happen) */
3284             if (uv1 < 0xA640) {
3285                 goto cases_to_self;
3286             }
3287
3288             if (uv1 >= 0xAC00) {
3289                 if (UNLIKELY(UNICODE_IS_SURROGATE(uv1))) {
3290                     if (ckWARN_d(WARN_SURROGATE)) {
3291                         const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
3292                         Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
3293                             "Operation \"%s\" returns its argument for"
3294                             " UTF-16 surrogate U+%04" UVXf, desc, uv1);
3295                     }
3296                     goto cases_to_self;
3297                 }
3298
3299                 /* AC00..FAFF Catches Hangul syllables and private use, plus
3300                  * some others */
3301                 if (uv1 < 0xFB00) {
3302                     goto cases_to_self;
3303                 }
3304
3305                 if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
3306                     if (UNLIKELY(uv1 > MAX_LEGAL_CP)) {
3307                         Perl_croak(aTHX_ cp_above_legal_max, uv1,
3308                                          MAX_LEGAL_CP);
3309                     }
3310                     if (ckWARN_d(WARN_NON_UNICODE)) {
3311                         const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
3312                         Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
3313                             "Operation \"%s\" returns its argument for"
3314                             " non-Unicode code point 0x%04" UVXf, desc, uv1);
3315                     }
3316                     goto cases_to_self;
3317                 }
3318 #ifdef HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C
3319                 if (UNLIKELY(uv1
3320                     > HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C))
3321                 {
3322
3323                     goto cases_to_self;
3324                 }
3325 #endif
3326             }
3327         }
3328
3329         /* Note that non-characters are perfectly legal, so no warning should
3330          * be given. */
3331     }
3332
3333     {
3334         unsigned int i;
3335         const unsigned int * cp_list;
3336         U8 * d;
3337
3338         /* 'index' is guaranteed to be non-negative, as this is an inversion
3339          * map that covers all possible inputs.  See [perl #133365] */
3340         SSize_t index = _invlist_search(invlist, uv1);
3341         IV base = invmap[index];
3342
3343         /* The data structures are set up so that if 'base' is non-negative,
3344          * the case change is 1-to-1; and if 0, the change is to itself */
3345         if (base >= 0) {
3346             IV lc;
3347
3348             if (base == 0) {
3349                 goto cases_to_self;
3350             }
3351
3352             /* This computes, e.g. lc(H) as 'H - A + a', using the lc table */
3353             lc = base + uv1 - invlist_array(invlist)[index];
3354             *lenp = uvchr_to_utf8(ustrp, lc) - ustrp;
3355             return lc;
3356         }
3357
3358         /* Here 'base' is negative.  That means the mapping is 1-to-many, and
3359          * requires an auxiliary table look up.  abs(base) gives the index into
3360          * a list of such tables which points to the proper aux table.  And a
3361          * parallel list gives the length of each corresponding aux table. */
3362         cp_list = aux_tables[-base];
3363
3364         /* Create the string of UTF-8 from the mapped-to code points */
3365         d = ustrp;
3366         for (i = 0; i < aux_table_lengths[-base]; i++) {
3367             d = uvchr_to_utf8(d, cp_list[i]);
3368         }
3369         *d = '\0';
3370         *lenp = d - ustrp;
3371
3372         return cp_list[0];
3373     }
3374
3375     /* Here, there was no mapping defined, which means that the code point maps
3376      * to itself.  Return the inputs */
3377   cases_to_self:
3378     if (p) {
3379         len = UTF8SKIP(p);
3380         if (p != ustrp) {   /* Don't copy onto itself */
3381             Copy(p, ustrp, len, U8);
3382         }
3383         *lenp = len;
3384     }
3385     else {
3386         *lenp = uvchr_to_utf8(ustrp, uv1) - ustrp;
3387     }
3388
3389     return uv1;
3390
3391 }
3392
3393 Size_t
3394 Perl__inverse_folds(pTHX_ const UV cp, unsigned int * first_folds_to,
3395                           const unsigned int ** remaining_folds_to)
3396 {
3397     /* Returns the count of the number of code points that fold to the input
3398      * 'cp' (besides itself).
3399      *
3400      * If the return is 0, there is nothing else that folds to it, and
3401      * '*first_folds_to' is set to 0, and '*remaining_folds_to' is set to NULL.
3402      *
3403      * If the return is 1, '*first_folds_to' is set to the single code point,
3404      * and '*remaining_folds_to' is set to NULL.
3405      *
3406      * Otherwise, '*first_folds_to' is set to a code point, and
3407      * '*remaining_fold_to' is set to an array that contains the others.  The
3408      * length of this array is the returned count minus 1.
3409      *
3410      * The reason for this convolution is to avoid having to deal with
3411      * allocating and freeing memory.  The lists are already constructed, so
3412      * the return can point to them, but single code points aren't, so would
3413      * need to be constructed if we didn't employ something like this API */
3414
3415     dVAR;
3416     /* 'index' is guaranteed to be non-negative, as this is an inversion map
3417      * that covers all possible inputs.  See [perl #133365] */
3418     SSize_t index = _invlist_search(PL_utf8_foldclosures, cp);
3419     int base = _Perl_IVCF_invmap[index];
3420
3421     PERL_ARGS_ASSERT__INVERSE_FOLDS;
3422
3423     if (base == 0) {            /* No fold */
3424         *first_folds_to = 0;
3425         *remaining_folds_to = NULL;
3426         return 0;
3427     }
3428
3429 #ifndef HAS_IVCF_AUX_TABLES     /* This Unicode version only has 1-1 folds */
3430
3431     assert(base > 0);
3432
3433 #else
3434
3435     if (UNLIKELY(base < 0)) {   /* Folds to more than one character */
3436
3437         /* The data structure is set up so that the absolute value of 'base' is
3438          * an index into a table of pointers to arrays, with the array
3439          * corresponding to the index being the list of code points that fold
3440          * to 'cp', and the parallel array containing the length of the list
3441          * array */
3442         *first_folds_to = IVCF_AUX_TABLE_ptrs[-base][0];
3443         *remaining_folds_to = IVCF_AUX_TABLE_ptrs[-base] + 1; /* +1 excludes
3444                                                                  *first_folds_to
3445                                                                 */
3446         return IVCF_AUX_TABLE_lengths[-base];
3447     }
3448
3449 #endif
3450
3451     /* Only the single code point.  This works like 'fc(G) = G - A + a' */
3452     *first_folds_to = base + cp - invlist_array(PL_utf8_foldclosures)[index];
3453     *remaining_folds_to = NULL;
3454     return 1;
3455 }
3456
3457 STATIC UV
3458 S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result,
3459                                        U8* const ustrp, STRLEN *lenp)
3460 {
3461     /* This is called when changing the case of a UTF-8-encoded character above
3462      * the Latin1 range, and the operation is in a non-UTF-8 locale.  If the
3463      * result contains a character that crosses the 255/256 boundary, disallow
3464      * the change, and return the original code point.  See L<perlfunc/lc> for
3465      * why;
3466      *
3467      * p        points to the original string whose case was changed; assumed
3468      *          by this routine to be well-formed
3469      * result   the code point of the first character in the changed-case string
3470      * ustrp    points to the changed-case string (<result> represents its
3471      *          first char)
3472      * lenp     points to the length of <ustrp> */
3473
3474     UV original;    /* To store the first code point of <p> */
3475
3476     PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
3477
3478     assert(UTF8_IS_ABOVE_LATIN1(*p));
3479
3480     /* We know immediately if the first character in the string crosses the
3481      * boundary, so can skip testing */
3482     if (result > 255) {
3483
3484         /* Look at every character in the result; if any cross the
3485         * boundary, the whole thing is disallowed */
3486         U8* s = ustrp + UTF8SKIP(ustrp);
3487         U8* e = ustrp + *lenp;
3488         while (s < e) {
3489             if (! UTF8_IS_ABOVE_LATIN1(*s)) {
3490                 goto bad_crossing;
3491             }
3492             s += UTF8SKIP(s);
3493         }
3494
3495         /* Here, no characters crossed, result is ok as-is, but we warn. */
3496         _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
3497         return result;
3498     }
3499
3500   bad_crossing:
3501
3502     /* Failed, have to return the original */
3503     original = valid_utf8_to_uvchr(p, lenp);
3504
3505     /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3506     Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3507                            "Can't do %s(\"\\x{%" UVXf "}\") on non-UTF-8"
3508                            " locale; resolved to \"\\x{%" UVXf "}\".",
3509                            OP_DESC(PL_op),
3510                            original,
3511                            original);
3512     Copy(p, ustrp, *lenp, char);
3513     return original;
3514 }
3515
3516 STATIC UV
3517 S_turkic_fc(pTHX_ const U8 * const p, const U8 * const e,
3518                         U8 * ustrp, STRLEN *lenp)
3519 {
3520     /* Returns 0 if the foldcase of the input UTF-8 encoded sequence from
3521      * p0..e-1 according to Turkic rules is the same as for non-Turkic.
3522      * Otherwise, it returns the first code point of the Turkic foldcased
3523      * sequence, and the entire sequence will be stored in *ustrp.  ustrp will
3524      * contain *lenp bytes
3525      *
3526      * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER
3527      * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER
3528      * DOTLESS I */
3529
3530     PERL_ARGS_ASSERT_TURKIC_FC;
3531     assert(e > p);
3532
3533     if (UNLIKELY(*p == 'I')) {
3534         *lenp = 2;
3535         ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_SMALL_LETTER_DOTLESS_I);
3536         ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_SMALL_LETTER_DOTLESS_I);
3537         return LATIN_SMALL_LETTER_DOTLESS_I;
3538     }
3539
3540     if (UNLIKELY(memBEGINs(p, e - p,
3541                            LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)))
3542     {
3543         *lenp = 1;
3544         *ustrp = 'i';
3545         return 'i';
3546     }
3547
3548     return 0;
3549 }
3550
3551 STATIC UV
3552 S_turkic_lc(pTHX_ const U8 * const p0, const U8 * const e,
3553                         U8 * ustrp, STRLEN *lenp)
3554 {
3555     /* Returns 0 if the lowercase of the input UTF-8 encoded sequence from
3556      * p0..e-1 according to Turkic rules is the same as for non-Turkic.
3557      * Otherwise, it returns the first code point of the Turkic lowercased
3558      * sequence, and the entire sequence will be stored in *ustrp.  ustrp will
3559      * contain *lenp bytes */
3560
3561     dVAR;
3562     PERL_ARGS_ASSERT_TURKIC_LC;
3563     assert(e > p0);
3564
3565     /* A 'I' requires context as to what to do */
3566     if (UNLIKELY(*p0 == 'I')) {
3567         const U8 * p = p0 + 1;
3568
3569         /* According to the Unicode SpecialCasing.txt file, a capital 'I'
3570          * modified by a dot above lowercases to 'i' even in turkic locales. */
3571         while (p < e) {
3572             UV cp;
3573
3574             if (memBEGINs(p, e - p, COMBINING_DOT_ABOVE_UTF8)) {
3575                 ustrp[0] = 'i';
3576                 *lenp = 1;
3577                 return 'i';
3578             }
3579
3580             /* For the dot above to modify the 'I', it must be part of a
3581              * combining sequence immediately following the 'I', and no other
3582              * modifier with a ccc of 230 may intervene */
3583             cp = utf8_to_uvchr_buf(p, e, NULL);
3584             if (! _invlist_contains_cp(PL_CCC_non0_non230, cp)) {
3585                 break;
3586             }
3587
3588             /* Here the combining sequence continues */
3589             p += UTF8SKIP(p);
3590         }
3591     }
3592
3593     /* In all other cases the lc is the same as the fold */
3594     return turkic_fc(p0, e, ustrp, lenp);
3595 }
3596
3597 STATIC UV
3598 S_turkic_uc(pTHX_ const U8 * const p, const U8 * const e,
3599                         U8 * ustrp, STRLEN *lenp)
3600 {
3601     /* Returns 0 if the upper or title-case of the input UTF-8 encoded sequence
3602      * from p0..e-1 according to Turkic rules is the same as for non-Turkic.
3603      * Otherwise, it returns the first code point of the Turkic upper or
3604      * title-cased sequence, and the entire sequence will be stored in *ustrp.
3605      * ustrp will contain *lenp bytes
3606      *
3607      * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER
3608      * I WITH DOT ABOVE form a case pair, as do 'I' and and LATIN SMALL LETTER
3609      * DOTLESS I */
3610
3611     PERL_ARGS_ASSERT_TURKIC_UC;
3612     assert(e > p);
3613
3614     if (*p == 'i') {
3615         *lenp = 2;
3616         ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
3617         ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
3618         return LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
3619     }
3620
3621     if (memBEGINs(p, e - p, LATIN_SMALL_LETTER_DOTLESS_I_UTF8)) {
3622         *lenp = 1;
3623         *ustrp = 'I';
3624         return 'I';
3625     }
3626
3627     return 0;
3628 }
3629
3630 /* The process for changing the case is essentially the same for the four case
3631  * change types, except there are complications for folding.  Otherwise the
3632  * difference is only which case to change to.  To make sure that they all do
3633  * the same thing, the bodies of the functions are extracted out into the
3634  * following two macros.  The functions are written with the same variable
3635  * names, and these are known and used inside these macros.  It would be
3636  * better, of course, to have inline functions to do it, but since different
3637  * macros are called, depending on which case is being changed to, this is not
3638  * feasible in C (to khw's knowledge).  Two macros are created so that the fold
3639  * function can start with the common start macro, then finish with its special
3640  * handling; while the other three cases can just use the common end macro.
3641  *
3642  * The algorithm is to use the proper (passed in) macro or function to change
3643  * the case for code points that are below 256.  The macro is used if using
3644  * locale rules for the case change; the function if not.  If the code point is
3645  * above 255, it is computed from the input UTF-8, and another macro is called
3646  * to do the conversion.  If necessary, the output is converted to UTF-8.  If
3647  * using a locale, we have to check that the change did not cross the 255/256
3648  * boundary, see check_locale_boundary_crossing() for further details.
3649  *
3650  * The macros are split with the correct case change for the below-256 case
3651  * stored into 'result', and in the middle of an else clause for the above-255
3652  * case.  At that point in the 'else', 'result' is not the final result, but is
3653  * the input code point calculated from the UTF-8.  The fold code needs to
3654  * realize all this and take it from there.
3655  *
3656  * To deal with Turkic locales, the function specified by the parameter
3657  * 'turkic' is called when appropriate.
3658  *
3659  * If you read the two macros as sequential, it's easier to understand what's
3660  * going on. */
/* CASE_CHANGE_BODY_START: first half of the body shared by the
 * Perl__to_utf8_{upper,title,lower,fold}_flags() functions.
 *
 * It uses these names from the function it is expanded in: 'p' and 'e' (the
 * input character and the bound of its string), 'ustrp' and 'lenp' (the
 * output buffer and length), 'flags', and a UV named 'result'.
 *
 *  locale_flags         the bits of 'flags' that request locale rules
 *  LC_L1_change_macro   applied to a code point below 256 when locale rules
 *                       are in effect; its value is stored in 'result'
 *  L1_func              called as L1_func(cp, ustrp, lenp, extra) for a code
 *                       point below 256 when locale rules are not in effect;
 *                       its value is returned directly
 *  L1_func_extra_param  the 'extra' argument given to L1_func
 *  turkic               called as turkic(p, e, ustrp, lenp) when in a Turkic
 *                       UTF-8 locale; a non-zero value from it is returned
 *                       directly
 *
 * The expansion intentionally stops inside an unclosed 'else {'.  At that
 * point 'result' holds the code point decoded from 'p' (malformed input has
 * already been reported by a call asked to die).  The block has to be closed
 * by the code that follows, normally CASE_CHANGE_BODY_END. */
#define CASE_CHANGE_BODY_START(locale_flags, LC_L1_change_macro, L1_func,    \
                               L1_func_extra_param, turkic)                  \
                                                                             \
    if (flags & (locale_flags)) {                                            \
        _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                  \
        if (IN_UTF8_CTYPE_LOCALE) {                                          \
            if (UNLIKELY(PL_in_utf8_turkic_locale)) {                        \
                UV ret = turkic(p, e, ustrp, lenp);                          \
                if (ret) return ret;                                         \
            }                                                                \
                                                                             \
            /* Otherwise, treat a UTF-8 locale as not being in locale at     \
             * all */                                                        \
            flags &= ~(locale_flags);                                        \
        }                                                                    \
    }                                                                        \
                                                                             \
    if (UTF8_IS_INVARIANT(*p)) {                                             \
        if (flags & (locale_flags)) {                                        \
            result = LC_L1_change_macro(*p);                                 \
        }                                                                    \
        else {                                                               \
            return L1_func(*p, ustrp, lenp, L1_func_extra_param);            \
        }                                                                    \
    }                                                                        \
    /* The condition gets its own parentheses so that this is valid C no     \
     * matter how the tested macro happens to be written */                  \
    else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {                        \
        U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));                         \
        if (flags & (locale_flags)) {                                        \
            result = LC_L1_change_macro(c);                                  \
        }                                                                    \
        else {                                                               \
            return L1_func(c, ustrp, lenp,  L1_func_extra_param);            \
        }                                                                    \
    }                                                                        \
    else {  /* malformed UTF-8 or ord above 255 */                           \
        STRLEN len_result;                                                   \
        result = utf8n_to_uvchr(p, e - p, &len_result, UTF8_CHECK_ONLY);     \
        if (len_result == (STRLEN) -1) {                                     \
            _force_out_malformed_utf8_message(p, e, 0, 1 /* Die */ );        \
        }
3701
/* CASE_CHANGE_BODY_END: second half of the shared case-changing body.  It
 * resumes inside the 'else {' that CASE_CHANGE_BODY_START leaves open, where
 * 'result' is the input code point, and closes that block.
 *
 *  locale_flags   the bits of 'flags' that request locale rules
 *  change_macro   invoked as change_macro(result, p, ustrp, lenp) to do the
 *                 case change for that code point
 *
 * The code after the closing brace is reached only from the branches of
 * CASE_CHANGE_BODY_START that stored a locale-rules answer in 'result'
 * without returning; it writes that answer to 'ustrp' as UTF-8. */
#define CASE_CHANGE_BODY_END(locale_flags, change_macro)                     \
        result = change_macro(result, p, ustrp, lenp);                       \
                                                                             \
        /* Under locale rules the change may not cross the 255/256           \
         * boundary; the checking function supplies the value to return */   \
        return (flags & (locale_flags))                                      \
               ? check_locale_boundary_crossing(p, result, ustrp, lenp)      \
               : result;                                                     \
    }                                                                        \
                                                                             \
    /* Here, used locale rules.  Convert back to UTF-8 */                    \
    if (! UTF8_IS_INVARIANT(result)) {                                       \
        *ustrp       = UTF8_EIGHT_BIT_HI((U8) result);                       \
        *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);                       \
        *lenp = 2;                                                           \
    }                                                                        \
    else {                                                                   \
        *ustrp = (U8) result;                                                \
        *lenp = 1;                                                           \
    }                                                                        \
                                                                             \
    return result;
3723
3724 /* Not currently externally documented, and subject to change:
3725  * <flags> is set iff iff the rules from the current underlying locale are to
3726  *         be used. */
3727
3728 UV
3729 Perl__to_utf8_upper_flags(pTHX_ const U8 *p,
3730                                 const U8 *e,
3731                                 U8* ustrp,
3732                                 STRLEN *lenp,
3733                                 bool flags)
3734 {
3735     dVAR;
3736     UV result;
3737
3738     PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
3739
3740     /* ~0 makes anything non-zero in 'flags' mean we are using locale rules */
3741     /* 2nd char of uc(U+DF) is 'S' */
3742     CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 'S',
3743                                                                     turkic_uc);
3744     CASE_CHANGE_BODY_END  (~0, CALL_UPPER_CASE);
3745 }
3746
3747 /* Not currently externally documented, and subject to change:
3748  * <flags> is set iff the rules from the current underlying locale are to be
3749  *         used.  Since titlecase is not defined in POSIX, for other than a
3750  *         UTF-8 locale, uppercase is used instead for code points < 256.
3751  */
3752
3753 UV
3754 Perl__to_utf8_title_flags(pTHX_ const U8 *p,
3755                                 const U8 *e,
3756                                 U8* ustrp,
3757                                 STRLEN *lenp,
3758                                 bool flags)
3759 {
3760     dVAR;
3761     UV result;
3762
3763     PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
3764
3765     /* 2nd char of ucfirst(U+DF) is 's' */
3766     CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 's',
3767                                                                     turkic_uc);
3768     CASE_CHANGE_BODY_END  (~0, CALL_TITLE_CASE);
3769 }
3770
3771 /* Not currently externally documented, and subject to change:
3772  * <flags> is set iff iff the rules from the current underlying locale are to
3773  *         be used.
3774  */
3775
3776 UV
3777 Perl__to_utf8_lower_flags(pTHX_ const U8 *p,
3778                                 const U8 *e,
3779                                 U8* ustrp,
3780                                 STRLEN *lenp,
3781                                 bool flags)
3782 {
3783     dVAR;
3784     UV result;
3785
3786     PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
3787
3788     CASE_CHANGE_BODY_START(~0, toLOWER_LC, to_lower_latin1, 0 /* 0 is dummy */,
3789                                                                     turkic_lc);
3790     CASE_CHANGE_BODY_END  (~0, CALL_LOWER_CASE)
3791 }
3792
3793 /* Not currently externally documented, and subject to change,
3794  * in <flags>
3795  *      bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
3796  *                            locale are to be used.
3797  *      bit FOLD_FLAGS_FULL   is set iff full case folds are to be used;
3798  *                            otherwise simple folds
3799  *      bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
3800  *                            prohibited
3801  */
3802
3803 UV
3804 Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
3805                                const U8 *e,
3806                                U8* ustrp,
3807                                STRLEN *lenp,
3808                                U8 flags)
3809 {
3810     dVAR;
3811     UV result;
3812
3813     PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
3814
3815     /* These are mutually exclusive */
3816     assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
3817
3818     assert(p != ustrp); /* Otherwise overwrites */
3819
3820     CASE_CHANGE_BODY_START(FOLD_FLAGS_LOCALE, toFOLD_LC, _to_fold_latin1,
3821                  ((flags) & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)),
3822                                                                     turkic_fc);
3823
3824         result = CALL_FOLD_CASE(result, p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
3825
3826         if (flags & FOLD_FLAGS_LOCALE) {
3827
3828 #           define LONG_S_T      LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
3829 #         ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
3830 #           define CAP_SHARP_S   LATIN_CAPITAL_LETTER_SHARP_S_UTF8
3831
3832             /* Special case these two characters, as what normally gets
3833              * returned under locale doesn't work */
3834             if (memBEGINs((char *) p, e - p, CAP_SHARP_S))
3835             {
3836                 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3837                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3838                               "Can't do fc(\"\\x{1E9E}\") on non-UTF-8 locale; "
3839                               "resolved to \"\\x{17F}\\x{17F}\".");
3840                 goto return_long_s;
3841             }
3842             else
3843 #endif
3844                  if (memBEGINs((char *) p, e - p, LONG_S_T))
3845             {
3846                 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3847                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3848                               "Can't do fc(\"\\x{FB05}\") on non-UTF-8 locale; "
3849                               "resolved to \"\\x{FB06}\".");
3850                 goto return_ligature_st;
3851             }
3852
3853 #if    UNICODE_MAJOR_VERSION   == 3         \
3854     && UNICODE_DOT_VERSION     == 0         \
3855     && UNICODE_DOT_DOT_VERSION == 1
3856 #           define DOTTED_I   LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8
3857
3858             /* And special case this on this Unicode version only, for the same
3859              * reaons the other two are special cased.  They would cross the
3860              * 255/256 boundary which is forbidden under /l, and so the code
3861              * wouldn't catch that they are equivalent (which they are only in
3862              * this release) */
3863             else if (memBEGINs((char *) p, e - p, DOTTED_I)) {
3864                 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3865                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3866                               "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
3867                               "resolved to \"\\x{0131}\".");
3868                 goto return_dotless_i;
3869             }
3870 #endif
3871
3872             return check_locale_boundary_crossing(p, result, ustrp, lenp);
3873         }
3874         else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
3875             return result;
3876         }
3877         else {
3878             /* This is called when changing the case of a UTF-8-encoded
3879              * character above the ASCII range, and the result should not
3880              * contain an ASCII character. */
3881
3882             UV original;    /* To store the first code point of <p> */
3883
3884             /* Look at every character in the result; if any cross the
3885             * boundary, the whole thing is disallowed */
3886             U8* s = ustrp;
3887             U8* send = ustrp + *lenp;
3888             while (s < send) {
3889                 if (isASCII(*s)) {
3890                     /* Crossed, have to return the original */
3891                     original = valid_utf8_to_uvchr(p, lenp);
3892
3893                     /* But in these instances, there is an alternative we can
3894                      * return that is valid */
3895                     if (original == LATIN_SMALL_LETTER_SHARP_S
3896 #ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
3897                         || original == LATIN_CAPITAL_LETTER_SHARP_S
3898 #endif
3899                     ) {
3900                         goto return_long_s;
3901                     }
3902                     else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
3903                         goto return_ligature_st;
3904                     }
3905 #if    UNICODE_MAJOR_VERSION   == 3         \
3906     && UNICODE_DOT_VERSION     == 0         \
3907     && UNICODE_DOT_DOT_VERSION == 1
3908
3909                     else if (original == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
3910                         goto return_dotless_i;
3911                     }
3912 #endif
3913                     Copy(p, ustrp, *lenp, char);
3914                     return original;
3915                 }
3916                 s += UTF8SKIP(s);
3917             }
3918
3919             /* Here, no characters crossed, result is ok as-is */
3920             return result;
3921         }
3922     }
3923
3924     /* Here, used locale rules.  Convert back to UTF-8 */
3925     if (UTF8_IS_INVARIANT(result)) {
3926         *ustrp = (U8) result;
3927         *lenp = 1;
3928     }
3929     else {
3930         *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
3931         *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
3932         *lenp = 2;
3933     }
3934
3935     return result;
3936
3937   return_long_s:
3938     /* Certain folds to 'ss' are prohibited by the options, but they do allow
3939      * folds to a string of two of these characters.  By returning this
3940      * instead, then, e.g.,
3941      *      fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
3942      * works. */
3943
3944     *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
3945     Copy(LATIN_SMALL_LETTER_LONG_S_UTF8   LATIN_SMALL_LETTER_LONG_S_UTF8,
3946         ustrp, *lenp, U8);
3947     return LATIN_SMALL_LETTER_LONG_S;
3948
3949   return_ligature_st:
3950     /* Two folds to 'st' are prohibited by the options; instead we pick one and
3951      * have the other one fold to it */
3952
3953     *lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1;
3954     Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
3955     return LATIN_SMALL_LIGATURE_ST;
3956
3957 #if    UNICODE_MAJOR_VERSION   == 3         \
3958     && UNICODE_DOT_VERSION     == 0         \
3959     && UNICODE_DOT_DOT_VERSION == 1
3960
3961   return_dotless_i:
3962     *lenp = sizeof(LATIN_SMALL_LETTER_DOTLESS_I_UTF8) - 1;
3963     Copy(LATIN_SMALL_LETTER_DOTLESS_I_UTF8, ustrp, *lenp, U8);
3964     return LATIN_SMALL_LETTER_DOTLESS_I;
3965
3966 #endif
3967
3968 }
3969
3970 bool
3971 Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
3972 {
3973     /* May change: warns if surrogates, non-character code points, or
3974      * non-Unicode code points are in 's' which has length 'len' bytes.
3975      * Returns TRUE if none found; FALSE otherwise.  The only other validity
3976      * check is to make sure that this won't exceed the string's length nor
3977      * overflow */
3978
3979     const U8* const e = s + len;
3980     bool ok = TRUE;
3981
3982     PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
3983
3984     while (s < e) {
3985         if (UTF8SKIP(s) > len) {
3986             Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
3987                            "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
3988             return FALSE;
3989         }
3990         if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) {
3991             if (UNLIKELY(UTF8_IS_SUPER(s, e))) {
3992                 if (   ckWARN_d(WARN_NON_UNICODE)
3993                     || UNLIKELY(0 < does_utf8_overflow(s, s + len,
3994                                                0 /* Don't consider overlongs */
3995                                                )))
3996                 {
3997                     /* A side effect of this function will be to warn */
3998                     (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_SUPER);
3999                     ok = FALSE;
4000                 }
4001             }
4002             else if (UNLIKELY(UTF8_IS_SURROGATE(s, e))) {
4003                 if (ckWARN_d(WARN_SURROGATE)) {
4004                     /* This has a different warning than the one the called
4005                      * function would output, so can't just call it, unlike we
4006                      * do for the non-chars and above-unicodes */
4007                     UV uv = utf8_to_uvchr_buf(s, e, NULL);
4008                     Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
4009                         "Unicode surrogate U+%04" UVXf " is illegal in UTF-8",
4010                                              uv);
4011                     ok = FALSE;
4012                 }
4013             }
4014             else if (   UNLIKELY(UTF8_IS_NONCHAR(s, e))
4015                      && (ckWARN_d(WARN_NONCHAR)))
4016             {
4017                 /* A side effect of this function will be to warn */
4018                 (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_NONCHAR);
4019                 ok = FALSE;
4020             }
4021         }
4022         s += UTF8SKIP(s);
4023     }
4024
4025     return ok;
4026 }
4027
4028 /*
4029 =for apidoc pv_uni_display
4030
4031 Build to the scalar C<dsv> a displayable version of the string C<spv>,
4032 length C<len>, the displayable version being at most C<pvlim> bytes long
4033 (if longer, the rest is truncated and C<"..."> will be appended).
4034
4035 The C<flags> argument can have C<UNI_DISPLAY_ISPRINT> set to display
4036 C<isPRINT()>able characters as themselves, C<UNI_DISPLAY_BACKSLASH>
4037 to display the C<\\[nrfta\\]> as the backslashed versions (like C<"\n">)
4038 (C<UNI_DISPLAY_BACKSLASH> is preferred over C<UNI_DISPLAY_ISPRINT> for C<"\\">).
4039 C<UNI_DISPLAY_QQ> (and its alias C<UNI_DISPLAY_REGEX>) have both
4040 C<UNI_DISPLAY_BACKSLASH> and C<UNI_DISPLAY_ISPRINT> turned on.
4041
4042 The pointer to the PV of the C<dsv> is returned.
4043
4044 See also L</sv_uni_display>.
4045
4046 =cut */
4047 char *
4048 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim,
4049                           UV flags)
4050 {
4051     int truncated = 0;
4052     const char *s, *e;
4053
4054     PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
4055
4056     SvPVCLEAR(dsv);
4057     SvUTF8_off(dsv);
4058     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
4059          UV u;
4060           /* This serves double duty as a flag and a character to print after
4061              a \ when flags & UNI_DISPLAY_BACKSLASH is true.
4062           */
4063          char ok = 0;
4064
4065          if (pvlim && SvCUR(dsv) >= pvlim) {
4066               truncated++;
4067               break;
4068          }
4069          u = utf8_to_uvchr_buf((U8*)s, (U8*)e, 0);
4070          if (u < 256) {
4071              const unsigned char c = (unsigned char)u & 0xFF;
4072              if (flags & UNI_DISPLAY_BACKSLASH) {
4073                  switch (c) {
4074                  case '\n':
4075                      ok = 'n'; break;
4076                  case '\r':
4077                      ok = 'r'; break;
4078                  case '\t':
4079                      ok = 't'; break;
4080                  case '\f':
4081                      ok = 'f'; break;
4082                  case '\a':
4083                      ok = 'a'; break;
4084                  case '\\':
4085                      ok = '\\'; break;
4086                  default: break;
4087                  }
4088                  if (ok) {
4089                      const char string = ok;
4090                      sv_catpvs(dsv, "\\");
4091                      sv_catpvn(dsv, &string, 1);
4092                  }
4093              }
4094              /* isPRINT() is the locale-blind version. */
4095              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
4096                  const char string = c;
4097                  sv_catpvn(dsv, &string, 1);
4098                  ok = 1;
4099              }
4100          }
4101          if (!ok)
4102              Perl_sv_catpvf(aTHX_ dsv, "\\x{%" UVxf "}", u);
4103     }
4104     if (truncated)
4105          sv_catpvs(dsv, "...");
4106
4107     return SvPVX(dsv);
4108 }
4109
4110 /*
4111 =for apidoc sv_uni_display
4112
4113 Build to the scalar C<dsv> a displayable version of the scalar C<sv>,
4114 the displayable version being at most C<pvlim> bytes long
4115 (if longer, the rest is truncated and "..." will be appended).
4116
4117 The C<flags> argument is as in L</pv_uni_display>().
4118
4119 The pointer to the PV of the C<dsv> is returned.
4120
4121 =cut
4122 */
4123 char *
4124 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
4125 {
4126     const char * const ptr =
4127         isREGEXP(ssv) ? RX_WRAPPED((REGEXP*)ssv) : SvPVX_const(ssv);
4128
4129     PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
4130
4131     return Perl_pv_uni_display(aTHX_ dsv, (const U8*)ptr,
4132                                 SvCUR(ssv), pvlim, flags);
4133 }
4134
4135 /*
4136 =for apidoc foldEQ_utf8
4137
4138 Returns true if the leading portions of the strings C<s1> and C<s2> (either or
4139 both of which may be in UTF-8) are the same case-insensitively; false
4140 otherwise.  How far into the strings to compare is determined by other input
4141 parameters.
4142
4143 If C<u1> is true, the string C<s1> is assumed to be in UTF-8-encoded Unicode;
4144 otherwise it is assumed to be in native 8-bit encoding.  Correspondingly for
4145 C<u2> with respect to C<s2>.
4146
4147 If the byte length C<l1> is non-zero, it says how far into C<s1> to check for
4148 fold equality.  In other words, C<s1>+C<l1> will be used as a goal to reach.
4149 The scan will not be considered to be a match unless the goal is reached, and
4150 scanning won't continue past that goal.  Correspondingly for C<l2> with respect
4151 to C<s2>.
4152
4153 If C<pe1> is non-C<NULL> and the pointer it points to is not C<NULL>, that
4154 pointer is considered an end pointer to the position 1 byte past the maximum
4155 point in C<s1> beyond which scanning will not continue under any circumstances.
4156 (This routine assumes that UTF-8 encoded input strings are not malformed;
4157 malformed input can cause it to read past C<pe1>).  This means that if both
4158 C<l1> and C<pe1> are specified, and C<pe1> is less than C<s1>+C<l1>, the match
4159 will never be successful because it can never
4160 get as far as its goal (and in fact is asserted against).  Correspondingly for
4161 C<pe2> with respect to C<s2>.
4162
4163 At least one of C<s1> and C<s2> must have a goal (at least one of C<l1> and
4164 C<l2> must be non-zero), and if both do, both have to be
4165 reached for a successful match.   Also, if the fold of a character is multiple
4166 characters, all of them must be matched (see tr21 reference below for
4167 'folding').
4168
4169 Upon a successful match, if C<pe1> is non-C<NULL>,
4170 it will be set to point to the beginning of the I<next> character of C<s1>
4171 beyond what was matched.  Correspondingly for C<pe2> and C<s2>.
4172
4173 For case-insensitiveness, the "casefolding" of Unicode is used
4174 instead of upper/lowercasing both the characters, see
4175 L<https://www.unicode.org/unicode/reports/tr21/> (Case Mappings).
4176
4177 =cut */
4178
4179 /* A flags parameter has been added which may change, and hence isn't
4180  * externally documented.  Currently it is:
4181  *  0 for as-documented above
4182  *  FOLDEQ_UTF8_NOMIX_ASCII meaning that if a non-ASCII character folds to an
4183                             ASCII one, to not match
4184  *  FOLDEQ_LOCALE           is set iff the rules from the current underlying
4185  *                          locale are to be used.
4186  *  FOLDEQ_S1_ALREADY_FOLDED  s1 has already been folded before calling this
4187  *                          routine.  This allows that step to be skipped.
4188  *                          Currently, this requires s1 to be encoded as UTF-8
4189  *                          (u1 must be true), which is asserted for.
4190  *  FOLDEQ_S1_FOLDS_SANE    With either NOMIX_ASCII or LOCALE, no folds may
4191  *                          cross certain boundaries.  Hence, the caller should
4192  *                          let this function do the folding instead of
4193  *                          pre-folding.  This code contains an assertion to
4194  *                          that effect.  However, if the caller knows what
4195  *                          it's doing, it can pass this flag to indicate that,
4196  *                          and the assertion is skipped.
4197  *  FOLDEQ_S2_ALREADY_FOLDED  Similar to FOLDEQ_S1_ALREADY_FOLDED, but applies
4198  *                          to s2, and s2 doesn't have to be UTF-8 encoded.
4199  *                          This introduces an asymmetry to save a few branches
4200  *                          in a loop.  Currently, this is not a problem, as
4201  *                          never are both inputs pre-folded.  Simply call this
4202  *                          function with the pre-folded one as the second
4203  *                          string.
4204  *  FOLDEQ_S2_FOLDS_SANE
4205  */
4206 I32
4207 Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1,
4208                              const char *s2, char **pe2, UV l2, bool u2,
4209                              U32 flags)
4210 {
4211     const U8 *p1  = (const U8*)s1; /* Point to current char */
4212     const U8 *p2  = (const U8*)s2;
4213     const U8 *g1 = NULL;       /* goal for s1 */
4214     const U8 *g2 = NULL;
4215     const U8 *e1 = NULL;       /* Don't scan s1 past this */
4216     U8 *f1 = NULL;             /* Point to current folded */
4217     const U8 *e2 = NULL;
4218     U8 *f2 = NULL;
4219     STRLEN n1 = 0, n2 = 0;              /* Number of bytes in current char */
4220     U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
4221     U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
4222     U8 flags_for_folder = FOLD_FLAGS_FULL;
4223
4224     PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
4225
4226     assert( ! (             (flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE))
4227                && ((        (flags &  FOLDEQ_S1_ALREADY_FOLDED)
4228                         && !(flags &  FOLDEQ_S1_FOLDS_SANE))
4229                     || (    (flags &  FOLDEQ_S2_ALREADY_FOLDED)
4230                         && !(flags &  FOLDEQ_S2_FOLDS_SANE)))));
4231     /* The algorithm is to trial the folds without regard to the flags on
4232      * the first line of the above assert(), and then see if the result
4233      * violates them.  This means that the inputs can't be pre-folded to a
4234      * violating result, hence the assert.  This could be changed, with the
4235      * addition of extra tests here for the already-folded case, which would
4236      * slow it down.  That cost is more than any possible gain for when these
4237      * flags are specified, as the flags indicate /il or /iaa matching which
4238      * is less common than /iu, and I (khw) also believe that real-world /il
4239      * and /iaa matches are most likely to involve code points 0-255, and this
4240      * function only under rare conditions gets called for 0-255. */
4241
4242     if (flags & FOLDEQ_LOCALE) {
4243         if (IN_UTF8_CTYPE_LOCALE) {
4244             if (UNLIKELY(PL_in_utf8_turkic_locale)) {
4245                 flags_for_folder |= FOLD_FLAGS_LOCALE;
4246             }
4247             else {
4248                 flags &= ~FOLDEQ_LOCALE;
4249             }
4250         }
4251         else {
4252             flags_for_folder |= FOLD_FLAGS_LOCALE;
4253         }
4254     }
4255     if (flags & FOLDEQ_UTF8_NOMIX_ASCII) {
4256         flags_for_folder |= FOLD_FLAGS_NOMIX_ASCII;
4257     }
4258
4259     if (pe1) {
4260         e1 = *(U8**)pe1;
4261     }
4262
4263     if (l1) {
4264         g1 = (const U8*)s1 + l1;
4265     }
4266
4267     if (pe2) {
4268         e2 = *(U8**)pe2;
4269     }
4270
4271     if (l2) {
4272         g2 = (const U8*)s2 + l2;
4273     }
4274
4275     /* Must have at least one goal */
4276     assert(g1 || g2);
4277
4278     if (g1) {
4279
4280         /* Will never match if goal is out-of-bounds */
4281         assert(! e1  || e1 >= g1);
4282
4283         /* Here, there isn't an end pointer, or it is beyond the goal.  We
4284         * only go as far as the goal */
4285         e1 = g1;
4286     }
4287     else {
4288         assert(e1);    /* Must have an end for looking at s1 */
4289     }
4290
4291     /* Same for goal for s2 */
4292     if (g2) {
4293         assert(! e2  || e2 >= g2);
4294         e2 = g2;
4295     }
4296     else {
4297         assert(e2);
4298     }
4299
4300     /* If both operands are already folded, we could just do a memEQ on the
4301      * whole strings at once, but it would be better if the caller realized
4302      * this and didn't even call us */
4303
4304     /* Look through both strings, a character at a time */
4305     while (p1 < e1 && p2 < e2) {
4306
4307         /* If at the beginning of a new character in s1, get its fold to use
4308          * and the length of the fold. */
4309         if (n1 == 0) {
4310             if (flags & FOLDEQ_S1_ALREADY_FOLDED) {
4311                 f1 = (U8 *) p1;
4312                 assert(u1);
4313                 n1 = UTF8SKIP(f1);
4314             }
4315             else {
4316                 if (isASCII(*p1) && ! (flags & FOLDEQ_LOCALE)) {
4317
4318                     /* We have to forbid mixing ASCII with non-ASCII if the
4319                      * flags so indicate.  And, we can short circuit having to
4320                      * call the general functions for this common ASCII case,
4321                      * all of whose non-locale folds are also ASCII, and hence
4322                      * UTF-8 invariants, so the UTF8ness of the strings is not
4323                      * relevant. */
4324                     if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) {
4325                         return 0;
4326                     }
4327                     n1 = 1;
4328                     *foldbuf1 = toFOLD(*p1);
4329                 }
4330                 else if (u1) {
4331                     _toFOLD_utf8_flags(p1, e1, foldbuf1, &n1, flags_for_folder);
4332                 }
4333                 else {  /* Not UTF-8, get UTF-8 fold */
4334                     _to_uni_fold_flags(*p1, foldbuf1, &n1, flags_for_folder);
4335                 }
4336                 f1 = foldbuf1;
4337             }
4338         }
4339
4340         if (n2 == 0) {    /* Same for s2 */
4341             if (flags & FOLDEQ_S2_ALREADY_FOLDED) {
4342
4343                 /* Point to the already-folded character.  But for non-UTF-8
4344                  * variants, convert to UTF-8 for the algorithm below */
4345                 if (UTF8_IS_INVARIANT(*p2)) {
4346                     f2 = (U8 *) p2;
4347                     n2 = 1;
4348                 }
4349                 else if (u2) {
4350                     f2 = (U8 *) p2;
4351                     n2 = UTF8SKIP(f2);
4352                 }
4353                 else {
4354                     foldbuf2[0] = UTF8_EIGHT_BIT_HI(*p2);
4355                     foldbuf2[1] = UTF8_EIGHT_BIT_LO(*p2);
4356                     f2 = foldbuf2;
4357                     n2 = 2;
4358                 }
4359             }
4360             else {
4361                 if (isASCII(*p2) && ! (flags & FOLDEQ_LOCALE)) {
4362                     if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p1)) {
4363                         return 0;
4364                     }
4365                     n2 = 1;
4366                     *foldbuf2 = toFOLD(*p2);
4367                 }
4368                 else if (u2) {
4369                     _toFOLD_utf8_flags(p2, e2, foldbuf2, &n2, flags_for_folder);
4370                 }
4371                 else {
4372                     _to_uni_fold_flags(*p2, foldbuf2, &n2, flags_for_folder);
4373                 }
4374                 f2 = foldbuf2;
4375             }
4376         }
4377
4378         /* Here f1 and f2 point to the beginning of the strings to compare.
4379          * These strings are the folds of the next character from each input
4380          * string, stored in UTF-8. */
4381
4382         /* While there is more to look for in both folds, see if they
4383         * continue to match */
4384         while (n1 && n2) {
4385             U8 fold_length = UTF8SKIP(f1);
4386             if (fold_length != UTF8SKIP(f2)
4387                 || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE
4388                                                        function call for single
4389                                                        byte */
4390                 || memNE((char*)f1, (char*)f2, fold_length))
4391             {
4392                 return 0; /* mismatch */
4393             }
4394
4395             /* Here, they matched, advance past them */
4396             n1 -= fold_length;
4397             f1 += fold_length;
4398             n2 -= fold_length;
4399             f2 += fold_length;
4400         }
4401
4402         /* When we reach the end of either fold, advance its input past it */
4403         if (n1 == 0) {
4404             p1 += u1 ? UTF8SKIP(p1) : 1;
4405         }
4406         if (n2 == 0) {
4407             p2 += u2 ? UTF8SKIP(p2) : 1;
4408         }
4409     } /* End of loop through both strings */
4410
4411     /* A match is defined by each scan that specified an explicit length
4412     * reaching its final goal, and the other not having matched a partial
4413     * character (which can happen when the fold of a character is more than one
4414     * character). */
4415     if (! ((g1 == 0 || p1 == g1) && (g2 == 0 || p2 == g2)) || n1 || n2) {
4416         return 0;
4417     }
4418
4419     /* Successful match.  Set output pointers */
4420     if (pe1) {
4421         *pe1 = (char*)p1;
4422     }
4423     if (pe2) {
4424         *pe2 = (char*)p2;
4425     }
4426     return 1;
4427 }
4428
4429 /* XXX The next two functions should likely be moved to mathoms.c once all
4430  * occurrences of them are removed from the core; some cpan-upstream modules
4431  * still use them */
4432
4433 U8 *
4434 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
4435 {   /* Flag-less form: forwards d and uv to uvoffuni_to_utf8_flags() */
4436     PERL_ARGS_ASSERT_UVUNI_TO_UTF8;
4437     /* Pass 0 for the flags argument and return the callee's result as-is */
4438     return uvoffuni_to_utf8_flags(d, uv, 0);
4439 }
4440
4441 /*
4442 =for apidoc utf8n_to_uvuni
4443
4444 Instead use L<perlapi/utf8_to_uvchr_buf>, or rarely, L<perlapi/utf8n_to_uvchr>.
4445
4446 This function was useful for code that wanted to handle both EBCDIC and
4447 ASCII platforms with Unicode properties, but starting in Perl v5.20, the
4448 distinctions between the platforms have mostly been made invisible to most
4449 code, so this function is quite unlikely to be what you want.  If you do need
4450 this precise functionality, use instead
4451 C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|perlapi/utf8_to_uvchr_buf>>
4452 or C<L<NATIVE_TO_UNI(utf8n_to_uvchr(...))|perlapi/utf8n_to_uvchr>>.
4453
4454 =cut
4455 */
4456
4457 UV
4458 Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
4459 {   /* Decode with utf8n_to_uvchr(), then map the result via NATIVE_TO_UNI() */
4460     PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
4461     /* s, curlen, retlen and flags are forwarded unchanged to the decoder */
4462     return NATIVE_TO_UNI(utf8n_to_uvchr(s, curlen, retlen, flags));
4463 }
4464
4465 /*
4466 =for apidoc uvuni_to_utf8_flags
4467
4468 Instead you almost certainly want to use L<perlapi/uvchr_to_utf8> or
4469 L<perlapi/uvchr_to_utf8_flags>.
4470
4471 This function is a deprecated synonym for L</uvoffuni_to_utf8_flags>,
4472 which itself, while not deprecated, should be used only in isolated
4473 circumstances.  These functions were useful for code that wanted to handle
4474 both EBCDIC and ASCII platforms with Unicode properties, but starting in Perl
4475 v5.20, the distinctions between the platforms have mostly been made invisible
4476 to most code, so this function is quite unlikely to be what you want.
4477
4478 =cut
4479 */
4480
4481 U8 *
4482 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
4483 {   /* Deprecated synonym: the same call as uvoffuni_to_utf8_flags() */
4484     PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
4485     /* Straight pass-through: no argument is altered before delegating */
4486     return uvoffuni_to_utf8_flags(d, uv, flags);
4487 }
4488
4489 /*
4490 =for apidoc utf8_to_uvchr
4491
4492 Returns the native code point of the first character in the string C<s>
4493 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
4494 length, in bytes, of that character.
4495
4496 Some, but not all, UTF-8 malformations are detected, and in fact, some
4497 malformed input could cause reading beyond the end of the input buffer, which
4498 is why this function is deprecated.  Use L</utf8_to_uvchr_buf> instead.
4499
4500 If C<s> points to one of the detected malformations, and UTF8 warnings are
4501 enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
4502 C<NULL>) to -1.  If those warnings are off, the computed value, if well-defined (or
4503 the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
4504 is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
4505 next possible position in C<s> that could begin a non-malformed character.
4506 See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
4507
4508 =cut
4509 */
4510
4511 UV
4512 Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
4513 {   /* Deprecated: decodes the character at s using an end it derives itself */
4514     PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
4515
4516     /* This function is unsafe if malformed UTF-8 input is given it, which is
4517      * why the function is deprecated.  If the first byte of the input
4518      * indicates that there are more bytes remaining in the sequence that forms
4519      * the character than there are in the input buffer, it can read past the
4520      * end.  But we can make it safe if the input string happens to be
4521      * NUL-terminated, as many strings in Perl are, by refusing to read past a
4522      * NUL, which is what UTF8_CHK_SKIP() does.  A NUL indicates the start of
4523      * the next character anyway.  If the input isn't NUL-terminated, the
4524      * function remains unsafe, as it always has been. */
4525     /* The end pointer passed on is s plus UTF8_CHK_SKIP(s); retlen is forwarded */
4526     return utf8_to_uvchr_buf(s, s + UTF8_CHK_SKIP(s), retlen);
4527 }
4528
4529 /*
4530  * ex: set ts=8 sts=4 sw=4 et:
4531  */