inline.h

   1 /*    inline.h
   2  *
   3  *    Copyright (C) 2012 by Larry Wall and others
   4  *
   5  *    You may distribute under the terms of either the GNU General Public
   6  *    License or the Artistic License, as specified in the README file.
   7  *
   8  *    This file contains tables and code adapted from
   9  *    https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this
  10  *    copyright notice:
  11
  12 Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
  13
  14 Permission is hereby granted, free of charge, to any person obtaining a copy of
  15 this software and associated documentation files (the "Software"), to deal in
  16 the Software without restriction, including without limitation the rights to
  17 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  18 of the Software, and to permit persons to whom the Software is furnished to do
  19 so, subject to the following conditions:
  20
  21 The above copyright notice and this permission notice shall be included in all
  22 copies or substantial portions of the Software.
  23
  24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  25 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  26 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  27 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  28 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  29 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  30 SOFTWARE.
  31
  32  *
  33  * This file is a home for static inline functions that cannot go in other
  34  * header files, because they depend on proto.h (included after most other
  35  * headers) or struct definitions.
  36  *
  37  * Each section names the header file that the functions "belong" to.
  38  */
  39
  40 /* ------------------------------- av.h ------------------------------- */
  41
  42 /*
  43 =for apidoc_section $AV
  44 =for apidoc av_count
  45 Returns the number of elements in the array C<av>.  This is the true length of
  46 the array, including any undefined elements.  It is always the same as
  47 S<C<av_top_index(av) + 1>>.
  48
  49 =cut
  50 */
  51 PERL_STATIC_INLINE Size_t
  52 Perl_av_count(pTHX_ AV *av)
  53 {
  54     PERL_ARGS_ASSERT_AV_COUNT;
  55     assert(SvTYPE(av) == SVt_PVAV);
  56
  57     return AvFILL(av) + 1;
  58 }
  59
  60 /* ------------------------------- av.c ------------------------------- */
  61
  62 /*
  63 =for apidoc av_store_simple
  64
  65 This is a cut-down version of av_store that assumes that the array is
  66 very straightforward - no magic, not readonly, and AvREAL - and that
  67 C<key> is not negative. This function MUST NOT be used in situations
  68 where any of those assumptions may not hold.
  69
  70 Stores an SV in an array.  The array index is specified as C<key>. It
   71 can be dereferenced to get the C<SV*> that was stored there (= C<val>).
  72
  73 Note that the caller is responsible for suitably incrementing the reference
  74 count of C<val> before the call.
  75
  76 Approximate Perl equivalent: C<splice(@myarray, $key, 1, $val)>.
  77
  78 =cut
  79 */
  80
  81 PERL_STATIC_INLINE SV**
  82 Perl_av_store_simple(pTHX_ AV *av, SSize_t key, SV *val)
  83 {
  84     SV** ary;
  85
  86     PERL_ARGS_ASSERT_AV_STORE_SIMPLE;
  87     assert(SvTYPE(av) == SVt_PVAV);
  88     assert(!SvMAGICAL(av));
  89     assert(!SvREADONLY(av));
  90     assert(AvREAL(av));
  91     assert(key > -1);
  92
  93     ary = AvARRAY(av);
  94
  95     if (AvFILLp(av) < key) {
  96         if (key > AvMAX(av)) {
  97             av_extend(av,key);
  98             ary = AvARRAY(av);
  99         }
 100         AvFILLp(av) = key;
 101     } else
 102         SvREFCNT_dec(ary[key]);
 103
 104     ary[key] = val;
 105     return &ary[key];
 106 }
 107
 108 /*
 109 =for apidoc av_fetch_simple
 110
 111 This is a cut-down version of av_fetch that assumes that the array is
 112 very straightforward - no magic, not readonly, and AvREAL - and that
 113 C<key> is not negative. This function MUST NOT be used in situations
 114 where any of those assumptions may not hold.
 115
 116 Returns the SV at the specified index in the array.  The C<key> is the
 117 index.  If lval is true, you are guaranteed to get a real SV back (in case
 118 it wasn't real before), which you can then modify.  Check that the return
 119 value is non-null before dereferencing it to a C<SV*>.
 120
 121 The rough perl equivalent is C<$myarray[$key]>.
 122
 123 =cut
 124 */
 125
 126 PERL_STATIC_INLINE SV**
 127 Perl_av_fetch_simple(pTHX_ AV *av, SSize_t key, I32 lval)
 128 {
 129     PERL_ARGS_ASSERT_AV_FETCH_SIMPLE;
 130     assert(SvTYPE(av) == SVt_PVAV);
 131     assert(!SvMAGICAL(av));
 132     assert(!SvREADONLY(av));
 133     assert(AvREAL(av));
 134     assert(key > -1);
 135
 136     if ( (key > AvFILLp(av)) || !AvARRAY(av)[key]) {
 137         return lval ? av_store_simple(av,key,newSV(0)) : NULL;
 138     } else {
 139         return &AvARRAY(av)[key];
 140     }
 141 }
 142
 143 /* ------------------------------- cv.h ------------------------------- */
 144
 145 /*
 146 =for apidoc_section $CV
 147 =for apidoc CvGV
 148 Returns the GV associated with the CV C<sv>, reifying it if necessary.
 149
 150 =cut
 151 */
 152 PERL_STATIC_INLINE GV *
 153 Perl_CvGV(pTHX_ CV *sv)
 154 {
 155     PERL_ARGS_ASSERT_CVGV;
 156
 157     return CvNAMED(sv)
 158         ? Perl_cvgv_from_hek(aTHX_ sv)
 159         : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
 160 }
 161
 162 PERL_STATIC_INLINE I32 *
 163 Perl_CvDEPTH(const CV * const sv)
 164 {
 165     PERL_ARGS_ASSERT_CVDEPTH;
 166     assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
 167
 168     return &((XPVCV*)SvANY(sv))->xcv_depth;
 169 }
 170
 171 /*
 172  CvPROTO returns the prototype as stored, which is not necessarily what
 173  the interpreter should be using. Specifically, the interpreter assumes
 174  that spaces have been stripped, which has been the case if the prototype
 175  was added by toke.c, but is generally not the case if it was added elsewhere.
 176  Since we can't enforce the spacelessness at assignment time, this routine
 177  provides a temporary copy at parse time with spaces removed.
 178  I<orig> is the start of the original buffer, I<len> is the length of the
 179  prototype and will be updated when this returns.
 180  */
 181
 182 #ifdef PERL_CORE
 183 PERL_STATIC_INLINE char *
 184 S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
 185 {
 186     SV * tmpsv;
 187     char * tmps;
 188     tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
 189     tmps = SvPVX(tmpsv);
 190     while ((*len)--) {
 191         if (!isSPACE(*orig))
 192             *tmps++ = *orig;
 193         orig++;
 194     }
 195     *tmps = '\0';
 196     *len = tmps - SvPVX(tmpsv);
 197                 return SvPVX(tmpsv);
 198 }
 199 #endif
 200
 201 /* ------------------------------- mg.h ------------------------------- */
 202
 203 #if defined(PERL_CORE) || defined(PERL_EXT)
 204 /* assumes get-magic and stringification have already occurred */
 205 PERL_STATIC_INLINE STRLEN
 206 S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
 207 {
 208     assert(mg->mg_type == PERL_MAGIC_regex_global);
 209     assert(mg->mg_len != -1);
 210     if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv))
 211         return (STRLEN)mg->mg_len;
 212     else {
 213         const STRLEN pos = (STRLEN)mg->mg_len;
 214         /* Without this check, we may read past the end of the buffer: */
 215         if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
 216         return sv_or_pv_pos_u2b(sv, s, pos, NULL);
 217     }
 218 }
 219 #endif
 220
 221 /* ------------------------------- pad.h ------------------------------ */
 222
 223 #if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
 224 PERL_STATIC_INLINE bool
 225 S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
 226 {
 227     PERL_ARGS_ASSERT_PADNAMEIN_SCOPE;
 228
 229     /* is seq within the range _LOW to _HIGH ?
 230      * This is complicated by the fact that PL_cop_seqmax
 231      * may have wrapped around at some point */
 232     if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
 233         return FALSE; /* not yet introduced */
 234
 235     if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
 236     /* in compiling scope */
 237         if (
 238             (seq >  COP_SEQ_RANGE_LOW(pn))
 239             ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
 240             : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
 241         )
 242             return TRUE;
 243     }
 244     else if (
 245         (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
 246         ?
 247             (  seq >  COP_SEQ_RANGE_LOW(pn)
 248             || seq <= COP_SEQ_RANGE_HIGH(pn))
 249
 250         :    (  seq >  COP_SEQ_RANGE_LOW(pn)
 251              && seq <= COP_SEQ_RANGE_HIGH(pn))
 252     )
 253         return TRUE;
 254     return FALSE;
 255 }
 256 #endif
 257
 258 /* ------------------------------- pp.h ------------------------------- */
 259
 260 PERL_STATIC_INLINE I32
 261 Perl_TOPMARK(pTHX)
 262 {
 263     DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
 264                                  "MARK top  %p %" IVdf "\n",
 265                                   PL_markstack_ptr,
 266                                   (IV)*PL_markstack_ptr)));
 267     return *PL_markstack_ptr;
 268 }
 269
 270 PERL_STATIC_INLINE I32
 271 Perl_POPMARK(pTHX)
 272 {
 273     DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
 274                                  "MARK pop  %p %" IVdf "\n",
 275                                   (PL_markstack_ptr-1),
 276                                   (IV)*(PL_markstack_ptr-1))));
 277     assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
 278     return *PL_markstack_ptr--;
 279 }
 280
 281 /* ----------------------------- regexp.h ----------------------------- */
 282
 283 /* PVLVs need to act as a superset of all scalar types - they are basically
 284  * PVMGs with a few extra fields.
 285  * REGEXPs are first class scalars, but have many fields that can't be copied
 286  * into a PVLV body.
 287  *
 288  * Hence we take a different approach - instead of a copy, PVLVs store a pointer
 289  * back to the original body. To avoid increasing the size of PVLVs just for the
 290  * rare case of REGEXP assignment, this pointer is stored in the memory usually
 291  * used for SvLEN(). Hence the check for SVt_PVLV below, and the ? : ternary to
 292  * read the pointer from the two possible locations. The macro SvLEN() wraps the
 293  * access to the union's member xpvlenu_len, but there is no equivalent macro
 294  * for wrapping the union's member xpvlenu_rx, hence the direct reference here.
 295  *
 296  * See commit df6b4bd56551f2d3 for more details. */
 297
 298 PERL_STATIC_INLINE struct regexp *
 299 Perl_ReANY(const REGEXP * const re)
 300 {
 301     XPV* const p = (XPV*)SvANY(re);
 302
 303     PERL_ARGS_ASSERT_REANY;
 304     assert(isREGEXP(re));
 305
 306     return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
 307                                    : (struct regexp *)p;
 308 }
 309
 310 /* ------------------------------- sv.h ------------------------------- */
 311
 312 PERL_STATIC_INLINE bool
 313 Perl_SvTRUE(pTHX_ SV *sv)
 314 {
 315     PERL_ARGS_ASSERT_SVTRUE;
 316
 317     if (UNLIKELY(sv == NULL))
 318         return FALSE;
 319     SvGETMAGIC(sv);
 320     return SvTRUE_nomg_NN(sv);
 321 }
 322
 323 PERL_STATIC_INLINE bool
 324 Perl_SvTRUE_nomg(pTHX_ SV *sv)
 325 {
 326     PERL_ARGS_ASSERT_SVTRUE_NOMG;
 327
 328     if (UNLIKELY(sv == NULL))
 329         return FALSE;
 330     return SvTRUE_nomg_NN(sv);
 331 }
 332
 333 PERL_STATIC_INLINE bool
 334 Perl_SvTRUE_NN(pTHX_ SV *sv)
 335 {
 336     PERL_ARGS_ASSERT_SVTRUE_NN;
 337
 338     SvGETMAGIC(sv);
 339     return SvTRUE_nomg_NN(sv);
 340 }
 341
 342 PERL_STATIC_INLINE bool
 343 Perl_SvTRUE_common(pTHX_ SV * sv, const bool sv_2bool_is_fallback)
 344 {
 345     PERL_ARGS_ASSERT_SVTRUE_COMMON;
 346
 347     if (UNLIKELY(SvIMMORTAL_INTERP(sv)))
 348         return SvIMMORTAL_TRUE(sv);
 349
 350     if (! SvOK(sv))
 351         return FALSE;
 352
 353     if (SvPOK(sv))
 354         return SvPVXtrue(sv);
 355
 356     if (SvIOK(sv))
 357         return SvIVX(sv) != 0; /* casts to bool */
 358
 359     if (SvROK(sv) && !(SvOBJECT(SvRV(sv)) && HvAMAGIC(SvSTASH(SvRV(sv)))))
 360         return TRUE;
 361
 362     if (sv_2bool_is_fallback)
 363         return sv_2bool_nomg(sv);
 364
 365     return isGV_with_GP(sv);
 366 }
 367
 368
 369 PERL_STATIC_INLINE SV *
 370 Perl_SvREFCNT_inc(SV *sv)
 371 {
 372     if (LIKELY(sv != NULL))
 373         SvREFCNT(sv)++;
 374     return sv;
 375 }
 376 PERL_STATIC_INLINE SV *
 377 Perl_SvREFCNT_inc_NN(SV *sv)
 378 {
 379     PERL_ARGS_ASSERT_SVREFCNT_INC_NN;
 380
 381     SvREFCNT(sv)++;
 382     return sv;
 383 }
 384 PERL_STATIC_INLINE void
 385 Perl_SvREFCNT_inc_void(SV *sv)
 386 {
 387     if (LIKELY(sv != NULL))
 388         SvREFCNT(sv)++;
 389 }
 390 PERL_STATIC_INLINE void
 391 Perl_SvREFCNT_dec(pTHX_ SV *sv)
 392 {
 393     if (LIKELY(sv != NULL)) {
 394         U32 rc = SvREFCNT(sv);
 395         if (LIKELY(rc > 1))
 396             SvREFCNT(sv) = rc - 1;
 397         else
 398             Perl_sv_free2(aTHX_ sv, rc);
 399     }
 400 }
 401
 402 PERL_STATIC_INLINE void
 403 Perl_SvREFCNT_dec_NN(pTHX_ SV *sv)
 404 {
 405     U32 rc = SvREFCNT(sv);
 406
 407     PERL_ARGS_ASSERT_SVREFCNT_DEC_NN;
 408
 409     if (LIKELY(rc > 1))
 410         SvREFCNT(sv) = rc - 1;
 411     else
 412         Perl_sv_free2(aTHX_ sv, rc);
 413 }
 414
 415 PERL_STATIC_INLINE void
 416 Perl_SvAMAGIC_on(SV *sv)
 417 {
 418     PERL_ARGS_ASSERT_SVAMAGIC_ON;
 419     assert(SvROK(sv));
 420
 421     if (SvOBJECT(SvRV(sv))) HvAMAGIC_on(SvSTASH(SvRV(sv)));
 422 }
 423 PERL_STATIC_INLINE void
 424 Perl_SvAMAGIC_off(SV *sv)
 425 {
 426     PERL_ARGS_ASSERT_SVAMAGIC_OFF;
 427
 428     if (SvROK(sv) && SvOBJECT(SvRV(sv)))
 429         HvAMAGIC_off(SvSTASH(SvRV(sv)));
 430 }
 431
 432 PERL_STATIC_INLINE U32
 433 Perl_SvPADSTALE_on(SV *sv)
 434 {
 435     assert(!(SvFLAGS(sv) & SVs_PADTMP));
 436     return SvFLAGS(sv) |= SVs_PADSTALE;
 437 }
 438 PERL_STATIC_INLINE U32
 439 Perl_SvPADSTALE_off(SV *sv)
 440 {
 441     assert(!(SvFLAGS(sv) & SVs_PADTMP));
 442     return SvFLAGS(sv) &= ~SVs_PADSTALE;
 443 }
 444 #if defined(PERL_CORE) || defined (PERL_EXT)
 445 PERL_STATIC_INLINE STRLEN
 446 S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp)
 447 {
 448     PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B;
 449     if (SvGAMAGIC(sv)) {
 450         U8 *hopped = utf8_hop((U8 *)pv, pos);
 451         if (lenp) *lenp = (STRLEN)(utf8_hop(hopped, *lenp) - hopped);
 452         return (STRLEN)(hopped - (U8 *)pv);
 453     }
 454     return sv_pos_u2b_flags(sv,pos,lenp,SV_CONST_RETURN);
 455 }
 456 #endif
 457
 458 /* ------------------------------- utf8.h ------------------------------- */
 459
 460 /*
 461 =for apidoc_section $unicode
 462 */
 463
 464 PERL_STATIC_INLINE void
 465 Perl_append_utf8_from_native_byte(const U8 byte, U8** dest)
 466 {
 467     /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
 468      * encoded string at '*dest', updating '*dest' to include it */
 469
 470     PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
 471
 472     if (NATIVE_BYTE_IS_INVARIANT(byte))
 473         *((*dest)++) = byte;
 474     else {
 475         *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
 476         *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
 477     }
 478 }
 479
 480 /*
 481 =for apidoc valid_utf8_to_uvchr
 482 Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is
 483 known that the next character in the input UTF-8 string C<s> is well-formed
  484 (I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>).  Surrogates, non-character code
 485 points, and non-Unicode code points are allowed.
 486
 487 =cut
 488
 489  */
 490
 491 PERL_STATIC_INLINE UV
 492 Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
 493 {
 494     const UV expectlen = UTF8SKIP(s);
 495     const U8* send = s + expectlen;
 496     UV uv = *s;
 497
 498     PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
 499
 500     if (retlen) {
 501         *retlen = expectlen;
 502     }
 503
 504     /* An invariant is trivially returned */
 505     if (expectlen == 1) {
 506         return uv;
 507     }
 508
 509     /* Remove the leading bits that indicate the number of bytes, leaving just
 510      * the bits that are part of the value */
 511     uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
 512
 513     /* Now, loop through the remaining bytes, accumulating each into the
 514      * working total as we go.  (I khw tried unrolling the loop for up to 4
 515      * bytes, but there was no performance improvement) */
 516     for (++s; s < send; s++) {
 517         uv = UTF8_ACCUMULATE(uv, *s);
 518     }
 519
 520     return UNI_TO_NATIVE(uv);
 521
 522 }
 523
 524 /*
 525 =for apidoc is_utf8_invariant_string
 526
 527 Returns TRUE if the first C<len> bytes of the string C<s> are the same
 528 regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
 529 EBCDIC machines); otherwise it returns FALSE.  That is, it returns TRUE if they
 530 are UTF-8 invariant.  On ASCII-ish machines, all the ASCII characters and only
 531 the ASCII characters fit this definition.  On EBCDIC machines, the ASCII-range
 532 characters are invariant, but so also are the C1 controls.
 533
 534 If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
 535 use this option, that C<s> can't have embedded C<NUL> characters and has to
 536 have a terminating C<NUL> byte).
 537
 538 See also
 539 C<L</is_utf8_string>>,
 540 C<L</is_utf8_string_flags>>,
 541 C<L</is_utf8_string_loc>>,
 542 C<L</is_utf8_string_loc_flags>>,
 543 C<L</is_utf8_string_loclen>>,
 544 C<L</is_utf8_string_loclen_flags>>,
 545 C<L</is_utf8_fixed_width_buf_flags>>,
 546 C<L</is_utf8_fixed_width_buf_loc_flags>>,
 547 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
 548 C<L</is_strict_utf8_string>>,
 549 C<L</is_strict_utf8_string_loc>>,
 550 C<L</is_strict_utf8_string_loclen>>,
 551 C<L</is_c9strict_utf8_string>>,
 552 C<L</is_c9strict_utf8_string_loc>>,
 553 and
 554 C<L</is_c9strict_utf8_string_loclen>>.
 555
 556 =cut
 557
 558 */
 559
/* Convenience form of is_utf8_invariant_string_loc() for callers that do not
 * need to know where the first variant is: NULL is passed for 'ep'. */
#define is_utf8_invariant_string(s, len)                                    \
                                is_utf8_invariant_string_loc(s, len, NULL)
 562
 563 /*
 564 =for apidoc is_utf8_invariant_string_loc
 565
 566 Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
 567 the first UTF-8 variant character in the C<ep> pointer; if all characters are
 568 UTF-8 invariant, this function does not change the contents of C<*ep>.
 569
 570 =cut
 571
 572 */
 573
PERL_STATIC_INLINE bool
Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
{
    /* Scans the 'len' bytes starting at 's' (or, if 'len' is 0, the bytes up
     * to the terminating NUL) for a UTF-8 variant.  Returns TRUE if there is
     * none.  Otherwise returns FALSE and, when 'ep' is non-NULL, sets '*ep'
     * to point to the first variant byte. */

    const U8* send;
    const U8* x = s;

    PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;

    if (len == 0) {
        len = strlen((const char *)s);
    }

    send = s + len;

/* The macros below are defined here, in the middle of the function, but being
 * preprocessor symbols they stay defined for the rest of the compilation
 * unit. */

/* This looks like 0x010101... */
#  define PERL_COUNT_MULTIPLIER   (~ (UINTMAX_C(0)) / 0xFF)

/* This looks like 0x808080... */
#  define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
#  define PERL_WORDSIZE            sizeof(PERL_UINTMAX_T)
#  define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)

/* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
 * or'ing together the lowest bits of 'x'.  Hopefully the final term gets
 * optimized out completely on a 32-bit system, and its mask gets optimized out
 * on a 64-bit system */
#  define PERL_IS_SUBWORD_ADDR(x) (1 & (       PTR2nat(x)                     \
                                      |   (  PTR2nat(x) >> 1)                 \
                                      | ( ( (PTR2nat(x)                       \
                                           & PERL_WORD_BOUNDARY_MASK) >> 2))))

#ifndef EBCDIC

    /* Do the word-at-a-time iff there is at least one usable full word.  That
     * means that after advancing to a word boundary, there still is at least a
     * full word left.  The number of bytes needed to advance is 'wordsize -
     * offset' unless offset is 0. */
    if ((STRLEN) (send - x) >= PERL_WORDSIZE

                            /* This term is wordsize if subword; 0 if not */
                          + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)

                            /* 'offset' */
                          - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
    {

        /* Process per-byte until reach word boundary.  XXX This loop could be
         * eliminated if we knew that this platform had fast unaligned reads */
        while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
            if (! UTF8_IS_INVARIANT(*x)) {
                if (ep) {
                    *ep = x;
                }

                return FALSE;
            }
            x++;
        }

        /* Here, we know we have at least one full word to process.  Process
         * per-word as long as we have at least a full word left */
        do {
            /* The mask has the high bit of every byte set, so a non-zero
             * result means at least one byte in this word is a variant */
            if ((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK)  {

                /* Found a variant.  Just return if caller doesn't want its
                 * exact position */
                if (! ep) {
                    return FALSE;
                }

#  if   BYTEORDER == 0x1234 || BYTEORDER == 0x12345678    \
     || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321

                /* variant_byte_number() gives the offset within the word of
                 * the variant to report */
                *ep = x + variant_byte_number(* (PERL_UINTMAX_T *) x);
                assert(*ep >= s && *ep < send);

                return FALSE;

#  else   /* If weird byte order, drop into next loop to do byte-at-a-time
           checks. */

                break;
#  endif
            }

            x += PERL_WORDSIZE;

        } while (x + PERL_WORDSIZE <= send);
    }

#endif      /* End of ! EBCDIC */

    /* Process per-byte.  This handles whatever the code above did not: the
     * whole input on EBCDIC or when it is too short for the word loop; the
     * bytes left over after the last full word; and, on a platform with an
     * unusual byte order, the word in which a variant was just detected. */
    while (x < send) {
        if (! UTF8_IS_INVARIANT(*x)) {
            if (ep) {
                *ep = x;
            }

            return FALSE;
        }

        x++;
    }

    return TRUE;
}
 681
/* See if the platform has builtins for finding the most/least significant bit,
 * and which one is right for using on 32 and 64 bit operands.
 *
 * PERL_CLZ_32, PERL_CLZ_64, PERL_CTZ_32 and PERL_CTZ_64 each end up defined as
 * the name of the builtin whose operand type has the same size as U32 or U64;
 * any for which no builtin fits is left undefined.  The 'int' builtins are
 * tried first, then the 'long' ones, then the 'long long' ones; the
 * '! defined' guards keep the first match. */

/* Operand type 'int' */
#if (__has_builtin(__builtin_clz) || PERL_GCC_VERSION_GE(3,4,0))
#  if U32SIZE == INTSIZE
#    define PERL_CLZ_32 __builtin_clz
#  endif
#  if defined(U64TYPE) && U64SIZE == INTSIZE
#    define PERL_CLZ_64 __builtin_clz
#  endif
#endif
#if (__has_builtin(__builtin_ctz) || PERL_GCC_VERSION_GE(3,4,0))
#  if U32SIZE == INTSIZE
#    define PERL_CTZ_32 __builtin_ctz
#  endif
#  if defined(U64TYPE) && U64SIZE == INTSIZE
#    define PERL_CTZ_64 __builtin_ctz
#  endif
#endif

/* Operand type 'long' */
#if (__has_builtin(__builtin_clzl) || PERL_GCC_VERSION_GE(3,4,0))
#  if U32SIZE == LONGSIZE && ! defined(PERL_CLZ_32)
#    define PERL_CLZ_32 __builtin_clzl
#  endif
#  if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CLZ_64)
#    define PERL_CLZ_64 __builtin_clzl
#  endif
#endif
#if (__has_builtin(__builtin_ctzl) || PERL_GCC_VERSION_GE(3,4,0))
#  if U32SIZE == LONGSIZE && ! defined(PERL_CTZ_32)
#    define PERL_CTZ_32 __builtin_ctzl
#  endif
#  if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CTZ_64)
#    define PERL_CTZ_64 __builtin_ctzl
#  endif
#endif

/* Operand type 'long long' */
#if (__has_builtin(__builtin_clzll) || PERL_GCC_VERSION_GE(3,4,0))
#  if U32SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_32)
#    define PERL_CLZ_32 __builtin_clzll
#  endif
#  if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_64)
#    define PERL_CLZ_64 __builtin_clzll
#  endif
#endif
#if (__has_builtin(__builtin_ctzll) || PERL_GCC_VERSION_GE(3,4,0))
#  if U32SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_32)
#    define PERL_CTZ_32 __builtin_ctzll
#  endif
#  if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_64)
#    define PERL_CTZ_64 __builtin_ctzll
#  endif
#endif

/* Under MSVC, ask for the bit-scan routines to be treated as intrinsics; the
 * 64-bit forms are requested only when _WIN64 is defined */
#if defined(_MSC_VER)
#  include <intrin.h>
#  pragma intrinsic(_BitScanForward)
#  pragma intrinsic(_BitScanReverse)
#  ifdef _WIN64
#    pragma intrinsic(_BitScanForward64)
#    pragma intrinsic(_BitScanReverse64)
#  endif
#endif
 744
 745 /* The reason there are not checks to see if ffs() and ffsl() are available for
 746  * determining the lsb, is because these don't improve on the deBruijn method
 747  * fallback, which is just a branchless integer multiply, array element
 748  * retrieval, and shift.  The others, even if the function call overhead is
 749  * optimized out, have to cope with the possibility of the input being all
 750  * zeroes, and almost certainly will have conditionals for this eventuality.
 751  * khw, at the time of this commit, looked at the source for both gcc and clang
 752  * to verify this.  (gcc used a method inferior to deBruijn.) */
 753
 754 /* Below are functions to find the first, last, or only set bit in a word.  On
 755  * platforms with 64-bit capability, there is a pair for each operation; the
 756  * first taking a 64 bit operand, and the second a 32 bit one.  The logic is
 757  * the same in each pair, so the second is stripped of most comments. */
 758
#ifdef U64TYPE  /* HAS_QUAD not usable outside the core */

PERL_STATIC_INLINE unsigned
Perl_lsbit_pos64(U64 word)
{
    /* Find the position (0..63) of the least significant set bit in the input
     * word */

    /* The input must have at least one bit set */
    ASSUME(word != 0);

    /* If we can determine that the platform has a usable fast method to get
     * this info, use that.  PERL_HAS_FAST_GET_LSB_POS64 records that such a
     * method was found, so that single_1bit_pos64() knows it can call this
     * function without this function calling it back. */

#  if defined(PERL_CTZ_64)
#    define PERL_HAS_FAST_GET_LSB_POS64

    return (unsigned) PERL_CTZ_64(word);

#  elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER)
#    define PERL_HAS_FAST_GET_LSB_POS64

    {
        unsigned long index;
        _BitScanForward64(&index, word);
        return (unsigned)index;
    }

#  else

    /* Here, we didn't find a fast method for finding the lsb.  Fall back to
     * making the lsb the only set bit in the word, and use our function that
     * works on words with a single bit set.
     *
     * Isolate the lsb;
     * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
     *
     * The word will look like this, with a rightmost set bit in position 's':
     * ('x's are don't cares, and 'y's are their complements)
     *      s
     *  x..x100..00
     *  y..y011..11      Complement
     *  y..y100..00      Add 1
     *  0..0100..00      And with the original
     *
     *  (Yes, complementing and adding 1 is just taking the negative on 2's
     *  complement machines, but not on 1's complement ones, and some compilers
     *  complain about negating an unsigned.)
     */
    return single_1bit_pos64(word & (~word + 1));

#  endif

}

/* lsbit_pos_uintmax_() resolves to the 64-bit function when U64TYPE is
 * defined, and to the 32-bit one otherwise */
#  define lsbit_pos_uintmax_(word) lsbit_pos64(word)
#else   /* ! QUAD */
#  define lsbit_pos_uintmax_(word) lsbit_pos32(word)
#endif
 817
PERL_STATIC_INLINE unsigned     /* Like above for 32 bit word */
Perl_lsbit_pos32(U32 word)
{
    /* Find the position (0..31) of the least significant set bit in the input
     * word */

    /* The input must have at least one bit set */
    ASSUME(word != 0);

    /* Use a fast platform method if one was found; PERL_HAS_FAST_GET_LSB_POS32
     * records that this was the case */

#if defined(PERL_CTZ_32)
#  define PERL_HAS_FAST_GET_LSB_POS32

    return (unsigned) PERL_CTZ_32(word);

#elif U32SIZE == 4 && defined(_MSC_VER)
#  define PERL_HAS_FAST_GET_LSB_POS32

    {
        unsigned long index;
        _BitScanForward(&index, word);
        return (unsigned)index;
    }

#else

    /* No fast method: 'word & (~word + 1)' leaves only the least significant
     * set bit, and the single-bit function then finds its position */
    return single_1bit_pos32(word & (~word + 1));

#endif

}
 847
 848
/* Convert a leading-zeros count, 'lzc', into the bit position of the most
 * significant set bit.  Arithmetically this is just a subtraction from the
 * highest position, 31 or 63.  But some compilers don't generate the best code
 * for that, and so a bit of bit twiddling encourages them to do the right
 * thing.  It turns out that subtracting a smaller non-negative number 'x' from
 * 2**n-1 for any n is the same as taking the exclusive-or of the two numbers.
 * To see why, first note that the sum of any number, x, and its complement,
 * x', is all ones.  So all ones minus x is x'.  Then note that the xor of x
 * and all ones is x'.
 *
 * 'size' is a type name such as U32 or U64; 'SIZE' is pasted onto it to form
 * the name of the macro that gives that type's size. */
#define LZC_TO_MSBIT_POS_(size, lzc)  ((size##SIZE * CHARBITS - 1) ^ (lzc))
 858
#ifdef U64TYPE  /* HAS_QUAD not usable outside the core */

PERL_STATIC_INLINE unsigned
Perl_msbit_pos64(U64 word)
{
    /* Find the position (0..63) of the most significant set bit in the input
     * word */

    /* The input must have at least one bit set */
    ASSUME(word != 0);

    /* If we can determine that the platform has a usable fast method to get
     * this, use that.  PERL_HAS_FAST_GET_MSB_POS64 records that such a method
     * was found, so that single_1bit_pos64() knows it can call this function
     * without this function calling it back. */

#  if defined(PERL_CLZ_64)
#    define PERL_HAS_FAST_GET_MSB_POS64

    /* The builtin counts leading zeros; convert that to a bit position */
    return (unsigned) LZC_TO_MSBIT_POS_(U64, PERL_CLZ_64(word));

#  elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER)
#    define PERL_HAS_FAST_GET_MSB_POS64

    {
        unsigned long index;
        _BitScanReverse64(&index, word);
        return (unsigned)index;
    }

#  else

    /* Here, we didn't find a fast method for finding the msb.  Fall back to
     * making the msb the only set bit in the word, and use our function that
     * works on words with a single bit set.
     *
     * Isolate the msb; http://codeforces.com/blog/entry/10330
     *
     * Only the most significant set bit matters.  Or'ing word with its right
     * shift of 1 makes that bit and the next one to its right both 1.
     * Repeating that with the right shift of 2 makes for 4 1-bits in a row.
     * ...  We end with the msb and all to the right being 1. */
    word |= (word >>  1);
    word |= (word >>  2);
    word |= (word >>  4);
    word |= (word >>  8);
    word |= (word >> 16);
    word |= (word >> 32);

    /* Then subtracting the right shift by 1 clears all but the left-most of
     * the 1 bits, which is our desired result */
    word -= (word >> 1);

    /* Now we have a single bit set */
    return single_1bit_pos64(word);

#  endif

}

/* msbit_pos_uintmax_() resolves to the 64-bit function when U64TYPE is
 * defined, and to the 32-bit one otherwise */
#  define msbit_pos_uintmax_(word) msbit_pos64(word)
#else   /* ! QUAD */
#  define msbit_pos_uintmax_(word) msbit_pos32(word)
#endif
 920
PERL_STATIC_INLINE unsigned
Perl_msbit_pos32(U32 word)
{
    /* Find the position (0..31) of the most significant set bit in the input
     * word */

    /* The input must have at least one bit set */
    ASSUME(word != 0);

    /* Use a fast platform method if one was found; PERL_HAS_FAST_GET_MSB_POS32
     * records that this was the case */

#if defined(PERL_CLZ_32)
#  define PERL_HAS_FAST_GET_MSB_POS32

    /* The builtin counts leading zeros; convert that to a bit position */
    return (unsigned) LZC_TO_MSBIT_POS_(U32, PERL_CLZ_32(word));

#elif U32SIZE == 4 && defined(_MSC_VER)
#  define PERL_HAS_FAST_GET_MSB_POS32

    {
        unsigned long index;
        _BitScanReverse(&index, word);
        return (unsigned)index;
    }

#else

    /* No fast method.  Smear the most significant set bit into every
     * position to its right ... */
    word |= (word >>  1);
    word |= (word >>  2);
    word |= (word >>  4);
    word |= (word >>  8);
    word |= (word >> 16);

    /* ... then clear all but the left-most of those 1 bits, and let the
     * single-bit function find where it is */
    word -= (word >> 1);
    return single_1bit_pos32(word);

#endif

}
 956
/* Map the width-neutral names onto the functions whose operand width matches
 * a UV.  Should UVSIZE equal neither U64SIZE nor U32SIZE, neither name gets
 * defined here. */
#if UVSIZE == U64SIZE
#  define msbit_pos(word)  msbit_pos64(word)
#  define lsbit_pos(word)  lsbit_pos64(word)
#elif UVSIZE == U32SIZE
#  define msbit_pos(word)  msbit_pos32(word)
#  define lsbit_pos(word)  lsbit_pos32(word)
#endif
 964
 965 #ifdef U64TYPE  /* HAS_QUAD not usable outside the core */
 966
 967 PERL_STATIC_INLINE unsigned
 968 Perl_single_1bit_pos64(U64 word)
 969 {
 970     /* Given a 64-bit word known to contain all zero bits except one 1 bit,
 971      * find and return the 1's position: 0..63 */
 972
 973 #  ifdef PERL_CORE    /* macro not exported */
 974     ASSUME(isPOWER_OF_2(word));
 975 #  else
 976     ASSUME(word && (word & (word-1)) == 0);
 977 #  endif
 978
 979     /* The only set bit is both the most and least significant bit.  If we have
 980      * a fast way of finding either one, use that.
 981      *
 982      * It may appear at first glance that those functions call this one, but
 983      * they don't if the corresponding #define is set */
 984
 985 #  ifdef PERL_HAS_FAST_GET_MSB_POS64
 986
 987     return msbit_pos64(word);
 988
 989 #  elif defined(PERL_HAS_FAST_GET_LSB_POS64)
 990
 991     return lsbit_pos64(word);
 992
 993 #  else
 994
 995     /* The position of the only set bit in a word can be quickly calculated
 996      * using deBruijn sequences.  See for example
 997      * https://en.wikipedia.org/wiki/De_Bruijn_sequence */
 998     return PL_deBruijn_bitpos_tab64[(word * PERL_deBruijnMagic64_)
 999                                                     >> PERL_deBruijnShift64_];
1000 #  endif
1001
1002 }
1003
1004 #endif
1005
1006 PERL_STATIC_INLINE unsigned
1007 Perl_single_1bit_pos32(U32 word)
1008 {
1009     /* Given a 32-bit word known to contain all zero bits except one 1 bit,
1010      * find and return the 1's position: 0..31 */
1011
1012 #ifdef PERL_CORE    /* macro not exported */
1013     ASSUME(isPOWER_OF_2(word));
1014 #else
1015     ASSUME(word && (word & (word-1)) == 0);
1016 #endif
1017 #ifdef PERL_HAS_FAST_GET_MSB_POS32
1018
1019     return msbit_pos32(word);
1020
1021 #elif defined(PERL_HAS_FAST_GET_LSB_POS32)
1022
1023     return lsbit_pos32(word);
1024
1025 /* Unlikely, but possible for the platform to have a wider fast operation but
1026  * not a narrower one.  But easy enough to handle the case by widening the
1027  * parameter size.  (Going the other way, emulating 64 bit by two 32 bit ops
1028  * would be slower than the deBruijn method.) */
1029 #elif defined(PERL_HAS_FAST_GET_MSB_POS64)
1030
1031     return msbit_pos64(word);
1032
1033 #elif defined(PERL_HAS_FAST_GET_LSB_POS64)
1034
1035     return lsbit_pos64(word);
1036
1037 #else
1038
1039     return PL_deBruijn_bitpos_tab32[(word * PERL_deBruijnMagic32_)
1040                                                     >> PERL_deBruijnShift32_];
1041 #endif
1042
1043 }
1044
1045 #ifndef EBCDIC
1046
1047 PERL_STATIC_INLINE unsigned int
1048 Perl_variant_byte_number(PERL_UINTMAX_T word)
1049 {
1050     /* This returns the position in a word (0..7) of the first variant byte in
1051      * it.  This is a helper function.  Note that there are no branches */
1052
1053     /* Get just the msb bits of each byte */
1054     word &= PERL_VARIANTS_WORD_MASK;
1055
1056     /* This should only be called if we know there is a variant byte in the
1057      * word */
1058     assert(word);
1059
1060 #  if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
1061
1062     /* Bytes are stored like
1063      *  Byte8 ... Byte2 Byte1
1064      *  63..56...15...8 7...0
1065      * so getting the lsb of the whole modified word is getting the msb of the
1066      * first byte that has its msb set */
1067     word = lsbit_pos_uintmax_(word);
1068
1069     /* Here, word contains the position 7,15,23,...55,63 of that bit.  Convert
1070      * to 0..7 */
1071     return (unsigned int) ((word + 1) >> 3) - 1;
1072
1073 #  elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
1074
1075     /* Bytes are stored like
1076      *  Byte1 Byte2  ... Byte8
1077      * 63..56 55..47 ... 7...0
1078      * so getting the msb of the whole modified word is getting the msb of the
1079      * first byte that has its msb set */
1080     word = msbit_pos_uintmax_(word);
1081
1082     /* Here, word contains the position 63,55,...,23,15,7 of that bit.  Convert
1083      * to 0..7 */
1084     word = ((word + 1) >> 3) - 1;
1085
1086     /* And invert the result because of the reversed byte order on this
1087      * platform */
1088     word = CHARBITS - word - 1;
1089
1090     return (unsigned int) word;
1091
1092 #  else
1093 #    error Unexpected byte order
1094 #  endif
1095
1096 }
1097
1098 #endif
1099 #if defined(PERL_CORE) || defined(PERL_EXT)
1100
1101 /*
1102 =for apidoc variant_under_utf8_count
1103
1104 This function looks at the sequence of bytes between C<s> and C<e>, which are
1105 assumed to be encoded in ASCII/Latin1, and returns how many of them would
1106 change should the string be translated into UTF-8.  Due to the nature of UTF-8,
1107 each of these would occupy two bytes instead of the single one in the input
1108 string.  Thus, this function returns the precise number of bytes the string
1109 would expand by when translated to UTF-8.
1110
1111 Unlike most of the other functions that have C<utf8> in their name, the input
1112 to this function is NOT a UTF-8-encoded string.  The function name is slightly
1113 I<odd> to emphasize this.
1114
1115 This function is internal to Perl because khw thinks that any XS code that
1116 would want this is probably operating too close to the internals.  Presenting a
1117 valid use case could change that.
1118
1119 See also
1120 C<L<perlapi/is_utf8_invariant_string>>
1121 and
C<L<perlapi/is_utf8_invariant_string_loc>>.
1123
1124 =cut
1125
1126 */
1127
1128 PERL_STATIC_INLINE Size_t
1129 S_variant_under_utf8_count(const U8* const s, const U8* const e)
1130 {
1131     const U8* x = s;
1132     Size_t count = 0;
1133
1134     PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
1135
1136 #  ifndef EBCDIC
1137
1138     /* Test if the string is long enough to use word-at-a-time.  (Logic is the
1139      * same as for is_utf8_invariant_string()) */
1140     if ((STRLEN) (e - x) >= PERL_WORDSIZE
1141                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
1142                           - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
1143     {
1144
1145         /* Process per-byte until reach word boundary.  XXX This loop could be
1146          * eliminated if we knew that this platform had fast unaligned reads */
1147         while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
1148             count += ! UTF8_IS_INVARIANT(*x++);
1149         }
1150
1151         /* Process per-word as long as we have at least a full word left */
1152         do {    /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
1153                    explanation of how this works */
1154             PERL_UINTMAX_T increment
1155                 = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
1156                       * PERL_COUNT_MULTIPLIER)
1157                     >> ((PERL_WORDSIZE - 1) * CHARBITS);
1158             count += (Size_t) increment;
1159             x += PERL_WORDSIZE;
1160         } while (x + PERL_WORDSIZE <= e);
1161     }
1162
1163 #  endif
1164
1165     /* Process per-byte */
1166     while (x < e) {
1167         if (! UTF8_IS_INVARIANT(*x)) {
1168             count++;
1169         }
1170
1171         x++;
1172     }
1173
1174     return count;
1175 }
1176
1177 #endif
1178
1179 #ifndef PERL_IN_REGEXEC_C   /* Keep  these around for that file */
1180 #  undef PERL_WORDSIZE
1181 #  undef PERL_COUNT_MULTIPLIER
1182 #  undef PERL_WORD_BOUNDARY_MASK
1183 #  undef PERL_VARIANTS_WORD_MASK
1184 #endif
1185
1186 /*
1187 =for apidoc is_utf8_string
1188
1189 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1190 Perl-extended-UTF-8 string; returns FALSE otherwise.  If C<len> is 0, it will
1191 be calculated using C<strlen(s)> (which means if you use this option, that C<s>
1192 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
1193 byte).  Note that all characters being ASCII constitute 'a valid UTF-8 string'.
1194
1195 This function considers Perl's extended UTF-8 to be valid.  That means that
1196 code points above Unicode, surrogates, and non-character code points are
1197 considered valid by this function.  Use C<L</is_strict_utf8_string>>,
1198 C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
1199 code points are considered valid.
1200
1201 See also
1202 C<L</is_utf8_invariant_string>>,
1203 C<L</is_utf8_invariant_string_loc>>,
1204 C<L</is_utf8_string_loc>>,
1205 C<L</is_utf8_string_loclen>>,
1206 C<L</is_utf8_fixed_width_buf_flags>>,
1207 C<L</is_utf8_fixed_width_buf_loc_flags>>,
C<L</is_utf8_fixed_width_buf_loclen_flags>>.
1209
1210 =cut
1211 */
1212
/* Just the validity check: NULL is passed for both of the output parameters
 * of is_utf8_string_loclen() */
#define is_utf8_string(s, len)  is_utf8_string_loclen(s, len, NULL, NULL)
1214
1215 #if defined(PERL_CORE) || defined (PERL_EXT)
1216
1217 /*
1218 =for apidoc is_utf8_non_invariant_string
1219
1220 Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
1221 C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
1222 UTF-8; otherwise returns FALSE.
1223
1224 A TRUE return means that at least one code point represented by the sequence
1225 either is a wide character not representable as a single byte, or the
1226 representation differs depending on whether the sequence is encoded in UTF-8 or
1227 not.
1228
1229 See also
1230 C<L<perlapi/is_utf8_invariant_string>>,
1231 C<L<perlapi/is_utf8_string>>
1232
1233 =cut
1234
1235 This is commonly used to determine if a SV's UTF-8 flag should be turned on.
1236 It generally needn't be if its string is entirely UTF-8 invariant, and it
1237 shouldn't be if it otherwise contains invalid UTF-8.
1238
1239 It is an internal function because khw thinks that XS code shouldn't be working
1240 at this low a level.  A valid use case could change that.
1241
1242 */
1243
1244 PERL_STATIC_INLINE bool
1245 Perl_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
1246 {
1247     const U8 * first_variant;
1248
1249     PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
1250
1251     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1252         return FALSE;
1253     }
1254
1255     return is_utf8_string(first_variant, len - (first_variant - s));
1256 }
1257
1258 #endif
1259
1260 /*
1261 =for apidoc is_strict_utf8_string
1262
1263 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1264 UTF-8-encoded string that is fully interchangeable by any application using
1265 Unicode rules; otherwise it returns FALSE.  If C<len> is 0, it will be
1266 calculated using C<strlen(s)> (which means if you use this option, that C<s>
1267 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
1268 byte).  Note that all characters being ASCII constitute 'a valid UTF-8 string'.
1269
1270 This function returns FALSE for strings containing any
1271 code points above the Unicode max of 0x10FFFF, surrogate code points, or
1272 non-character code points.
1273
1274 See also
1275 C<L</is_utf8_invariant_string>>,
1276 C<L</is_utf8_invariant_string_loc>>,
1277 C<L</is_utf8_string>>,
1278 C<L</is_utf8_string_flags>>,
1279 C<L</is_utf8_string_loc>>,
1280 C<L</is_utf8_string_loc_flags>>,
1281 C<L</is_utf8_string_loclen>>,
1282 C<L</is_utf8_string_loclen_flags>>,
1283 C<L</is_utf8_fixed_width_buf_flags>>,
1284 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1285 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1286 C<L</is_strict_utf8_string_loc>>,
1287 C<L</is_strict_utf8_string_loclen>>,
1288 C<L</is_c9strict_utf8_string>>,
1289 C<L</is_c9strict_utf8_string_loc>>,
1290 and
1291 C<L</is_c9strict_utf8_string_loclen>>.
1292
1293 =cut
1294 */
1295
/* Just the validity check: neither output parameter of the _loclen form is
 * wanted */
#define is_strict_utf8_string(s, len)                                       \
                        is_strict_utf8_string_loclen(s, len, NULL, NULL)
1297
1298 /*
1299 =for apidoc is_c9strict_utf8_string
1300
1301 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1302 UTF-8-encoded string that conforms to
1303 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
1304 otherwise it returns FALSE.  If C<len> is 0, it will be calculated using
1305 C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
1306 C<NUL> characters and has to have a terminating C<NUL> byte).  Note that all
1307 characters being ASCII constitute 'a valid UTF-8 string'.
1308
1309 This function returns FALSE for strings containing any code points above the
1310 Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
1311 code points per
1312 L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1313
1314 See also
1315 C<L</is_utf8_invariant_string>>,
1316 C<L</is_utf8_invariant_string_loc>>,
1317 C<L</is_utf8_string>>,
1318 C<L</is_utf8_string_flags>>,
1319 C<L</is_utf8_string_loc>>,
1320 C<L</is_utf8_string_loc_flags>>,
1321 C<L</is_utf8_string_loclen>>,
1322 C<L</is_utf8_string_loclen_flags>>,
1323 C<L</is_utf8_fixed_width_buf_flags>>,
1324 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1325 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1326 C<L</is_strict_utf8_string>>,
1327 C<L</is_strict_utf8_string_loc>>,
1328 C<L</is_strict_utf8_string_loclen>>,
1329 C<L</is_c9strict_utf8_string_loc>>,
1330 and
1331 C<L</is_c9strict_utf8_string_loclen>>.
1332
1333 =cut
1334 */
1335
/* Just the validity check: neither output parameter of the _loclen form is
 * wanted.  NULL, rather than a bare 0, is passed for the final one, the same
 * as is_utf8_string() and is_strict_utf8_string() do. */
#define is_c9strict_utf8_string(s, len)                                     \
                        is_c9strict_utf8_string_loclen(s, len, NULL, NULL)
1337
1338 /*
1339 =for apidoc is_utf8_string_flags
1340
1341 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1342 UTF-8 string, subject to the restrictions imposed by C<flags>;
1343 returns FALSE otherwise.  If C<len> is 0, it will be calculated
1344 using C<strlen(s)> (which means if you use this option, that C<s> can't have
1345 embedded C<NUL> characters and has to have a terminating C<NUL> byte).  Note
1346 that all characters being ASCII constitute 'a valid UTF-8 string'.
1347
1348 If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
1349 C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
1350 as C<L</is_strict_utf8_string>>; and if C<flags> is
1351 C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
1352 C<L</is_c9strict_utf8_string>>.  Otherwise C<flags> may be any
1353 combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
1354 C<L</utf8n_to_uvchr>>, with the same meanings.
1355
1356 See also
1357 C<L</is_utf8_invariant_string>>,
1358 C<L</is_utf8_invariant_string_loc>>,
1359 C<L</is_utf8_string>>,
1360 C<L</is_utf8_string_loc>>,
1361 C<L</is_utf8_string_loc_flags>>,
1362 C<L</is_utf8_string_loclen>>,
1363 C<L</is_utf8_string_loclen_flags>>,
1364 C<L</is_utf8_fixed_width_buf_flags>>,
1365 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1366 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1367 C<L</is_strict_utf8_string>>,
1368 C<L</is_strict_utf8_string_loc>>,
1369 C<L</is_strict_utf8_string_loclen>>,
1370 C<L</is_c9strict_utf8_string>>,
1371 C<L</is_c9strict_utf8_string_loc>>,
1372 and
1373 C<L</is_c9strict_utf8_string_loclen>>.
1374
1375 =cut
1376 */
1377
1378 PERL_STATIC_INLINE bool
1379 Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
1380 {
1381     const U8 * first_variant;
1382
1383     PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
1384     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1385                           |UTF8_DISALLOW_PERL_EXTENDED)));
1386
1387     if (len == 0) {
1388         len = strlen((const char *)s);
1389     }
1390
1391     if (flags == 0) {
1392         return is_utf8_string(s, len);
1393     }
1394
1395     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1396                                         == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1397     {
1398         return is_strict_utf8_string(s, len);
1399     }
1400
1401     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1402                                        == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1403     {
1404         return is_c9strict_utf8_string(s, len);
1405     }
1406
1407     if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
1408         const U8* const send = s + len;
1409         const U8* x = first_variant;
1410
1411         while (x < send) {
1412             STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1413             if (UNLIKELY(! cur_len)) {
1414                 return FALSE;
1415             }
1416             x += cur_len;
1417         }
1418     }
1419
1420     return TRUE;
1421 }
1422
1423 /*
1424
1425 =for apidoc is_utf8_string_loc
1426
1427 Like C<L</is_utf8_string>> but stores the location of the failure (in the
1428 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1429 "utf8ness success") in the C<ep> pointer.
1430
1431 See also C<L</is_utf8_string_loclen>>.
1432
1433 =cut
1434 */
1435
/* The character count isn't wanted, so pass a null pointer for the 'el'
 * parameter (a 'STRLEN *') of is_utf8_string_loclen() */
#define is_utf8_string_loc(s, len, ep)  is_utf8_string_loclen(s, len, ep, NULL)
1437
1438 /*
1439
1440 =for apidoc is_utf8_string_loclen
1441
1442 Like C<L</is_utf8_string>> but stores the location of the failure (in the
1443 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1444 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1445 encoded characters in the C<el> pointer.
1446
1447 See also C<L</is_utf8_string_loc>>.
1448
1449 =cut
1450 */
1451
1452 PERL_STATIC_INLINE bool
1453 Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1454 {
1455     const U8 * first_variant;
1456
1457     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
1458
1459     if (len == 0) {
1460         len = strlen((const char *) s);
1461     }
1462
1463     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1464         if (el)
1465             *el = len;
1466
1467         if (ep) {
1468             *ep = s + len;
1469         }
1470
1471         return TRUE;
1472     }
1473
1474     {
1475         const U8* const send = s + len;
1476         const U8* x = first_variant;
1477         STRLEN outlen = first_variant - s;
1478
1479         while (x < send) {
1480             const STRLEN cur_len = isUTF8_CHAR(x, send);
1481             if (UNLIKELY(! cur_len)) {
1482                 break;
1483             }
1484             x += cur_len;
1485             outlen++;
1486         }
1487
1488         if (el)
1489             *el = outlen;
1490
1491         if (ep) {
1492             *ep = x;
1493         }
1494
1495         return (x == send);
1496     }
1497 }
1498
1499 /* The perl core arranges to never call the DFA below without there being at
1500  * least one byte available to look at.  This allows the DFA to use a do {}
1501  * while loop which means that calling it with a UTF-8 invariant has a single
1502  * conditional, same as the calling code checking for invariance ahead of time.
1503  * And having the calling code remove that conditional speeds up by that
1504  * conditional, the case where it wasn't invariant.  So there's no reason to
 * check before calling this.
1506  *
1507  * But we don't know this for non-core calls, so have to retain the check for
1508  * them. */
/* In the core, an empty input is only asserted against.  Elsewhere, this
 * makes the function it is expanded in return FALSE when 'e' is not beyond
 * 's'. */
#ifdef PERL_CORE
#  define PERL_NON_CORE_CHECK_EMPTY(s,e)  assert((e) > (s))
#else
#  define PERL_NON_CORE_CHECK_EMPTY(s,e)  if ((e) <= (s)) return FALSE
#endif
1514
1515 /*
1516  * DFA for checking input is valid UTF-8 syntax.
1517  *
1518  * This uses adaptations of the table and algorithm given in
1519  * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1520  * documentation of the original version.  A copyright notice for the original
 * version is given at the beginning of this file.  The Perl adaptations are
1522  * documented at the definition of PL_extended_utf8_dfa_tab[].
1523  *
1524  * This dfa is fast.  There are three exit conditions:
1525  *  1) a well-formed code point, acceptable to the table
1526  *  2) the beginning bytes of an incomplete character, whose completion might
1527  *     or might not be acceptable
1528  *  3) unacceptable to the table.  Some of the adaptations have certain,
1529  *     hopefully less likely to occur, legal inputs be unacceptable to the
1530  *     table, so these must be sorted out afterwards.
1531  *
1532  * This macro is a complete implementation of the code executing the DFA.  It
1533  * is passed the input sequence bounds and the table to use, and what to do
1534  * for each of the exit conditions.  There are three canned actions, likely to
1535  * be the ones you want:
1536  *      DFA_RETURN_SUCCESS_
1537  *      DFA_RETURN_FAILURE_
 *      DFA_TEASE_APART_FF_
1539  *
1540  * You pass a parameter giving the action to take for each of the three
1541  * possible exit conditions:
1542  *
1543  * 'accept_action'  This is executed when the DFA accepts the input.
1544  *                  DFA_RETURN_SUCCESS_ is the most likely candidate.
1545  * 'reject_action'  This is executed when the DFA rejects the input.
1546  *                  DFA_RETURN_FAILURE_ is a candidate, or 'goto label' where
1547  *                  you have written code to distinguish the rejecting state
1548  *                  results.  Because it happens in several places, and
1549  *                  involves #ifdefs, the special action
 *                  DFA_TEASE_APART_FF_ is what you want with
1551  *                  PL_extended_utf8_dfa_tab.  On platforms without
1552  *                  EXTRA_LONG_UTF8, there is no need to tease anything apart,
1553  *                  so this evaluates to DFA_RETURN_FAILURE_; otherwise you
1554  *                  need to have a label 'tease_apart_FF' that it will transfer
1555  *                  to.
1556  * 'incomplete_char_action'  This is executed when the DFA ran off the end
1557  *                  before accepting or rejecting the input.
1558  *                  DFA_RETURN_FAILURE_ is the likely action, but you could
1559  *                  have a 'goto', or NOOP.  In the latter case the DFA drops
1560  *                  off the end, and you place your code to handle this case
1561  *                  immediately after it.
1562  */
1563
/* The canned actions.  These expand at the point of use, so 's' and 's0' in
 * DFA_RETURN_SUCCESS_ are whatever variables of those names are in scope
 * there: inside PERL_IS_UTF8_CHAR_DFA, 's' is that macro's own cursor, which
 * has already been advanced past the byte just processed, and 's0' has to
 * be a variable of the function the macro is used in. */
#define DFA_RETURN_SUCCESS_      return s - s0
#define DFA_RETURN_FAILURE_      return 0
#ifdef HAS_EXTRA_LONG_UTF8
   /* The function using this must have a 'tease_apart_FF' label */
#  define DFA_TEASE_APART_FF_  goto tease_apart_FF
#else
#  define DFA_TEASE_APART_FF_  DFA_RETURN_FAILURE_
#endif
1571
/* Run the DFA given by 'dfa_tab' over the bytes from 's0' up to, but not
 * including, 'e'.  Each of the three action parameters is a statement that is
 * expanded inside this macro's block, so it can refer to the local cursor 's',
 * which at that point has already been advanced past the byte just processed.
 * The 's0', 'e', and 'dfa_tab' parameters are parenthesized wherever they are
 * used, so that expressions may safely be passed for them. */
#define PERL_IS_UTF8_CHAR_DFA(s0, e, dfa_tab,                               \
                              accept_action,                                \
                              reject_action,                                \
                              incomplete_char_action)                       \
    STMT_START {                                                            \
        const U8 * s = (s0);                                                \
        UV state = 0;                                                       \
                                                                            \
        PERL_NON_CORE_CHECK_EMPTY(s,e);                                     \
                                                                            \
        do {                                                                \
            /* The first 256 entries are indexed by the input byte; the */  \
            /* entries after those by the current state plus the value  */  \
            /* found for the byte                                       */  \
            state = (dfa_tab)[256 + state + (dfa_tab)[*s]];                 \
            s++;                                                            \
                                                                            \
            if (state == 0) {   /* Accepting state */                       \
                accept_action;                                              \
            }                                                               \
                                                                            \
            if (UNLIKELY(state == 1)) { /* Rejecting state */               \
                reject_action;                                              \
            }                                                               \
        } while (s < (e));                                                  \
                                                                            \
        /* Here, dropped out of loop before end-of-char */                  \
        incomplete_char_action;                                             \
    } STMT_END
1598
1599
1600 /*
1601
1602 =for apidoc isUTF8_CHAR
1603
1604 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1605 looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
1606 that represents some code point; otherwise it evaluates to 0.  If non-zero, the
1607 value gives how many bytes starting at C<s> comprise the code point's
1608 representation.  Any bytes remaining before C<e>, but beyond the ones needed to
1609 form the first code point in C<s>, are not examined.
1610
1611 The code point can be any that will fit in an IV on this machine, using Perl's
1612 extension to official UTF-8 to represent those higher than the Unicode maximum
1613 of 0x10FFFF.  That means that this macro is used to efficiently decide if the
1614 next few bytes in C<s> is legal UTF-8 for a single character.
1615
1616 Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
1617 defined by Unicode to be fully interchangeable across applications;
1618 C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1619 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1620 code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1621
1622 Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
1623 C<L</is_utf8_string_loclen>> to check entire strings.
1624
1625 Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
1626 machines) is a valid UTF-8 character.
1627
1628 =cut
1629
1630 This uses an adaptation of the table and algorithm given in
1631 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1632 documentation of the original version.  A copyright notice for the original
version is given at the beginning of this file.  The Perl adaptation is
1634 documented at the definition of PL_extended_utf8_dfa_tab[].
1635 */
1636
1637 PERL_STATIC_INLINE Size_t
1638 Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
1639 {
1640     PERL_ARGS_ASSERT_ISUTF8_CHAR;
1641
1642     PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
1643                           DFA_RETURN_SUCCESS_,
1644                           DFA_TEASE_APART_FF_,
1645                           DFA_RETURN_FAILURE_);
1646
1647     /* Here, we didn't return success, but dropped out of the loop.  In the
1648      * case of PL_extended_utf8_dfa_tab, this means the input is either
1649      * malformed, or the start byte was FF on a platform that the dfa doesn't
1650      * handle FF's.  Call a helper function. */
1651
1652 #ifdef HAS_EXTRA_LONG_UTF8
1653
1654   tease_apart_FF:
1655
1656     /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
1657      * either malformed, or was for the largest possible start byte, which we
1658      * now check, not inline */
1659     if (*s0 != I8_TO_NATIVE_UTF8(0xFF)) {
1660         return 0;
1661     }
1662
1663     return is_utf8_FF_helper_(s0, e,
1664                               FALSE /* require full, not partial char */
1665                              );
1666 #endif
1667
1668 }
1669
1670 /*
1671
1672 =for apidoc isSTRICT_UTF8_CHAR
1673
1674 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1675 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1676 Unicode code point completely acceptable for open interchange between all
1677 applications; otherwise it evaluates to 0.  If non-zero, the value gives how
1678 many bytes starting at C<s> comprise the code point's representation.  Any
1679 bytes remaining before C<e>, but beyond the ones needed to form the first code
1680 point in C<s>, are not examined.
1681
1682 The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
1683 be a surrogate nor a non-character code point.  Thus this excludes any code
1684 point from Perl's extended UTF-8.
1685
1686 This is used to efficiently decide if the next few bytes in C<s> is
1687 legal Unicode-acceptable UTF-8 for a single character.
1688
1689 Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1690 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1691 code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
1692 and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1693
1694 Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
1695 C<L</is_strict_utf8_string_loclen>> to check entire strings.
1696
1697 =cut
1698
1699 This uses an adaptation of the tables and algorithm given in
1700 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1701 documentation of the original version.  A copyright notice for the original
version is given at the beginning of this file.  The Perl adaptation is
documented at the definition of PL_strict_utf8_dfa_tab[].
1704
1705 */
1706
1707 PERL_STATIC_INLINE Size_t
1708 Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
1709 {
1710     PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
1711
1712     PERL_IS_UTF8_CHAR_DFA(s0, e, PL_strict_utf8_dfa_tab,
1713                           DFA_RETURN_SUCCESS_,
1714                           goto check_hanguls,
1715                           DFA_RETURN_FAILURE_);
1716   check_hanguls:
1717
1718     /* Here, we didn't return success, but dropped out of the loop.  In the
1719      * case of PL_strict_utf8_dfa_tab, this means the input is either
1720      * malformed, or was for certain Hanguls; handle them specially */
1721
1722     /* The dfa above drops out for incomplete or illegal inputs, and certain
1723      * legal Hanguls; check and return accordingly */
1724     return is_HANGUL_ED_utf8_safe(s0, e);
1725 }
1726
1727 /*
1728
1729 =for apidoc isC9_STRICT_UTF8_CHAR
1730
1731 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1732 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1733 Unicode non-surrogate code point; otherwise it evaluates to 0.  If non-zero,
1734 the value gives how many bytes starting at C<s> comprise the code point's
1735 representation.  Any bytes remaining before C<e>, but beyond the ones needed to
1736 form the first code point in C<s>, are not examined.
1737
1738 The largest acceptable code point is the Unicode maximum 0x10FFFF.  This
1739 differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
1740 code points.  This corresponds to
L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>,
1742 which said that non-character code points are merely discouraged rather than
1743 completely forbidden in open interchange.  See
1744 L<perlunicode/Noncharacter code points>.
1745
1746 Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
1747 C<L</isUTF8_CHAR_flags>> for a more customized definition.
1748
1749 Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
1750 C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
1751
1752 =cut
1753
1754 This uses an adaptation of the tables and algorithm given in
1755 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1756 documentation of the original version.  A copyright notice for the original
1757 version is given at the beginning of this file.  The Perl adaptation is
1758 documented at the definition of PL_c9_utf8_dfa_tab[].
1759
1760 */
1761
1762 PERL_STATIC_INLINE Size_t
1763 Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
1764 {
1765     PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
1766
    /* All the work is done by the DFA macro (defined elsewhere), driven by
     * PL_c9_utf8_dfa_tab.  Unlike in Perl_isSTRICT_UTF8_CHAR(), no label
     * follows the macro: each of the three actions passed is one of the
     * DFA_RETURN_*_ forms, so presumably every path returns from inside the
     * macro -- TODO confirm against the macro's definition */
1767     PERL_IS_UTF8_CHAR_DFA(s0, e, PL_c9_utf8_dfa_tab,
1768                           DFA_RETURN_SUCCESS_,
1769                           DFA_RETURN_FAILURE_,
1770                           DFA_RETURN_FAILURE_);
1771 }
1772
1773 /*
1774
1775 =for apidoc is_strict_utf8_string_loc
1776
1777 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1778 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1779 "utf8ness success") in the C<ep> pointer.
1780
1781 See also C<L</is_strict_utf8_string_loclen>>.
1782
1783 =cut
1784 */
1785
/* Same as is_strict_utf8_string_loclen(), but with a null 'el', since the
 * character count isn't wanted */
#define is_strict_utf8_string_loc(s, len, ep)                               \
                            is_strict_utf8_string_loclen(s, len, ep, NULL)
1788
1789 /*
1790
1791 =for apidoc is_strict_utf8_string_loclen
1792
1793 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1794 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1795 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1796 encoded characters in the C<el> pointer.
1797
1798 See also C<L</is_strict_utf8_string_loc>>.
1799
1800 =cut
1801 */
1802
1803 PERL_STATIC_INLINE bool
1804 Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1805 {
1806     const U8 * first_variant;
1807
1808     PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
1809
1810     if (len == 0) {
1811         len = strlen((const char *) s);
1812     }
1813
1814     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1815         if (el)
1816             *el = len;
1817
1818         if (ep) {
1819             *ep = s + len;
1820         }
1821
1822         return TRUE;
1823     }
1824
1825     {
1826         const U8* const send = s + len;
1827         const U8* x = first_variant;
1828         STRLEN outlen = first_variant - s;
1829
1830         while (x < send) {
1831             const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
1832             if (UNLIKELY(! cur_len)) {
1833                 break;
1834             }
1835             x += cur_len;
1836             outlen++;
1837         }
1838
1839         if (el)
1840             *el = outlen;
1841
1842         if (ep) {
1843             *ep = x;
1844         }
1845
1846         return (x == send);
1847     }
1848 }
1849
1850 /*
1851
1852 =for apidoc is_c9strict_utf8_string_loc
1853
1854 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1855 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1856 "utf8ness success") in the C<ep> pointer.
1857
1858 See also C<L</is_c9strict_utf8_string_loclen>>.
1859
1860 =cut
1861 */
1862
/* Same as is_c9strict_utf8_string_loclen(), but with a null 'el', since the
 * character count isn't wanted */
#define is_c9strict_utf8_string_loc(s, len, ep)                             \
                        is_c9strict_utf8_string_loclen(s, len, ep, NULL)
1865
1866 /*
1867
1868 =for apidoc is_c9strict_utf8_string_loclen
1869
1870 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1871 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1872 "utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
1873 characters in the C<el> pointer.
1874
1875 See also C<L</is_c9strict_utf8_string_loc>>.
1876
1877 =cut
1878 */
1879
1880 PERL_STATIC_INLINE bool
1881 Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1882 {
1883     const U8 * first_variant;
1884
1885     PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
1886
1887     if (len == 0) {
1888         len = strlen((const char *) s);
1889     }
1890
1891     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1892         if (el)
1893             *el = len;
1894
1895         if (ep) {
1896             *ep = s + len;
1897         }
1898
1899         return TRUE;
1900     }
1901
1902     {
1903         const U8* const send = s + len;
1904         const U8* x = first_variant;
1905         STRLEN outlen = first_variant - s;
1906
1907         while (x < send) {
1908             const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
1909             if (UNLIKELY(! cur_len)) {
1910                 break;
1911             }
1912             x += cur_len;
1913             outlen++;
1914         }
1915
1916         if (el)
1917             *el = outlen;
1918
1919         if (ep) {
1920             *ep = x;
1921         }
1922
1923         return (x == send);
1924     }
1925 }
1926
1927 /*
1928
1929 =for apidoc is_utf8_string_loc_flags
1930
1931 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1932 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1933 "utf8ness success") in the C<ep> pointer.
1934
1935 See also C<L</is_utf8_string_loclen_flags>>.
1936
1937 =cut
1938 */
1939
/* Same as is_utf8_string_loclen_flags(), but with a null 'el', since the
 * character count isn't wanted */
#define is_utf8_string_loc_flags(s, len, ep, flags)                         \
                    is_utf8_string_loclen_flags(s, len, ep, NULL, flags)
1942
1943
1944 /* The above 3 actual functions could have been moved into the more general one
1945  * just below, and made #defines that call it with the right 'flags'.  They are
1946  * currently kept separate to increase their chances of getting inlined */
1947
1948 /*
1949
1950 =for apidoc is_utf8_string_loclen_flags
1951
1952 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1953 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1954 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1955 encoded characters in the C<el> pointer.
1956
1957 See also C<L</is_utf8_string_loc_flags>>.
1958
1959 =cut
1960 */
1961
1962 PERL_STATIC_INLINE bool
1963 Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
1964 {
1965     const U8 * first_variant;
1966
1967     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
1968     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1969                           |UTF8_DISALLOW_PERL_EXTENDED)));
1970
1971     if (len == 0) {
1972         len = strlen((const char *) s);
1973     }
1974
1975     if (flags == 0) {
1976         return is_utf8_string_loclen(s, len, ep, el);
1977     }
1978
1979     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1980                                         == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1981     {
1982         return is_strict_utf8_string_loclen(s, len, ep, el);
1983     }
1984
1985     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1986                                     == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1987     {
1988         return is_c9strict_utf8_string_loclen(s, len, ep, el);
1989     }
1990
1991     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1992         if (el)
1993             *el = len;
1994
1995         if (ep) {
1996             *ep = s + len;
1997         }
1998
1999         return TRUE;
2000     }
2001
2002     {
2003         const U8* send = s + len;
2004         const U8* x = first_variant;
2005         STRLEN outlen = first_variant - s;
2006
2007         while (x < send) {
2008             const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
2009             if (UNLIKELY(! cur_len)) {
2010                 break;
2011             }
2012             x += cur_len;
2013             outlen++;
2014         }
2015
2016         if (el)
2017             *el = outlen;
2018
2019         if (ep) {
2020             *ep = x;
2021         }
2022
2023         return (x == send);
2024     }
2025 }
2026
2027 /*
2028 =for apidoc utf8_distance
2029
2030 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
2031 and C<b>.
2032
2033 WARNING: use only if you *know* that the pointers point inside the
2034 same UTF-8 buffer.
2035
2036 =cut
2037 */
2038
2039 PERL_STATIC_INLINE IV
2040 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
2041 {
2042     PERL_ARGS_ASSERT_UTF8_DISTANCE;
2043
2044     return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
2045 }
2046
2047 /*
2048 =for apidoc utf8_hop
2049
2050 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
2051 forward or backward.
2052
2053 WARNING: do not use the following unless you *know* C<off> is within
2054 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
2055 on the first byte of character or just after the last byte of a character.
2056
2057 =cut
2058 */
2059
2060 PERL_STATIC_INLINE U8 *
2061 Perl_utf8_hop(const U8 *s, SSize_t off)
2062 {
2063     PERL_ARGS_ASSERT_UTF8_HOP;
2064
2065     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2066      * the bitops (especially ~) can create illegal UTF-8.
2067      * In other words: in Perl UTF-8 is not just for Unicode. */
2068
2069     if (off >= 0) {
2070         while (off--)
2071             s += UTF8SKIP(s);
2072     }
2073     else {
2074         while (off++) {
2075             s--;
2076             while (UTF8_IS_CONTINUATION(*s))
2077                 s--;
2078         }
2079     }
2080     GCC_DIAG_IGNORE(-Wcast-qual)
2081     return (U8 *)s;
2082     GCC_DIAG_RESTORE
2083 }
2084
2085 /*
2086 =for apidoc utf8_hop_forward
2087
2088 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2089 forward.
2090
2091 C<off> must be non-negative.
2092
2093 C<s> must be before or equal to C<end>.
2094
2095 When moving forward it will not move beyond C<end>.
2096
2097 Will not exceed this limit even if the string is not valid "UTF-8".
2098
2099 =cut
2100 */
2101
2102 PERL_STATIC_INLINE U8 *
2103 Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
2104 {
2105     PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
2106
2107     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2108      * the bitops (especially ~) can create illegal UTF-8.
2109      * In other words: in Perl UTF-8 is not just for Unicode. */
2110
2111     assert(s <= end);
2112     assert(off >= 0);
2113
2114     while (off--) {
2115         STRLEN skip = UTF8SKIP(s);
2116         if ((STRLEN)(end - s) <= skip) {
2117             GCC_DIAG_IGNORE(-Wcast-qual)
2118             return (U8 *)end;
2119             GCC_DIAG_RESTORE
2120         }
2121         s += skip;
2122     }
2123
2124     GCC_DIAG_IGNORE(-Wcast-qual)
2125     return (U8 *)s;
2126     GCC_DIAG_RESTORE
2127 }
2128
2129 /*
2130 =for apidoc utf8_hop_back
2131
2132 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2133 backward.
2134
2135 C<off> must be non-positive.
2136
2137 C<s> must be after or equal to C<start>.
2138
2139 When moving backward it will not move before C<start>.
2140
2141 Will not exceed this limit even if the string is not valid "UTF-8".
2142
2143 =cut
2144 */
2145
2146 PERL_STATIC_INLINE U8 *
2147 Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
2148 {
2149     PERL_ARGS_ASSERT_UTF8_HOP_BACK;
2150
2151     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2152      * the bitops (especially ~) can create illegal UTF-8.
2153      * In other words: in Perl UTF-8 is not just for Unicode. */
2154
2155     assert(start <= s);
2156     assert(off <= 0);
2157
2158     while (off++ && s > start) {
2159         do {
2160             s--;
2161         } while (UTF8_IS_CONTINUATION(*s) && s > start);
2162     }
2163
2164     GCC_DIAG_IGNORE(-Wcast-qual)
2165     return (U8 *)s;
2166     GCC_DIAG_RESTORE
2167 }
2168
2169 /*
2170 =for apidoc utf8_hop_safe
2171
2172 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2173 either forward or backward.
2174
2175 When moving backward it will not move before C<start>.
2176
2177 When moving forward it will not move beyond C<end>.
2178
2179 Will not exceed those limits even if the string is not valid "UTF-8".
2180
2181 =cut
2182 */
2183
2184 PERL_STATIC_INLINE U8 *
2185 Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
2186 {
2187     PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
2188
2189     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2190      * the bitops (especially ~) can create illegal UTF-8.
2191      * In other words: in Perl UTF-8 is not just for Unicode. */
2192
2193     assert(start <= s && s <= end);
2194
2195     if (off >= 0) {
2196         return utf8_hop_forward(s, off, end);
2197     }
2198     else {
2199         return utf8_hop_back(s, off, start);
2200     }
2201 }
2202
2203 /*
2204
2205 =for apidoc isUTF8_CHAR_flags
2206
2207 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
2208 looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
2209 that represents some code point, subject to the restrictions given by C<flags>;
2210 otherwise it evaluates to 0.  If non-zero, the value gives how many bytes
2211 starting at C<s> comprise the code point's representation.  Any bytes remaining
2212 before C<e>, but beyond the ones needed to form the first code point in C<s>,
2213 are not examined.
2214
2215 If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>;
2216 if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
2217 as C<L</isSTRICT_UTF8_CHAR>>;
2218 and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives
2219 the same results as C<L</isC9_STRICT_UTF8_CHAR>>.
2220 Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags
2221 understood by C<L</utf8n_to_uvchr>>, with the same meanings.
2222
2223 The three alternative macros are for the most commonly needed validations; they
2224 are likely to run somewhat faster than this more general one, as they can be
2225 inlined into your code.
2226
2227 Use L</is_utf8_string_flags>, L</is_utf8_string_loc_flags>, and
2228 L</is_utf8_string_loclen_flags> to check entire strings.
2229
2230 =cut
2231 */
2232
2233 PERL_STATIC_INLINE STRLEN
2234 Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags)
2235 {
2236     PERL_ARGS_ASSERT_ISUTF8_CHAR_FLAGS;
2237     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
2238                           |UTF8_DISALLOW_PERL_EXTENDED)));
2239
2240     PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
2241                           goto check_success,
2242                           DFA_TEASE_APART_FF_,
2243                           DFA_RETURN_FAILURE_);
2244
2245   check_success:
2246
2247     return is_utf8_char_helper_(s0, e, flags);
2248
2249 #ifdef HAS_EXTRA_LONG_UTF8
2250
2251   tease_apart_FF:
2252
2253     /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
2254      * either malformed, or was for the largest possible start byte, which
2255      * indicates perl extended UTF-8, well above the Unicode maximum */
2256     if (   *s0 != I8_TO_NATIVE_UTF8(0xFF)
2257         || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
2258     {
2259         return 0;
2260     }
2261
2262     /* Otherwise examine the sequence not inline */
2263     return is_utf8_FF_helper_(s0, e,
2264                               FALSE /* require full, not partial char */
2265                              );
2266 #endif
2267
2268 }
2269
2270 /*
2271
2272 =for apidoc is_utf8_valid_partial_char
2273
2274 Returns 0 if the sequence of bytes starting at C<s> and looking no further than
2275 S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
2276 points.  Otherwise, it returns 1 if there exists at least one non-empty
2277 sequence of bytes that when appended to sequence C<s>, starting at position
2278 C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
2279 otherwise returns 0.
2280
2281 In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
2282 point.
2283
2284 This is useful when a fixed-length buffer is being tested for being well-formed
2285 UTF-8, but the final few bytes in it don't comprise a full character; that is,
2286 it is split somewhere in the middle of the final code point's UTF-8
2287 representation.  (Presumably when the buffer is refreshed with the next chunk
2288 of data, the new first bytes will complete the partial code point.)   This
2289 function is used to verify that the final bytes in the current buffer are in
2290 fact the legal beginning of some code point, so that if they aren't, the
2291 failure can be signalled without having to wait for the next read.
2292
2293 =cut
2294 */
/* Same as is_utf8_valid_partial_char_flags() called with a 'flags' of 0 */
2295 #define is_utf8_valid_partial_char(s, e)                                    \
2296                                 is_utf8_valid_partial_char_flags(s, e, 0)
2297
2298 /*
2299
2300 =for apidoc is_utf8_valid_partial_char_flags
2301
2302 Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
2303 or not the input is a valid UTF-8 encoded partial character, but it takes an
2304 extra parameter, C<flags>, which can further restrict which code points are
2305 considered valid.
2306
2307 If C<flags> is 0, this behaves identically to
2308 C<L</is_utf8_valid_partial_char>>.  Otherwise C<flags> can be any combination
2309 of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>.  If
2310 there is any sequence of bytes that can complete the input partial character in
2311 such a way that a non-prohibited character is formed, the function returns
2312 TRUE; otherwise FALSE.  Non-character code points cannot be determined based on
2313 partial character input.  But many of the other possible excluded types can be
2314 determined from just the first one or two bytes.
2315
2316 =cut
2317  */
2318
2319 PERL_STATIC_INLINE bool
2320 Perl_is_utf8_valid_partial_char_flags(const U8 * const s0, const U8 * const e, const U32 flags)
2321 {
2322     PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
2323     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
2324                           |UTF8_DISALLOW_PERL_EXTENDED)));
2325
2326     PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
2327                           DFA_RETURN_FAILURE_,
2328                           DFA_TEASE_APART_FF_,
2329                           NOOP);
2330
2331     /* The NOOP above causes the DFA to drop down here iff the input was a
2332      * partial character.  flags=0 => can return TRUE immediately; otherwise we
2333      * need to check (not inline) if the partial character is the beginning of
2334      * a disallowed one */
2335     if (flags == 0) {
2336         return TRUE;
2337     }
2338
2339     return cBOOL(is_utf8_char_helper_(s0, e, flags));
2340
2341 #ifdef HAS_EXTRA_LONG_UTF8
2342
2343   tease_apart_FF:
2344
2345     /* Getting here means the input is either malformed, or, in the case of
2346      * PL_extended_utf8_dfa_tab, was for the largest possible start byte.  The
2347      * latter case has to be extended UTF-8, so can fail immediately if that is
2348      * forbidden */
2349
2350     if (   *s0 != I8_TO_NATIVE_UTF8(0xFF)
2351         || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
2352     {
2353         return 0;
2354     }
2355
2356     return is_utf8_FF_helper_(s0, e,
2357                               TRUE /* Require to be a partial character */
2358                              );
2359 #endif
2360
2361 }
2362
2363 /*
2364
2365 =for apidoc is_utf8_fixed_width_buf_flags
2366
2367 Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
2368 is entirely valid UTF-8, subject to the restrictions given by C<flags>;
2369 otherwise it returns FALSE.
2370
2371 If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
2372 without restriction.  If the final few bytes of the buffer do not form a
2373 complete code point, this will return TRUE anyway, provided that
2374 C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
2375
2376 If C<flags> is non-zero, it can be any combination of the
2377 C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
2378 same meanings.
2379
2380 This function differs from C<L</is_utf8_string_flags>> only in that the latter
2381 returns FALSE if the final few bytes of the string don't form a complete code
2382 point.
2383
2384 =cut
2385  */
/* Same as is_utf8_fixed_width_buf_loclen_flags(), but with null 'ep' and
 * 'el', since neither the stopping position nor the count is wanted */
#define is_utf8_fixed_width_buf_flags(s, len, flags)                        \
        is_utf8_fixed_width_buf_loclen_flags(s, len, NULL, NULL, flags)
2388
2389 /*
2390
2391 =for apidoc is_utf8_fixed_width_buf_loc_flags
2392
2393 Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
2394 failure in the C<ep> pointer.  If the function returns TRUE, C<*ep> will point
2395 to the beginning of any partial character at the end of the buffer; if there is
2396 no partial character C<*ep> will contain C<s>+C<len>.
2397
2398 See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
2399
2400 =cut
2401 */
2402
/* Same as is_utf8_fixed_width_buf_loclen_flags(), but with a null 'el',
 * since the character count isn't wanted */
#define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags)               \
            is_utf8_fixed_width_buf_loclen_flags(s, len, loc, NULL, flags)
2405
2406 /*
2407
2408 =for apidoc is_utf8_fixed_width_buf_loclen_flags
2409
2410 Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
2411 complete, valid characters found in the C<el> pointer.
2412
2413 =cut
2414 */
2415
2416 PERL_STATIC_INLINE bool
2417 Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
2418                                        STRLEN len,
2419                                        const U8 **ep,
2420                                        STRLEN *el,
2421                                        const U32 flags)
2422 {
2423     const U8 * maybe_partial;
2424
2425     PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
2426
2427     if (! ep) {
2428         ep  = &maybe_partial;
2429     }
2430
2431     /* If it's entirely valid, return that; otherwise see if the only error is
2432      * that the final few bytes are for a partial character */
2433     return    is_utf8_string_loclen_flags(s, len, ep, el, flags)
2434            || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
2435 }
2436
2437 PERL_STATIC_INLINE UV
2438 Perl_utf8n_to_uvchr_msgs(const U8 *s,
2439                       STRLEN curlen,
2440                       STRLEN *retlen,
2441                       const U32 flags,
2442                       U32 * errors,
2443                       AV ** msgs)
2444 {
2445     /* This is the inlined portion of utf8n_to_uvchr_msgs.  It handles the
2446      * simple cases, and, if necessary calls a helper function to deal with the
2447      * more complex ones.  Almost all well-formed non-problematic code points
2448      * are considered simple, so that it's unlikely that the helper function
2449      * will need to be called.
2450      *
2451      * This is an adaptation of the tables and algorithm given in
2452      * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
2453      * comprehensive documentation of the original version.  A copyright notice
2454      * for the original version is given at the beginning of this file.  The
2455      * Perl adapation is documented at the definition of PL_strict_utf8_dfa_tab[].
2456      */
2457
2458     const U8 * const s0 = s;
2459     const U8 * send = s0 + curlen;
2460     UV uv = 0;      /* The 0 silences some stupid compilers */
2461     UV state = 0;
2462
2463     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
2464
2465     /* This dfa is fast.  If it accepts the input, it was for a well-formed,
2466      * non-problematic code point, which can be returned immediately.
2467      * Otherwise we call a helper function to figure out the more complicated
2468      * cases. */
2469
2470     while (s < send && LIKELY(state != 1)) {
2471         UV type = PL_strict_utf8_dfa_tab[*s];
2472
2473         uv = (state == 0)
2474              ?  ((0xff >> type) & NATIVE_UTF8_TO_I8(*s))
2475              : UTF8_ACCUMULATE(uv, *s);
2476         state = PL_strict_utf8_dfa_tab[256 + state + type];
2477
2478         if (state != 0) {
2479             s++;
2480             continue;
2481         }
2482
2483         if (retlen) {
2484             *retlen = s - s0 + 1;
2485         }
2486         if (errors) {
2487             *errors = 0;
2488         }
2489         if (msgs) {
2490             *msgs = NULL;
2491         }
2492
2493         return UNI_TO_NATIVE(uv);
2494     }
2495
2496     /* Here is potentially problematic.  Use the full mechanism */
2497     return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags, errors, msgs);
2498 }
2499
2500 PERL_STATIC_INLINE UV
2501 Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
2502 {
2503     PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER;
2504
2505     assert(s < send);
2506
2507     if (! ckWARN_d(WARN_UTF8)) {
2508
2509         /* EMPTY is not really allowed, and asserts on debugging builds.  But
2510          * on non-debugging we have to deal with it, and this causes it to
2511          * return the REPLACEMENT CHARACTER, as the documentation indicates */
2512         return utf8n_to_uvchr(s, send - s, retlen,
2513                               (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY));
2514     }
2515     else {
2516         UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
2517         if (retlen && ret == 0 && *s != '\0') {
2518             *retlen = (STRLEN) -1;
2519         }
2520
2521         return ret;
2522     }
2523 }
2524
2525 /* ------------------------------- perl.h ----------------------------- */
2526
2527 /*
2528 =for apidoc_section $utility
2529
2530 =for apidoc is_safe_syscall
2531
2532 Test that the given C<pv> (with length C<len>) doesn't contain any internal
2533 C<NUL> characters.
2534 If it does, set C<errno> to C<ENOENT>, optionally warn using the C<syscalls>
2535 category, and return FALSE.
2536
2537 Return TRUE if the name is safe.
2538
2539 C<what> and C<op_name> are used in any warning.
2540
2541 Used by the C<IS_SAFE_SYSCALL()> macro.
2542
2543 =cut
2544 */
2545
2546 PERL_STATIC_INLINE bool
2547 Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name)
2548 {
2549     /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
2550      * perl itself uses xce*() functions which accept 8-bit strings.
2551      */
2552
2553     PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
2554
2555     if (len > 1) {
2556         char *null_at;
2557         if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
2558                 SETERRNO(ENOENT, LIB_INVARG);
2559                 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
2560                                    "Invalid \\0 character in %s for %s: %s\\0%s",
2561                                    what, op_name, pv, null_at+1);
2562                 return FALSE;
2563         }
2564     }
2565
2566     return TRUE;
2567 }
2568
2569 /*
2570
2571 Return true if the supplied filename has a newline character
2572 immediately before the first (hopefully only) NUL.
2573
2574 My original look at this incorrectly used the len from SvPV(), but
2575 that's incorrect, since we allow for a NUL in pv[len-1].
2576
2577 So instead, strlen() and work from there.
2578
2579 This allows for the user reading a filename, forgetting to chomp it,
2580 then calling:
2581
2582   open my $foo, "$file\0";
2583
2584 */
2585
2586 #ifdef PERL_CORE
2587
2588 PERL_STATIC_INLINE bool
2589 S_should_warn_nl(const char *pv)
2590 {
2591     STRLEN len;
2592
2593     PERL_ARGS_ASSERT_SHOULD_WARN_NL;
2594
2595     len = strlen(pv);
2596
2597     return len > 0 && pv[len-1] == '\n';
2598 }
2599
2600 #endif
2601
2602 #if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C)
2603
2604 PERL_STATIC_INLINE bool
2605 S_lossless_NV_to_IV(const NV nv, IV *ivp)
2606 {
2607     /* This function determines if the input NV 'nv' may be converted without
2608      * loss of data to an IV.  If not, it returns FALSE taking no other action.
2609      * But if it is possible, it does the conversion, returning TRUE, and
2610      * storing the converted result in '*ivp' */
2611
2612     PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV;
2613
2614 #  if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan)
2615     /* Normally any comparison with a NaN returns false; if we can't rely
2616      * on that behaviour, check explicitly */
2617     if (UNLIKELY(Perl_isnan(nv))) {
2618         return FALSE;
2619     }
2620 #  endif
2621
2622     /* Written this way so that with an always-false NaN comparison we
2623      * return false */
2624     if (!(LIKELY(nv >= (NV) IV_MIN) && LIKELY(nv < IV_MAX_P1))) {
2625         return FALSE;
2626     }
2627
2628     if ((IV) nv != nv) {
2629         return FALSE;
2630     }
2631
2632     *ivp = (IV) nv;
2633     return TRUE;
2634 }
2635
2636 #endif
2637
2638 /* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
2639
2640 #if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C)
2641
/* Upper limit on the length that S_get_regex_charset_name() stores in *lenp */
2642 #define MAX_CHARSET_NAME_LENGTH 2
2643
2644 PERL_STATIC_INLINE const char *
2645 S_get_regex_charset_name(const U32 flags, STRLEN* const lenp)
2646 {
2647     PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME;
2648
2649     /* Returns a string that corresponds to the name of the regex character set
2650      * given by 'flags', and *lenp is set the length of that string, which
2651      * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
2652
2653     *lenp = 1;
2654     switch (get_regex_charset(flags)) {
2655         case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
2656         case REGEX_LOCALE_CHARSET:  return LOCALE_PAT_MODS;
2657         case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
2658         case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
2659         case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
2660             *lenp = 2;
2661             return ASCII_MORE_RESTRICT_PAT_MODS;
2662     }
2663     /* The NOT_REACHED; hides an assert() which has a rather complex
2664      * definition in perl.h. */
2665     NOT_REACHED; /* NOTREACHED */
2666     return "?";     /* Unknown */
2667 }
2668
2669 #endif
2670
2671 /*
2672
2673 Return false if any get magic is on the SV other than taint magic.
2674
2675 */
2676
2677 PERL_STATIC_INLINE bool
2678 Perl_sv_only_taint_gmagic(SV *sv)
2679 {
2680     MAGIC *mg = SvMAGIC(sv);
2681
2682     PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
2683
2684     while (mg) {
2685         if (mg->mg_type != PERL_MAGIC_taint
2686             && !(mg->mg_flags & MGf_GSKIP)
2687             && mg->mg_virtual->svt_get) {
2688             return FALSE;
2689         }
2690         mg = mg->mg_moremagic;
2691     }
2692
2693     return TRUE;
2694 }
2695
2696 /* ------------------ cop.h ------------------------------------------- */
2697
2698 /* implement GIMME_V() macro */
2699
2700 PERL_STATIC_INLINE U8
2701 Perl_gimme_V(pTHX)
2702 {
2703     I32 cxix;
2704     U8  gimme = (PL_op->op_flags & OPf_WANT);
2705
2706     if (gimme)
2707         return gimme;
2708     cxix = PL_curstackinfo->si_cxsubix;
2709     if (cxix < 0)
2710         return PL_curstackinfo->si_type == PERLSI_SORT ? G_SCALAR: G_VOID;
2711     assert(cxstack[cxix].blk_gimme & G_WANT);
2712     return (cxstack[cxix].blk_gimme & G_WANT);
2713 }
2714
2715
2716 /* Enter a block. Push a new base context and return its address. */
2717
2718 PERL_STATIC_INLINE PERL_CONTEXT *
2719 Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
2720 {
2721     PERL_CONTEXT * cx;
2722
2723     PERL_ARGS_ASSERT_CX_PUSHBLOCK;
2724
2725     CXINC;
2726     cx = CX_CUR();
2727     cx->cx_type        = type;
2728     cx->blk_gimme      = gimme;
2729     cx->blk_oldsaveix  = saveix;
2730     cx->blk_oldsp      = (I32)(sp - PL_stack_base);
2731     cx->blk_oldcop     = PL_curcop;
2732     cx->blk_oldmarksp  = (I32)(PL_markstack_ptr - PL_markstack);
2733     cx->blk_oldscopesp = PL_scopestack_ix;
2734     cx->blk_oldpm      = PL_curpm;
2735     cx->blk_old_tmpsfloor = PL_tmps_floor;
2736
2737     PL_tmps_floor        = PL_tmps_ix;
2738     CX_DEBUG(cx, "PUSH");
2739     return cx;
2740 }
2741
2742
2743 /* Exit a block (RETURN and LAST). */
2744
2745 PERL_STATIC_INLINE void
2746 Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx)
2747 {
2748     PERL_ARGS_ASSERT_CX_POPBLOCK;
2749
2750     CX_DEBUG(cx, "POP");
2751     /* these 3 are common to cx_popblock and cx_topblock */
2752     PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2753     PL_scopestack_ix = cx->blk_oldscopesp;
2754     PL_curpm         = cx->blk_oldpm;
2755
2756     /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
2757      * and leaves a CX entry lying around for repeated use, so
2758      * skip for multicall */                  \
2759     assert(   (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
2760             || PL_savestack_ix == cx->blk_oldsaveix);
2761     PL_curcop     = cx->blk_oldcop;
2762     PL_tmps_floor = cx->blk_old_tmpsfloor;
2763 }
2764
2765 /* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
2766  * Whereas cx_popblock() restores the state to the point just before
2767  * cx_pushblock() was called,  cx_topblock() restores it to the point just
2768  * *after* cx_pushblock() was called. */
2769
2770 PERL_STATIC_INLINE void
2771 Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx)
2772 {
2773     PERL_ARGS_ASSERT_CX_TOPBLOCK;
2774
2775     CX_DEBUG(cx, "TOP");
2776     /* these 3 are common to cx_popblock and cx_topblock */
2777     PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2778     PL_scopestack_ix = cx->blk_oldscopesp;
2779     PL_curpm         = cx->blk_oldpm;
2780
2781     PL_stack_sp      = PL_stack_base + cx->blk_oldsp;
2782 }
2783
2784
2785 PERL_STATIC_INLINE void
2786 Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
2787 {
2788     U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
2789
2790     PERL_ARGS_ASSERT_CX_PUSHSUB;
2791
2792     PERL_DTRACE_PROBE_ENTRY(cv);
2793     cx->blk_sub.old_cxsubix     = PL_curstackinfo->si_cxsubix;
2794     PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
2795     cx->blk_sub.cv = cv;
2796     cx->blk_sub.olddepth = CvDEPTH(cv);
2797     cx->blk_sub.prevcomppad = PL_comppad;
2798     cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
2799     cx->blk_sub.retop = retop;
2800     SvREFCNT_inc_simple_void_NN(cv);
2801     cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
2802 }
2803
2804
2805 /* subsets of cx_popsub() */
2806
2807 PERL_STATIC_INLINE void
2808 Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
2809 {
2810     CV *cv;
2811
2812     PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
2813     assert(CxTYPE(cx) == CXt_SUB);
2814
2815     PL_comppad = cx->blk_sub.prevcomppad;
2816     PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2817     cv = cx->blk_sub.cv;
2818     CvDEPTH(cv) = cx->blk_sub.olddepth;
2819     cx->blk_sub.cv = NULL;
2820     SvREFCNT_dec(cv);
2821     PL_curstackinfo->si_cxsubix = cx->blk_sub.old_cxsubix;
2822 }
2823
2824
2825 /* handle the @_ part of leaving a sub */
2826
2827 PERL_STATIC_INLINE void
2828 Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
2829 {
2830     AV *av;
2831
2832     PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
2833     assert(CxTYPE(cx) == CXt_SUB);
2834     assert(AvARRAY(MUTABLE_AV(
2835         PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
2836                 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
2837
2838     CX_POP_SAVEARRAY(cx);
2839     av = MUTABLE_AV(PAD_SVl(0));
2840     if (UNLIKELY(AvREAL(av)))
2841         /* abandon @_ if it got reified */
2842         clear_defarray(av, 0);
2843     else {
2844         CLEAR_ARGARRAY(av);
2845     }
2846 }
2847
2848
2849 PERL_STATIC_INLINE void
2850 Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx)
2851 {
2852     PERL_ARGS_ASSERT_CX_POPSUB;
2853     assert(CxTYPE(cx) == CXt_SUB);
2854
2855     PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
2856
2857     if (CxHASARGS(cx))
2858         cx_popsub_args(cx);
2859     cx_popsub_common(cx);
2860 }
2861
2862
2863 PERL_STATIC_INLINE void
2864 Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
2865 {
2866     PERL_ARGS_ASSERT_CX_PUSHFORMAT;
2867
2868     cx->blk_format.old_cxsubix = PL_curstackinfo->si_cxsubix;
2869     PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
2870     cx->blk_format.cv          = cv;
2871     cx->blk_format.retop       = retop;
2872     cx->blk_format.gv          = gv;
2873     cx->blk_format.dfoutgv     = PL_defoutgv;
2874     cx->blk_format.prevcomppad = PL_comppad;
2875     cx->blk_u16                = 0;
2876
2877     SvREFCNT_inc_simple_void_NN(cv);
2878     CvDEPTH(cv)++;
2879     SvREFCNT_inc_void(cx->blk_format.dfoutgv);
2880 }
2881
2882
2883 PERL_STATIC_INLINE void
2884 Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx)
2885 {
2886     CV *cv;
2887     GV *dfout;
2888
2889     PERL_ARGS_ASSERT_CX_POPFORMAT;
2890     assert(CxTYPE(cx) == CXt_FORMAT);
2891
2892     dfout = cx->blk_format.dfoutgv;
2893     setdefout(dfout);
2894     cx->blk_format.dfoutgv = NULL;
2895     SvREFCNT_dec_NN(dfout);
2896
2897     PL_comppad = cx->blk_format.prevcomppad;
2898     PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2899     cv = cx->blk_format.cv;
2900     cx->blk_format.cv = NULL;
2901     --CvDEPTH(cv);
2902     SvREFCNT_dec_NN(cv);
2903     PL_curstackinfo->si_cxsubix = cx->blk_format.old_cxsubix;
2904 }
2905
2906
2907 PERL_STATIC_INLINE void
2908 Perl_push_evalortry_common(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
2909 {
2910     cx->blk_eval.retop         = retop;
2911     cx->blk_eval.old_namesv    = namesv;
2912     cx->blk_eval.old_eval_root = PL_eval_root;
2913     cx->blk_eval.cur_text      = PL_parser ? PL_parser->linestr : NULL;
2914     cx->blk_eval.cv            = NULL; /* later set by doeval_compile() */
2915     cx->blk_eval.cur_top_env   = PL_top_env;
2916
2917     assert(!(PL_in_eval     & ~ 0x3F));
2918     assert(!(PL_op->op_type & ~0x1FF));
2919     cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
2920 }
2921
2922 PERL_STATIC_INLINE void
2923 Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
2924 {
2925     PERL_ARGS_ASSERT_CX_PUSHEVAL;
2926
2927     Perl_push_evalortry_common(aTHX_ cx, retop, namesv);
2928
2929     cx->blk_eval.old_cxsubix    = PL_curstackinfo->si_cxsubix;
2930     PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
2931 }
2932
2933 PERL_STATIC_INLINE void
2934 Perl_cx_pushtry(pTHX_ PERL_CONTEXT *cx, OP *retop)
2935 {
2936     PERL_ARGS_ASSERT_CX_PUSHTRY;
2937
2938     Perl_push_evalortry_common(aTHX_ cx, retop, NULL);
2939
2940     /* Don't actually change it, just store the current value so it's restored
2941      * by the common popeval */
2942     cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
2943 }
2944
2945
2946 PERL_STATIC_INLINE void
2947 Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx)
2948 {
2949     SV *sv;
2950
2951     PERL_ARGS_ASSERT_CX_POPEVAL;
2952     assert(CxTYPE(cx) == CXt_EVAL);
2953
2954     PL_in_eval = CxOLD_IN_EVAL(cx);
2955     assert(!(PL_in_eval & 0xc0));
2956     PL_eval_root = cx->blk_eval.old_eval_root;
2957     sv = cx->blk_eval.cur_text;
2958     if (sv && CxEVAL_TXT_REFCNTED(cx)) {
2959         cx->blk_eval.cur_text = NULL;
2960         SvREFCNT_dec_NN(sv);
2961     }
2962
2963     sv = cx->blk_eval.old_namesv;
2964     if (sv) {
2965         cx->blk_eval.old_namesv = NULL;
2966         SvREFCNT_dec_NN(sv);
2967     }
2968     PL_curstackinfo->si_cxsubix = cx->blk_eval.old_cxsubix;
2969 }
2970
2971
2972 /* push a plain loop, i.e.
2973  *     { block }
2974  *     while (cond) { block }
2975  *     for (init;cond;continue) { block }
2976  * This loop can be last/redo'ed etc.
2977  */
2978
2979 PERL_STATIC_INLINE void
2980 Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
2981 {
2982     PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
2983     cx->blk_loop.my_op = cLOOP;
2984 }
2985
2986
2987 /* push a true for loop, i.e.
2988  *     for var (list) { block }
2989  */
2990
2991 PERL_STATIC_INLINE void
2992 Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
2993 {
2994     PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
2995
2996     /* this one line is common with cx_pushloop_plain */
2997     cx->blk_loop.my_op = cLOOP;
2998
2999     cx->blk_loop.itervar_u.svp = (SV**)itervarp;
3000     cx->blk_loop.itersave      = itersave;
3001 #ifdef USE_ITHREADS
3002     cx->blk_loop.oldcomppad = PL_comppad;
3003 #endif
3004 }
3005
3006
3007 /* pop all loop types, including plain */
3008
3009 PERL_STATIC_INLINE void
3010 Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx)
3011 {
3012     PERL_ARGS_ASSERT_CX_POPLOOP;
3013
3014     assert(CxTYPE_is_LOOP(cx));
3015     if (  CxTYPE(cx) == CXt_LOOP_ARY
3016        || CxTYPE(cx) == CXt_LOOP_LAZYSV)
3017     {
3018         /* Free ary or cur. This assumes that state_u.ary.ary
3019          * aligns with state_u.lazysv.cur. See cx_dup() */
3020         SV *sv = cx->blk_loop.state_u.lazysv.cur;
3021         cx->blk_loop.state_u.lazysv.cur = NULL;
3022         SvREFCNT_dec_NN(sv);
3023         if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
3024             sv = cx->blk_loop.state_u.lazysv.end;
3025             cx->blk_loop.state_u.lazysv.end = NULL;
3026             SvREFCNT_dec_NN(sv);
3027         }
3028     }
3029     if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
3030         SV *cursv;
3031         SV **svp = (cx)->blk_loop.itervar_u.svp;
3032         if ((cx->cx_type & CXp_FOR_GV))
3033             svp = &GvSV((GV*)svp);
3034         cursv = *svp;
3035         *svp = cx->blk_loop.itersave;
3036         cx->blk_loop.itersave = NULL;
3037         SvREFCNT_dec(cursv);
3038     }
3039 }
3040
3041
3042 PERL_STATIC_INLINE void
3043 Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
3044 {
3045     PERL_ARGS_ASSERT_CX_PUSHWHEN;
3046
3047     cx->blk_givwhen.leave_op = cLOGOP->op_other;
3048 }
3049
3050
3051 PERL_STATIC_INLINE void
3052 Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
3053 {
3054     PERL_ARGS_ASSERT_CX_POPWHEN;
3055     assert(CxTYPE(cx) == CXt_WHEN);
3056
3057     PERL_UNUSED_ARG(cx);
3058     PERL_UNUSED_CONTEXT;
3059     /* currently NOOP */
3060 }
3061
3062
3063 PERL_STATIC_INLINE void
3064 Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
3065 {
3066     PERL_ARGS_ASSERT_CX_PUSHGIVEN;
3067
3068     cx->blk_givwhen.leave_op = cLOGOP->op_other;
3069     cx->blk_givwhen.defsv_save = orig_defsv;
3070 }
3071
3072
3073 PERL_STATIC_INLINE void
3074 Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
3075 {
3076     SV *sv;
3077
3078     PERL_ARGS_ASSERT_CX_POPGIVEN;
3079     assert(CxTYPE(cx) == CXt_GIVEN);
3080
3081     sv = GvSV(PL_defgv);
3082     GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
3083     cx->blk_givwhen.defsv_save = NULL;
3084     SvREFCNT_dec(sv);
3085 }
3086
3087 /* ------------------ util.h ------------------------------------------- */
3088
3089 /*
3090 =for apidoc_section $string
3091
3092 =for apidoc foldEQ
3093
3094 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
3095 same
3096 case-insensitively; false otherwise.  Uppercase and lowercase ASCII range bytes
3097 match themselves and their opposite case counterparts.  Non-cased and non-ASCII
3098 range bytes match only themselves.
3099
3100 =cut
3101 */
3102
3103 PERL_STATIC_INLINE I32
3104 Perl_foldEQ(const char *s1, const char *s2, I32 len)
3105 {
3106     const U8 *a = (const U8 *)s1;
3107     const U8 *b = (const U8 *)s2;
3108
3109     PERL_ARGS_ASSERT_FOLDEQ;
3110
3111     assert(len >= 0);
3112
3113     while (len--) {
3114         if (*a != *b && *a != PL_fold[*b])
3115             return 0;
3116         a++,b++;
3117     }
3118     return 1;
3119 }
3120
3121 PERL_STATIC_INLINE I32
3122 Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len)
3123 {
3124     /* Compare non-UTF-8 using Unicode (Latin1) semantics.  Works on all folds
3125      * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and
3126      * does not check for this.  Nor does it check that the strings each have
3127      * at least 'len' characters. */
3128
3129     const U8 *a = (const U8 *)s1;
3130     const U8 *b = (const U8 *)s2;
3131
3132     PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
3133
3134     assert(len >= 0);
3135
3136     while (len--) {
3137         if (*a != *b && *a != PL_fold_latin1[*b]) {
3138             return 0;
3139         }
3140         a++, b++;
3141     }
3142     return 1;
3143 }
3144
3145 /*
3146 =for apidoc_section $locale
3147 =for apidoc foldEQ_locale
3148
3149 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
3150 same case-insensitively in the current locale; false otherwise.
3151
3152 =cut
3153 */
3154
3155 PERL_STATIC_INLINE I32
3156 Perl_foldEQ_locale(const char *s1, const char *s2, I32 len)
3157 {
3158     const U8 *a = (const U8 *)s1;
3159     const U8 *b = (const U8 *)s2;
3160
3161     PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
3162
3163     assert(len >= 0);
3164
3165     while (len--) {
3166         if (*a != *b && *a != PL_fold_locale[*b])
3167             return 0;
3168         a++,b++;
3169     }
3170     return 1;
3171 }
3172
3173 /*
3174 =for apidoc_section $string
3175 =for apidoc my_strnlen
3176
3177 The C library C<strnlen> if available, or a Perl implementation of it.
3178
3179 C<my_strnlen()> computes the length of the string, up to C<maxlen>
3180 characters.  It will never attempt to address more than C<maxlen>
3181 characters, making it suitable for use with strings that are not
3182 guaranteed to be NUL-terminated.
3183
3184 =cut
3185
3186 Description stolen from http://man.openbsd.org/strnlen.3,
3187 implementation stolen from PostgreSQL.
3188 */
3189 #ifndef HAS_STRNLEN
3190
3191 PERL_STATIC_INLINE Size_t
3192 Perl_my_strnlen(const char *str, Size_t maxlen)
3193 {
3194     const char *end = (char *) memchr(str, '\0', maxlen);
3195
3196     PERL_ARGS_ASSERT_MY_STRNLEN;
3197
3198     if (end == NULL) return maxlen;
3199     return end - str;
3200 }
3201
3202 #endif
3203
3204 #if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
3205
3206 PERL_STATIC_INLINE void *
3207 S_my_memrchr(const char * s, const char c, const STRLEN len)
3208 {
3209     /* memrchr(), since many platforms lack it */
3210
3211     const char * t = s + len - 1;
3212
3213     PERL_ARGS_ASSERT_MY_MEMRCHR;
3214
3215     while (t >= s) {
3216         if (*t == c) {
3217             return (void *) t;
3218         }
3219         t--;
3220     }
3221
3222     return NULL;
3223 }
3224
3225 #endif
3226
3227 PERL_STATIC_INLINE char *
3228 Perl_mortal_getenv(const char * str)
3229 {
3230     /* This implements a (mostly) thread-safe, sequential-call-safe getenv().
3231      *
3232      * It's (mostly) thread-safe because it uses a mutex to prevent other
3233      * threads (that look at this mutex) from destroying the result before this
3234      * routine has a chance to copy the result to a place that won't be
3235      * destroyed before the caller gets a chance to handle it.  That place is a
3236      * mortal SV.  khw chose this over SAVEFREEPV because he is under the
3237      * impression that the SV will hang around longer under more circumstances
3238      *
3239      * The reason it isn't completely thread-safe is that other code could
3240      * simply not pay attention to the mutex.  All of the Perl core uses the
3241      * mutex, but it is possible for code from, say XS, to not use this mutex,
3242      * defeating the safety.
3243      *
3244      * getenv() returns, in some implementations, a pointer to a spot in the
3245      * **environ array, which could be invalidated at any time by this or
3246      * another thread changing the environment.  Other implementations copy the
3247      * **environ value to a static buffer, returning a pointer to that.  That
3248      * buffer might or might not be invalidated by a getenv() call in another
3249      * thread.  If it does get zapped, we need an exclusive lock.  Otherwise,
3250      * many getenv() calls can safely be running simultaneously, so a
3251      * many-reader (but no simultaneous writers) lock is ok.  There is a
3252      * Configure probe to see if another thread destroys the buffer, and the
3253      * mutex is defined accordingly.
3254      *
3255      * But in all cases, using the mutex prevents these problems, as long as
3256      * all code uses the same mutex..
3257      *
3258      * A complication is that this can be called during phases where the
3259      * mortalization process isn't available.  These are in interpreter
3260      * destruction or early in construction.  khw believes that at these times
3261      * there shouldn't be anything else going on, so plain getenv is safe AS
3262      * LONG AS the caller acts on the return before calling it again. */
3263
3264     char * ret;
3265     dTHX;
3266
3267     PERL_ARGS_ASSERT_MORTAL_GETENV;
3268
3269     /* Can't mortalize without stacks.  khw believes that no other threads
3270      * should be running, so no need to lock things, and this may be during a
3271      * phase when locking isn't even available */
3272     if (UNLIKELY(PL_scopestack_ix == 0)) {
3273         return getenv(str);
3274     }
3275
3276 #ifdef PERL_MEM_LOG
3277
3278     /* A major complication arises under PERL_MEM_LOG.  When that is active,
3279      * every memory allocation may result in logging, depending on the value of
3280      * ENV{PERL_MEM_LOG} at the moment.  That means, as we create the SV for
3281      * saving ENV{foo}'s value (but before saving it), the logging code will
3282      * call us recursively to find out what ENV{PERL_MEM_LOG} is.  Without some
3283      * care that could lead to: 1) infinite recursion; or 2) deadlock (trying to
3284      * lock a boolean mutex recursively); 3) destroying the getenv() static
3285      * buffer; or 4) destroying the temporary created by this for the copy
3286      * causes a log entry to be made which could cause a new temporary to be
3287      * created, which will need to be destroyed at some point, leading to an
3288      * infinite loop.
3289      *
3290      * The solution adopted here (after some gnashing of teeth) is to detect
3291      * the recursive calls and calls from the logger, and treat them specially.
3292      * Let's say we want to do getenv("foo").  We first find
3293      * getenv(PERL_MEM_LOG) and save it to a fixed-length per-interpreter
3294      * variable, so no temporary is required.  Then we do getenv(foo}, and in
3295      * the process of creating a temporary to save it, this function will be
3296      * called recursively to do a getenv(PERL_MEM_LOG).  On the recursed call,
3297      * we detect that it is such a call and return our saved value instead of
3298      * locking and doing a new getenv().  This solves all of problems 1), 2),
3299      * and 3).  Because all the getenv()s are done while the mutex is locked,
3300      * the state cannot have changed.  To solve 4), we don't create a temporary
3301      * when this is called from the logging code.  That code disposes of the
3302      * return value while the mutex is still locked.
3303      *
3304      * The value of getenv(PERL_MEM_LOG) can be anything, but only initial
3305      * digits and 3 particular letters are significant; the rest are ignored by
3306      * the memory logging code.  Thus the per-interpreter variable only needs
3307      * to be large enough to save the significant information, the size of
3308      * which is known at compile time.  The first byte is extra, reserved for
3309      * flags for our use.  To protect against overflowing, only the reserved
3310      * byte, as many digits as don't overflow, and the three letters are
3311      * stored.
3312      *
3313      * The reserved byte has two bits:
3314      *      0x1 if set indicates that if we get here, it is a recursive call of
3315      *          getenv()
3316      *      0x2 if set indicates that the call is from the logging code.
3317      *
3318      * If the flag indicates this is a recursive call, just return the stored
3319      * value of PL_mem_log;  An empty value gets turned into NULL. */
3320     if (strEQ(str, "PERL_MEM_LOG") && PL_mem_log[0] & 0x1) {
3321         if (PL_mem_log[1] == '\0') {
3322             return NULL;
3323         } else {
3324             return PL_mem_log + 1;
3325         }
3326     }
3327
3328 #endif
3329
3330     GETENV_LOCK;
3331
3332 #ifdef PERL_MEM_LOG
3333
3334     /* Here we are in a critical section.  As explained above, we do our own
3335      * getenv(PERL_MEM_LOG), saving the result safely. */
3336     ret = getenv("PERL_MEM_LOG");
3337     if (ret == NULL) {  /* No logging active */
3338
3339         /* Return that immediately if called from the logging code */
3340         if (PL_mem_log[0] & 0x2) {
3341             GETENV_UNLOCK;
3342             return NULL;
3343         }
3344
3345         PL_mem_log[1] = '\0';
3346     }
3347     else {
3348         char *mem_log_meat = PL_mem_log + 1;    /* first byte reserved */
3349
3350         /* There is nothing to prevent the value of PERL_MEM_LOG from being an
3351          * extremely long string.  But we want only a few characters from it.
3352          * PL_mem_log has been made large enough to hold just the ones we need.
3353          * First the file descriptor. */
3354         if (isDIGIT(*ret)) {
3355             const char * s = ret;
3356             if (UNLIKELY(*s == '0')) {
3357
3358                 /* Reduce multiple leading zeros to a single one.  This is to
3359                  * allow the caller to change what to do with leading zeros. */
3360                 *mem_log_meat++ = '0';
3361                 s++;
3362                 while (*s == '0') {
3363                     s++;
3364                 }
3365             }
3366
3367             /* If the input overflows, copy just enough for the result to also
3368              * overflow, plus 1 to make sure */
3369             while (isDIGIT(*s) && s < ret + TYPE_DIGITS(UV) + 1) {
3370                 *mem_log_meat++ = *s++;
3371             }
3372         }
3373
3374         /* Then each of the three significant characters */
3375         if (strchr(ret, 'm')) {
3376             *mem_log_meat++ = 'm';
3377         }
3378         if (strchr(ret, 's')) {
3379             *mem_log_meat++ = 's';
3380         }
3381         if (strchr(ret, 't')) {
3382             *mem_log_meat++ = 't';
3383         }
3384         *mem_log_meat = '\0';
3385
3386         assert(mem_log_meat < PL_mem_log + sizeof(PL_mem_log));
3387     }
3388
3389     /* If we are being called from the logger, it only needs the significant
3390      * portion of PERL_MEM_LOG, and doesn't need a safe copy */
3391     if (PL_mem_log[0] & 0x2) {
3392         assert(strEQ(str, "PERL_MEM_LOG"));
3393         GETENV_UNLOCK;
3394         return PL_mem_log + 1;
3395     }
3396
3397     /* Here is a generic getenv().  This could be a getenv("PERL_MEM_LOG") that
3398      * is coming from other than the logging code, so it should be treated the
3399      * same as any other getenv(), returning the full value, not just the
3400      * significant part, and having its value saved.  Set the flag that
3401      * indicates any call to this routine will be a recursion from here */
3402     PL_mem_log[0] = 0x1;
3403
3404 #endif
3405
3406     /* Now get the value of the real desired variable, and save a copy */
3407     ret = getenv(str);
3408
3409     if (ret != NULL) {
3410         ret = SvPVX( newSVpvn_flags(ret, strlen(ret) ,SVs_TEMP) );
3411     }
3412
3413     GETENV_UNLOCK;
3414
3415 #ifdef PERL_MEM_LOG
3416
3417     /* Clear the buffer */
3418     Zero(PL_mem_log, sizeof(PL_mem_log), char);
3419
3420 #endif
3421
3422     return ret;
3423 }
3424
3425 PERL_STATIC_INLINE bool
3426 Perl_sv_isbool(pTHX_ const SV *sv)
3427 {
3428     return SvIOK(sv) && SvPOK(sv) && SvIsCOW_static(sv) &&
3429         (SvPVX_const(sv) == PL_Yes || SvPVX_const(sv) == PL_No);
3430 }
3431
3432 #ifdef USE_ITHREADS
3433
3434 PERL_STATIC_INLINE AV *
3435 Perl_cop_file_avn(pTHX_ const COP *cop) {
3436
3437     PERL_ARGS_ASSERT_COP_FILE_AVN;
3438
3439     const char *file = CopFILE(cop);
3440     if (file) {
3441         GV *gv = gv_fetchfile_flags(file, strlen(file), GVF_NOADD);
3442         if (gv) {
3443             return GvAVn(gv);
3444         }
3445         else
3446             return NULL;
3447      }
3448      else
3449          return NULL;
3450 }
3451
3452 #endif
3453
3454 /*
3455  * ex: set ts=8 sts=4 sw=4 et:
3456  */