inline.h

   1 /*    inline.h
   2  *
   3  *    Copyright (C) 2012 by Larry Wall and others
   4  *
   5  *    You may distribute under the terms of either the GNU General Public
   6  *    License or the Artistic License, as specified in the README file.
   7  *
   8  *    This file contains tables and code adapted from
   9  *    https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this
  10  *    copyright notice:
  11
  12 Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
  13
  14 Permission is hereby granted, free of charge, to any person obtaining a copy of
  15 this software and associated documentation files (the "Software"), to deal in
  16 the Software without restriction, including without limitation the rights to
  17 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  18 of the Software, and to permit persons to whom the Software is furnished to do
  19 so, subject to the following conditions:
  20
  21 The above copyright notice and this permission notice shall be included in all
  22 copies or substantial portions of the Software.
  23
  24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  25 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  26 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  27 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  28 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  29 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  30 SOFTWARE.
  31
  32  *
  33  * This file is a home for static inline functions that cannot go in other
  34  * header files, because they depend on proto.h (included after most other
  35  * headers) or struct definitions.
  36  *
  37  * Each section names the header file that the functions "belong" to.
  38  */
  39
  40 /* ------------------------------- av.h ------------------------------- */
  41
  42 /*
  43 =for apidoc_section $AV
  44 =for apidoc av_count
  45 Returns the number of elements in the array C<av>.  This is the true length of
  46 the array, including any undefined elements.  It is always the same as
  47 S<C<av_top_index(av) + 1>>.
  48
  49 =cut
  50 */
  51 PERL_STATIC_INLINE Size_t
  52 Perl_av_count(pTHX_ AV *av)
  53 {
  54     PERL_ARGS_ASSERT_AV_COUNT;
  55     assert(SvTYPE(av) == SVt_PVAV);
  56
  57     return AvFILL(av) + 1;
  58 }
  59
  60 /* ------------------------------- av.c ------------------------------- */
  61
  62 /*
  63 =for apidoc av_store_simple
  64
  65 This is a cut-down version of av_store that assumes that the array is
  66 very straightforward - no magic, not readonly, and AvREAL - and that
  67 C<key> is not negative. This function MUST NOT be used in situations
  68 where any of those assumptions may not hold.
  69
  70 Stores an SV in an array.  The array index is specified as C<key>. It
  71 can be dereferenced to get the C<SV*> that was stored there (= C<val>)).
  72
  73 Note that the caller is responsible for suitably incrementing the reference
  74 count of C<val> before the call.
  75
  76 Approximate Perl equivalent: C<splice(@myarray, $key, 1, $val)>.
  77
  78 =cut
  79 */
  80
  81 PERL_STATIC_INLINE SV**
  82 Perl_av_store_simple(pTHX_ AV *av, SSize_t key, SV *val)
  83 {
  84     SV** ary;
  85
  86     PERL_ARGS_ASSERT_AV_STORE_SIMPLE;
  87     assert(SvTYPE(av) == SVt_PVAV);
  88     assert(!SvMAGICAL(av));
  89     assert(!SvREADONLY(av));
  90     assert(AvREAL(av));
  91     assert(key > -1);
  92
  93     ary = AvARRAY(av);
  94
  95     if (AvFILLp(av) < key) {
  96         if (key > AvMAX(av)) {
  97             av_extend(av,key);
  98             ary = AvARRAY(av);
  99         }
 100         AvFILLp(av) = key;
 101     } else
 102         SvREFCNT_dec(ary[key]);
 103
 104     ary[key] = val;
 105     return &ary[key];
 106 }
 107
 108 /*
 109 =for apidoc av_fetch_simple
 110
 111 This is a cut-down version of av_fetch that assumes that the array is
 112 very straightforward - no magic, not readonly, and AvREAL - and that
 113 C<key> is not negative. This function MUST NOT be used in situations
 114 where any of those assumptions may not hold.
 115
 116 Returns the SV at the specified index in the array.  The C<key> is the
 117 index.  If lval is true, you are guaranteed to get a real SV back (in case
 118 it wasn't real before), which you can then modify.  Check that the return
 119 value is non-null before dereferencing it to a C<SV*>.
 120
 121 The rough perl equivalent is C<$myarray[$key]>.
 122
 123 =cut
 124 */
 125
 126 PERL_STATIC_INLINE SV**
 127 Perl_av_fetch_simple(pTHX_ AV *av, SSize_t key, I32 lval)
 128 {
 129     PERL_ARGS_ASSERT_AV_FETCH_SIMPLE;
 130     assert(SvTYPE(av) == SVt_PVAV);
 131     assert(!SvMAGICAL(av));
 132     assert(!SvREADONLY(av));
 133     assert(AvREAL(av));
 134     assert(key > -1);
 135
 136     if ( (key > AvFILLp(av)) || !AvARRAY(av)[key]) {
 137         return lval ? av_store_simple(av,key,newSV_type(SVt_NULL)) : NULL;
 138     } else {
 139         return &AvARRAY(av)[key];
 140     }
 141 }
 142
 143 /* ------------------------------- cv.h ------------------------------- */
 144
 145 /*
 146 =for apidoc_section $CV
 147 =for apidoc CvGV
 148 Returns the GV associated with the CV C<sv>, reifying it if necessary.
 149
 150 =cut
 151 */
 152 PERL_STATIC_INLINE GV *
 153 Perl_CvGV(pTHX_ CV *sv)
 154 {
 155     PERL_ARGS_ASSERT_CVGV;
 156
 157     return CvNAMED(sv)
 158         ? Perl_cvgv_from_hek(aTHX_ sv)
 159         : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
 160 }
 161
 162 /*
 163 =for apidoc CvDEPTH
 164 Returns the recursion level of the CV C<sv>.  Hence >= 2 indicates we are in a
 165 recursive call.
 166
 167 =cut
 168 */
 169 PERL_STATIC_INLINE I32 *
 170 Perl_CvDEPTH(const CV * const sv)
 171 {
 172     PERL_ARGS_ASSERT_CVDEPTH;
 173     assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
 174
 175     return &((XPVCV*)SvANY(sv))->xcv_depth;
 176 }
 177
 178 /*
 179  CvPROTO returns the prototype as stored, which is not necessarily what
 180  the interpreter should be using. Specifically, the interpreter assumes
 181  that spaces have been stripped, which has been the case if the prototype
 182  was added by toke.c, but is generally not the case if it was added elsewhere.
 183  Since we can't enforce the spacelessness at assignment time, this routine
 184  provides a temporary copy at parse time with spaces removed.
 185  I<orig> is the start of the original buffer, I<len> is the length of the
 186  prototype and will be updated when this returns.
 187  */
 188
 189 #ifdef PERL_CORE
 190 PERL_STATIC_INLINE char *
 191 S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
 192 {
 193     SV * tmpsv;
 194     char * tmps;
 195     tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
 196     tmps = SvPVX(tmpsv);
 197     while ((*len)--) {
 198         if (!isSPACE(*orig))
 199             *tmps++ = *orig;
 200         orig++;
 201     }
 202     *tmps = '\0';
 203     *len = tmps - SvPVX(tmpsv);
 204                 return SvPVX(tmpsv);
 205 }
 206 #endif
 207
 208 /* ------------------------------- mg.h ------------------------------- */
 209
 210 #if defined(PERL_CORE) || defined(PERL_EXT)
 211 /* assumes get-magic and stringification have already occurred */
 212 PERL_STATIC_INLINE STRLEN
 213 S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
 214 {
 215     assert(mg->mg_type == PERL_MAGIC_regex_global);
 216     assert(mg->mg_len != -1);
 217     if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv))
 218         return (STRLEN)mg->mg_len;
 219     else {
 220         const STRLEN pos = (STRLEN)mg->mg_len;
 221         /* Without this check, we may read past the end of the buffer: */
 222         if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
 223         return sv_or_pv_pos_u2b(sv, s, pos, NULL);
 224     }
 225 }
 226 #endif
 227
 228 /* ------------------------------- pad.h ------------------------------ */
 229
 230 #if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
 231 PERL_STATIC_INLINE bool
 232 S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
 233 {
 234     PERL_ARGS_ASSERT_PADNAMEIN_SCOPE;
 235
 236     /* is seq within the range _LOW to _HIGH ?
 237      * This is complicated by the fact that PL_cop_seqmax
 238      * may have wrapped around at some point */
 239     if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
 240         return FALSE; /* not yet introduced */
 241
 242     if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
 243     /* in compiling scope */
 244         if (
 245             (seq >  COP_SEQ_RANGE_LOW(pn))
 246             ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
 247             : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
 248         )
 249             return TRUE;
 250     }
 251     else if (
 252         (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
 253         ?
 254             (  seq >  COP_SEQ_RANGE_LOW(pn)
 255             || seq <= COP_SEQ_RANGE_HIGH(pn))
 256
 257         :    (  seq >  COP_SEQ_RANGE_LOW(pn)
 258              && seq <= COP_SEQ_RANGE_HIGH(pn))
 259     )
 260         return TRUE;
 261     return FALSE;
 262 }
 263 #endif
 264
 265 /* ------------------------------- pp.h ------------------------------- */
 266
 267 PERL_STATIC_INLINE I32
 268 Perl_TOPMARK(pTHX)
 269 {
 270     DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
 271                                  "MARK top  %p %" IVdf "\n",
 272                                   PL_markstack_ptr,
 273                                   (IV)*PL_markstack_ptr)));
 274     return *PL_markstack_ptr;
 275 }
 276
 277 PERL_STATIC_INLINE I32
 278 Perl_POPMARK(pTHX)
 279 {
 280     DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
 281                                  "MARK pop  %p %" IVdf "\n",
 282                                   (PL_markstack_ptr-1),
 283                                   (IV)*(PL_markstack_ptr-1))));
 284     assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
 285     return *PL_markstack_ptr--;
 286 }
 287
 288 /* ----------------------------- regexp.h ----------------------------- */
 289
 290 /* PVLVs need to act as a superset of all scalar types - they are basically
 291  * PVMGs with a few extra fields.
 292  * REGEXPs are first class scalars, but have many fields that can't be copied
 293  * into a PVLV body.
 294  *
 295  * Hence we take a different approach - instead of a copy, PVLVs store a pointer
 296  * back to the original body. To avoid increasing the size of PVLVs just for the
 297  * rare case of REGEXP assignment, this pointer is stored in the memory usually
 298  * used for SvLEN(). Hence the check for SVt_PVLV below, and the ? : ternary to
 299  * read the pointer from the two possible locations. The macro SvLEN() wraps the
 300  * access to the union's member xpvlenu_len, but there is no equivalent macro
 301  * for wrapping the union's member xpvlenu_rx, hence the direct reference here.
 302  *
 303  * See commit df6b4bd56551f2d3 for more details. */
 304
 305 PERL_STATIC_INLINE struct regexp *
 306 Perl_ReANY(const REGEXP * const re)
 307 {
 308     XPV* const p = (XPV*)SvANY(re);
 309
 310     PERL_ARGS_ASSERT_REANY;
 311     assert(isREGEXP(re));
 312
 313     return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
 314                                    : (struct regexp *)p;
 315 }
 316
 317 /* ------------------------------- sv.h ------------------------------- */
 318
 319 PERL_STATIC_INLINE bool
 320 Perl_SvTRUE(pTHX_ SV *sv)
 321 {
 322     PERL_ARGS_ASSERT_SVTRUE;
 323
 324     if (UNLIKELY(sv == NULL))
 325         return FALSE;
 326     SvGETMAGIC(sv);
 327     return SvTRUE_nomg_NN(sv);
 328 }
 329
 330 PERL_STATIC_INLINE bool
 331 Perl_SvTRUE_nomg(pTHX_ SV *sv)
 332 {
 333     PERL_ARGS_ASSERT_SVTRUE_NOMG;
 334
 335     if (UNLIKELY(sv == NULL))
 336         return FALSE;
 337     return SvTRUE_nomg_NN(sv);
 338 }
 339
 340 PERL_STATIC_INLINE bool
 341 Perl_SvTRUE_NN(pTHX_ SV *sv)
 342 {
 343     PERL_ARGS_ASSERT_SVTRUE_NN;
 344
 345     SvGETMAGIC(sv);
 346     return SvTRUE_nomg_NN(sv);
 347 }
 348
 349 PERL_STATIC_INLINE bool
 350 Perl_SvTRUE_common(pTHX_ SV * sv, const bool sv_2bool_is_fallback)
 351 {
 352     PERL_ARGS_ASSERT_SVTRUE_COMMON;
 353
 354     if (UNLIKELY(SvIMMORTAL_INTERP(sv)))
 355         return SvIMMORTAL_TRUE(sv);
 356
 357     if (! SvOK(sv))
 358         return FALSE;
 359
 360     if (SvPOK(sv))
 361         return SvPVXtrue(sv);
 362
 363     if (SvIOK(sv))
 364         return SvIVX(sv) != 0; /* casts to bool */
 365
 366     if (SvROK(sv) && !(SvOBJECT(SvRV(sv)) && HvAMAGIC(SvSTASH(SvRV(sv)))))
 367         return TRUE;
 368
 369     if (sv_2bool_is_fallback)
 370         return sv_2bool_nomg(sv);
 371
 372     return isGV_with_GP(sv);
 373 }
 374
 375
 376 PERL_STATIC_INLINE SV *
 377 Perl_SvREFCNT_inc(SV *sv)
 378 {
 379     if (LIKELY(sv != NULL))
 380         SvREFCNT(sv)++;
 381     return sv;
 382 }
 383 PERL_STATIC_INLINE SV *
 384 Perl_SvREFCNT_inc_NN(SV *sv)
 385 {
 386     PERL_ARGS_ASSERT_SVREFCNT_INC_NN;
 387
 388     SvREFCNT(sv)++;
 389     return sv;
 390 }
 391 PERL_STATIC_INLINE void
 392 Perl_SvREFCNT_inc_void(SV *sv)
 393 {
 394     if (LIKELY(sv != NULL))
 395         SvREFCNT(sv)++;
 396 }
 397 PERL_STATIC_INLINE void
 398 Perl_SvREFCNT_dec(pTHX_ SV *sv)
 399 {
 400     if (LIKELY(sv != NULL)) {
 401         U32 rc = SvREFCNT(sv);
 402         if (LIKELY(rc > 1))
 403             SvREFCNT(sv) = rc - 1;
 404         else
 405             Perl_sv_free2(aTHX_ sv, rc);
 406     }
 407 }
 408
 409 PERL_STATIC_INLINE void
 410 Perl_SvREFCNT_dec_NN(pTHX_ SV *sv)
 411 {
 412     U32 rc = SvREFCNT(sv);
 413
 414     PERL_ARGS_ASSERT_SVREFCNT_DEC_NN;
 415
 416     if (LIKELY(rc > 1))
 417         SvREFCNT(sv) = rc - 1;
 418     else
 419         Perl_sv_free2(aTHX_ sv, rc);
 420 }
 421
 422 /*
 423 =for apidoc SvAMAGIC_on
 424
 425 Indicate that C<sv> has overloading (active magic) enabled.
 426
 427 =cut
 428 */
 429
 430 PERL_STATIC_INLINE void
 431 Perl_SvAMAGIC_on(SV *sv)
 432 {
 433     PERL_ARGS_ASSERT_SVAMAGIC_ON;
 434     assert(SvROK(sv));
 435
 436     if (SvOBJECT(SvRV(sv))) HvAMAGIC_on(SvSTASH(SvRV(sv)));
 437 }
 438
 439 /*
 440 =for apidoc SvAMAGIC_off
 441
 442 Indicate that C<sv> has overloading (active magic) disabled.
 443
 444 =cut
 445 */
 446
 447 PERL_STATIC_INLINE void
 448 Perl_SvAMAGIC_off(SV *sv)
 449 {
 450     PERL_ARGS_ASSERT_SVAMAGIC_OFF;
 451
 452     if (SvROK(sv) && SvOBJECT(SvRV(sv)))
 453         HvAMAGIC_off(SvSTASH(SvRV(sv)));
 454 }
 455
 456 PERL_STATIC_INLINE U32
 457 Perl_SvPADSTALE_on(SV *sv)
 458 {
 459     assert(!(SvFLAGS(sv) & SVs_PADTMP));
 460     return SvFLAGS(sv) |= SVs_PADSTALE;
 461 }
 462 PERL_STATIC_INLINE U32
 463 Perl_SvPADSTALE_off(SV *sv)
 464 {
 465     assert(!(SvFLAGS(sv) & SVs_PADTMP));
 466     return SvFLAGS(sv) &= ~SVs_PADSTALE;
 467 }
 468 #if defined(PERL_CORE) || defined (PERL_EXT)
 469 PERL_STATIC_INLINE STRLEN
 470 S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp)
 471 {
 472     PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B;
 473     if (SvGAMAGIC(sv)) {
 474         U8 *hopped = utf8_hop((U8 *)pv, pos);
 475         if (lenp) *lenp = (STRLEN)(utf8_hop(hopped, *lenp) - hopped);
 476         return (STRLEN)(hopped - (U8 *)pv);
 477     }
 478     return sv_pos_u2b_flags(sv,pos,lenp,SV_CONST_RETURN);
 479 }
 480 #endif
 481
 482 /* ------------------------------- utf8.h ------------------------------- */
 483
 484 /*
 485 =for apidoc_section $unicode
 486 */
 487
 488 PERL_STATIC_INLINE void
 489 Perl_append_utf8_from_native_byte(const U8 byte, U8** dest)
 490 {
 491     /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
 492      * encoded string at '*dest', updating '*dest' to include it */
 493
 494     PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
 495
 496     if (NATIVE_BYTE_IS_INVARIANT(byte))
 497         *((*dest)++) = byte;
 498     else {
 499         *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
 500         *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
 501     }
 502 }
 503
 504 /*
 505 =for apidoc valid_utf8_to_uvchr
 506 Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is
 507 known that the next character in the input UTF-8 string C<s> is well-formed
 508 (I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>.  Surrogates, non-character code
 509 points, and non-Unicode code points are allowed.
 510
 511 =cut
 512
 513  */
 514
 515 PERL_STATIC_INLINE UV
 516 Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
 517 {
 518     const UV expectlen = UTF8SKIP(s);
 519     const U8* send = s + expectlen;
 520     UV uv = *s;
 521
 522     PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
 523
 524     if (retlen) {
 525         *retlen = expectlen;
 526     }
 527
 528     /* An invariant is trivially returned */
 529     if (expectlen == 1) {
 530         return uv;
 531     }
 532
 533     /* Remove the leading bits that indicate the number of bytes, leaving just
 534      * the bits that are part of the value */
 535     uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
 536
 537     /* Now, loop through the remaining bytes, accumulating each into the
 538      * working total as we go.  (I khw tried unrolling the loop for up to 4
 539      * bytes, but there was no performance improvement) */
 540     for (++s; s < send; s++) {
 541         uv = UTF8_ACCUMULATE(uv, *s);
 542     }
 543
 544     return UNI_TO_NATIVE(uv);
 545
 546 }
 547
 548 /*
 549 =for apidoc is_utf8_invariant_string
 550
 551 Returns TRUE if the first C<len> bytes of the string C<s> are the same
 552 regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
 553 EBCDIC machines); otherwise it returns FALSE.  That is, it returns TRUE if they
 554 are UTF-8 invariant.  On ASCII-ish machines, all the ASCII characters and only
 555 the ASCII characters fit this definition.  On EBCDIC machines, the ASCII-range
 556 characters are invariant, but so also are the C1 controls.
 557
 558 If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
 559 use this option, that C<s> can't have embedded C<NUL> characters and has to
 560 have a terminating C<NUL> byte).
 561
 562 See also
 563 C<L</is_utf8_string>>,
 564 C<L</is_utf8_string_flags>>,
 565 C<L</is_utf8_string_loc>>,
 566 C<L</is_utf8_string_loc_flags>>,
 567 C<L</is_utf8_string_loclen>>,
 568 C<L</is_utf8_string_loclen_flags>>,
 569 C<L</is_utf8_fixed_width_buf_flags>>,
 570 C<L</is_utf8_fixed_width_buf_loc_flags>>,
 571 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
 572 C<L</is_strict_utf8_string>>,
 573 C<L</is_strict_utf8_string_loc>>,
 574 C<L</is_strict_utf8_string_loclen>>,
 575 C<L</is_c9strict_utf8_string>>,
 576 C<L</is_c9strict_utf8_string_loc>>,
 577 and
 578 C<L</is_c9strict_utf8_string_loclen>>.
 579
 580 =cut
 581
 582 */
 583
 584 #define is_utf8_invariant_string(s, len)                                    \
 585                                 is_utf8_invariant_string_loc(s, len, NULL)
 586
 587 /*
 588 =for apidoc is_utf8_invariant_string_loc
 589
 590 Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
 591 the first UTF-8 variant character in the C<ep> pointer; if all characters are
 592 UTF-8 invariant, this function does not change the contents of C<*ep>.
 593
 594 =cut
 595
 596 */
 597
 598 PERL_STATIC_INLINE bool
 599 Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
 600 {
 601     const U8* send;
 602     const U8* x = s;
 603
 604     PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
 605
 606     if (len == 0) {
 607         len = strlen((const char *)s);
 608     }
 609
 610     send = s + len;
 611
 612 /* This looks like 0x010101... */
 613 #  define PERL_COUNT_MULTIPLIER   (~ (UINTMAX_C(0)) / 0xFF)
 614
 615 /* This looks like 0x808080... */
 616 #  define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
 617 #  define PERL_WORDSIZE            sizeof(PERL_UINTMAX_T)
 618 #  define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
 619
 620 /* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
 621  * or'ing together the lowest bits of 'x'.  Hopefully the final term gets
 622  * optimized out completely on a 32-bit system, and its mask gets optimized out
 623  * on a 64-bit system */
 624 #  define PERL_IS_SUBWORD_ADDR(x) (1 & (       PTR2nat(x)                     \
 625                                       |   (  PTR2nat(x) >> 1)                 \
 626                                       | ( ( (PTR2nat(x)                       \
 627                                            & PERL_WORD_BOUNDARY_MASK) >> 2))))
 628
 629 #ifndef EBCDIC
 630
 631     /* Do the word-at-a-time iff there is at least one usable full word.  That
 632      * means that after advancing to a word boundary, there still is at least a
 633      * full word left.  The number of bytes needed to advance is 'wordsize -
 634      * offset' unless offset is 0. */
 635     if ((STRLEN) (send - x) >= PERL_WORDSIZE
 636
 637                             /* This term is wordsize if subword; 0 if not */
 638                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
 639
 640                             /* 'offset' */
 641                           - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
 642     {
 643
 644         /* Process per-byte until reach word boundary.  XXX This loop could be
 645          * eliminated if we knew that this platform had fast unaligned reads */
 646         while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
 647             if (! UTF8_IS_INVARIANT(*x)) {
 648                 if (ep) {
 649                     *ep = x;
 650                 }
 651
 652                 return FALSE;
 653             }
 654             x++;
 655         }
 656
 657         /* Here, we know we have at least one full word to process.  Process
 658          * per-word as long as we have at least a full word left */
 659         do {
 660             if ((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK)  {
 661
 662                 /* Found a variant.  Just return if caller doesn't want its
 663                  * exact position */
 664                 if (! ep) {
 665                     return FALSE;
 666                 }
 667
 668 #  if   BYTEORDER == 0x1234 || BYTEORDER == 0x12345678    \
 669      || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
 670
 671                 *ep = x + variant_byte_number(* (PERL_UINTMAX_T *) x);
 672                 assert(*ep >= s && *ep < send);
 673
 674                 return FALSE;
 675
 676 #  else   /* If weird byte order, drop into next loop to do byte-at-a-time
 677            checks. */
 678
 679                 break;
 680 #  endif
 681             }
 682
 683             x += PERL_WORDSIZE;
 684
 685         } while (x + PERL_WORDSIZE <= send);
 686     }
 687
 688 #endif      /* End of ! EBCDIC */
 689
 690     /* Process per-byte */
 691     while (x < send) {
 692         if (! UTF8_IS_INVARIANT(*x)) {
 693             if (ep) {
 694                 *ep = x;
 695             }
 696
 697             return FALSE;
 698         }
 699
 700         x++;
 701     }
 702
 703     return TRUE;
 704 }
 705
 706 /* See if the platform has builtins for finding the most/least significant bit,
 707  * and which one is right for using on 32 and 64 bit operands */
 708 #if (__has_builtin(__builtin_clz) || PERL_GCC_VERSION_GE(3,4,0))
 709 #  if U32SIZE == INTSIZE
 710 #    define PERL_CLZ_32 __builtin_clz
 711 #  endif
 712 #  if defined(U64TYPE) && U64SIZE == INTSIZE
 713 #    define PERL_CLZ_64 __builtin_clz
 714 #  endif
 715 #endif
 716 #if (__has_builtin(__builtin_ctz) || PERL_GCC_VERSION_GE(3,4,0))
 717 #  if U32SIZE == INTSIZE
 718 #    define PERL_CTZ_32 __builtin_ctz
 719 #  endif
 720 #  if defined(U64TYPE) && U64SIZE == INTSIZE
 721 #    define PERL_CTZ_64 __builtin_ctz
 722 #  endif
 723 #endif
 724
 725 #if (__has_builtin(__builtin_clzl) || PERL_GCC_VERSION_GE(3,4,0))
 726 #  if U32SIZE == LONGSIZE && ! defined(PERL_CLZ_32)
 727 #    define PERL_CLZ_32 __builtin_clzl
 728 #  endif
 729 #  if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CLZ_64)
 730 #    define PERL_CLZ_64 __builtin_clzl
 731 #  endif
 732 #endif
 733 #if (__has_builtin(__builtin_ctzl) || PERL_GCC_VERSION_GE(3,4,0))
 734 #  if U32SIZE == LONGSIZE && ! defined(PERL_CTZ_32)
 735 #    define PERL_CTZ_32 __builtin_ctzl
 736 #  endif
 737 #  if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CTZ_64)
 738 #    define PERL_CTZ_64 __builtin_ctzl
 739 #  endif
 740 #endif
 741
 742 #if (__has_builtin(__builtin_clzll) || PERL_GCC_VERSION_GE(3,4,0))
 743 #  if U32SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_32)
 744 #    define PERL_CLZ_32 __builtin_clzll
 745 #  endif
 746 #  if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_64)
 747 #    define PERL_CLZ_64 __builtin_clzll
 748 #  endif
 749 #endif
 750 #if (__has_builtin(__builtin_ctzll) || PERL_GCC_VERSION_GE(3,4,0))
 751 #  if U32SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_32)
 752 #    define PERL_CTZ_32 __builtin_ctzll
 753 #  endif
 754 #  if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_64)
 755 #    define PERL_CTZ_64 __builtin_ctzll
 756 #  endif
 757 #endif
 758
 759 #if defined(_MSC_VER)
 760 #  include <intrin.h>
 761 #  pragma intrinsic(_BitScanForward)
 762 #  pragma intrinsic(_BitScanReverse)
 763 #  ifdef _WIN64
 764 #    pragma intrinsic(_BitScanForward64)
 765 #    pragma intrinsic(_BitScanReverse64)
 766 #  endif
 767 #endif
 768
 769 /* The reason there are not checks to see if ffs() and ffsl() are available for
 770  * determining the lsb, is because these don't improve on the deBruijn method
 771  * fallback, which is just a branchless integer multiply, array element
 772  * retrieval, and shift.  The others, even if the function call overhead is
 773  * optimized out, have to cope with the possibility of the input being all
 774  * zeroes, and almost certainly will have conditionals for this eventuality.
 775  * khw, at the time of this commit, looked at the source for both gcc and clang
 776  * to verify this.  (gcc used a method inferior to deBruijn.) */
 777
 778 /* Below are functions to find the first, last, or only set bit in a word.  On
 779  * platforms with 64-bit capability, there is a pair for each operation; the
 780  * first taking a 64 bit operand, and the second a 32 bit one.  The logic is
 781  * the same in each pair, so the second is stripped of most comments. */
 782
 783 #ifdef U64TYPE  /* HAS_QUAD not usable outside the core */
 784
 785 PERL_STATIC_INLINE unsigned
 786 Perl_lsbit_pos64(U64 word)
 787 {
 788     /* Find the position (0..63) of the least significant set bit in the input
 789      * word */
 790
 791     ASSUME(word != 0);
 792
 793     /* If we can determine that the platform has a usable fast method to get
 794      * this info, use that */
 795
 796 #  if defined(PERL_CTZ_64)
 797 #    define PERL_HAS_FAST_GET_LSB_POS64
 798
 799     return (unsigned) PERL_CTZ_64(word);
 800
 801 #  elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER)
 802 #    define PERL_HAS_FAST_GET_LSB_POS64
 803
 804     {
 805         unsigned long index;
 806         _BitScanForward64(&index, word);
 807         return (unsigned)index;
 808     }
 809
 810 #  else
 811
 812     /* Here, we didn't find a fast method for finding the lsb.  Fall back to
 813      * making the lsb the only set bit in the word, and use our function that
 814      * works on words with a single bit set.
 815      *
 816      * Isolate the lsb;
 817      * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
 818      *
 819      * The word will look like this, with a rightmost set bit in position 's':
 820      * ('x's are don't cares, and 'y's are their complements)
 821      *      s
 822      *  x..x100..00
 823      *  y..y011..11      Complement
 824      *  y..y100..00      Add 1
 825      *  0..0100..00      And with the original
 826      *
 827      *  (Yes, complementing and adding 1 is just taking the negative on 2's
 828      *  complement machines, but not on 1's complement ones, and some compilers
 829      *  complain about negating an unsigned.)
 830      */
 831     return single_1bit_pos64(word & (~word + 1));
 832
 833 #  endif
 834
 835 }
 836
 837 #  define lsbit_pos_uintmax_(word) lsbit_pos64(word)
 838 #else   /* ! QUAD */
 839 #  define lsbit_pos_uintmax_(word) lsbit_pos32(word)
 840 #endif
 841
 842 PERL_STATIC_INLINE unsigned     /* Like above for 32 bit word */
 843 Perl_lsbit_pos32(U32 word)
 844 {
 845     /* Find the position (0..31) of the least significant set bit in the input
 846      * word */
 847
 848     ASSUME(word != 0);
 849
 850 #if defined(PERL_CTZ_32)
 851 #  define PERL_HAS_FAST_GET_LSB_POS32
 852
 853     return (unsigned) PERL_CTZ_32(word);
 854
 855 #elif U32SIZE == 4 && defined(_MSC_VER)
 856 #  define PERL_HAS_FAST_GET_LSB_POS32
 857
 858     {
 859         unsigned long index;
 860         _BitScanForward(&index, word);
 861         return (unsigned)index;
 862     }
 863
 864 #else
 865
 866     return single_1bit_pos32(word & (~word + 1));
 867
 868 #endif
 869
 870 }
 871
 872
 873 /* Convert the leading zeros count to the bit position of the first set bit.
 874  * This just subtracts from the highest position, 31 or 63.  But some compilers
 875  * don't optimize this optimally, and so a bit of bit twiddling encourages them
 876  * to do the right thing.  It turns out that subtracting a smaller non-negative
 877  * number 'x' from 2**n-1 for any n is the same as taking the exclusive-or of
 878  * the two numbers.  To see why, first note that the sum of any number, x, and
 879  * its complement, x', is all ones.  So all ones minus x is x'.  Then note that
 880  * the xor of x and all ones is x'. */
 881 #define LZC_TO_MSBIT_POS_(size, lzc)  ((size##SIZE * CHARBITS - 1) ^ (lzc))
 882
 883 #ifdef U64TYPE  /* HAS_QUAD not usable outside the core */
 884
 885 PERL_STATIC_INLINE unsigned
 886 Perl_msbit_pos64(U64 word)
 887 {
 888     /* Find the position (0..63) of the most significant set bit in the input
 889      * word */
 890
 891     ASSUME(word != 0);
 892
 893     /* If we can determine that the platform has a usable fast method to get
 894      * this, use that */
 895
 896 #  if defined(PERL_CLZ_64)
 897 #    define PERL_HAS_FAST_GET_MSB_POS64
 898
 899     return (unsigned) LZC_TO_MSBIT_POS_(U64, PERL_CLZ_64(word));
 900
 901 #  elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER)
 902 #    define PERL_HAS_FAST_GET_MSB_POS64
 903
 904     {
 905         unsigned long index;
 906         _BitScanReverse64(&index, word);
 907         return (unsigned)index;
 908     }
 909
 910 #  else
 911
 912     /* Here, we didn't find a fast method for finding the msb.  Fall back to
 913      * making the msb the only set bit in the word, and use our function that
 914      * works on words with a single bit set.
 915      *
 916      * Isolate the msb; http://codeforces.com/blog/entry/10330
 917      *
 918      * Only the most significant set bit matters.  Or'ing word with its right
 919      * shift of 1 makes that bit and the next one to its right both 1.
 920      * Repeating that with the right shift of 2 makes for 4 1-bits in a row.
 921      * ...  We end with the msb and all to the right being 1. */
 922     word |= (word >>  1);
 923     word |= (word >>  2);
 924     word |= (word >>  4);
 925     word |= (word >>  8);
 926     word |= (word >> 16);
 927     word |= (word >> 32);
 928
 929     /* Then subtracting the right shift by 1 clears all but the left-most of
 930      * the 1 bits, which is our desired result */
 931     word -= (word >> 1);
 932
 933     /* Now we have a single bit set */
 934     return single_1bit_pos64(word);
 935
 936 #  endif
 937
 938 }
 939
 940 #  define msbit_pos_uintmax_(word) msbit_pos64(word)
 941 #else   /* ! QUAD */
 942 #  define msbit_pos_uintmax_(word) msbit_pos32(word)
 943 #endif
 944
 945 PERL_STATIC_INLINE unsigned
 946 Perl_msbit_pos32(U32 word)
 947 {
 948     /* Find the position (0..31) of the most significant set bit in the input
 949      * word */
 950
 951     ASSUME(word != 0);
 952
 953 #if defined(PERL_CLZ_32)
 954 #  define PERL_HAS_FAST_GET_MSB_POS32
 955
 956     return (unsigned) LZC_TO_MSBIT_POS_(U32, PERL_CLZ_32(word));
 957
 958 #elif U32SIZE == 4 && defined(_MSC_VER)
 959 #  define PERL_HAS_FAST_GET_MSB_POS32
 960
 961     {
 962         unsigned long index;
 963         _BitScanReverse(&index, word);
 964         return (unsigned)index;
 965     }
 966
 967 #else
 968
 969     word |= (word >>  1);
 970     word |= (word >>  2);
 971     word |= (word >>  4);
 972     word |= (word >>  8);
 973     word |= (word >> 16);
 974     word -= (word >> 1);
 975     return single_1bit_pos32(word);
 976
 977 #endif
 978
 979 }
 980
 981 #if UVSIZE == U64SIZE
 982 #  define msbit_pos(word)  msbit_pos64(word)
 983 #  define lsbit_pos(word)  lsbit_pos64(word)
 984 #elif UVSIZE == U32SIZE
 985 #  define msbit_pos(word)  msbit_pos32(word)
 986 #  define lsbit_pos(word)  lsbit_pos32(word)
 987 #endif
 988
 989 #ifdef U64TYPE  /* HAS_QUAD not usable outside the core */
 990
 991 PERL_STATIC_INLINE unsigned
 992 Perl_single_1bit_pos64(U64 word)
 993 {
 994     /* Given a 64-bit word known to contain all zero bits except one 1 bit,
 995      * find and return the 1's position: 0..63 */
 996
 997 #  ifdef PERL_CORE    /* macro not exported */
 998     ASSUME(isPOWER_OF_2(word));
 999 #  else
1000     ASSUME(word && (word & (word-1)) == 0);
1001 #  endif
1002
1003     /* The only set bit is both the most and least significant bit.  If we have
1004      * a fast way of finding either one, use that.
1005      *
1006      * It may appear at first glance that those functions call this one, but
1007      * they don't if the corresponding #define is set */
1008
1009 #  ifdef PERL_HAS_FAST_GET_MSB_POS64
1010
1011     return msbit_pos64(word);
1012
1013 #  elif defined(PERL_HAS_FAST_GET_LSB_POS64)
1014
1015     return lsbit_pos64(word);
1016
1017 #  else
1018
1019     /* The position of the only set bit in a word can be quickly calculated
1020      * using deBruijn sequences.  See for example
1021      * https://en.wikipedia.org/wiki/De_Bruijn_sequence */
1022     return PL_deBruijn_bitpos_tab64[(word * PERL_deBruijnMagic64_)
1023                                                     >> PERL_deBruijnShift64_];
1024 #  endif
1025
1026 }
1027
1028 #endif
1029
1030 PERL_STATIC_INLINE unsigned
1031 Perl_single_1bit_pos32(U32 word)
1032 {
1033     /* Given a 32-bit word known to contain all zero bits except one 1 bit,
1034      * find and return the 1's position: 0..31 */
1035
1036 #ifdef PERL_CORE    /* macro not exported */
1037     ASSUME(isPOWER_OF_2(word));
1038 #else
1039     ASSUME(word && (word & (word-1)) == 0);
1040 #endif
1041 #ifdef PERL_HAS_FAST_GET_MSB_POS32
1042
1043     return msbit_pos32(word);
1044
1045 #elif defined(PERL_HAS_FAST_GET_LSB_POS32)
1046
1047     return lsbit_pos32(word);
1048
1049 /* Unlikely, but possible for the platform to have a wider fast operation but
1050  * not a narrower one.  But easy enough to handle the case by widening the
1051  * parameter size.  (Going the other way, emulating 64 bit by two 32 bit ops
1052  * would be slower than the deBruijn method.) */
1053 #elif defined(PERL_HAS_FAST_GET_MSB_POS64)
1054
1055     return msbit_pos64(word);
1056
1057 #elif defined(PERL_HAS_FAST_GET_LSB_POS64)
1058
1059     return lsbit_pos64(word);
1060
1061 #else
1062
1063     return PL_deBruijn_bitpos_tab32[(word * PERL_deBruijnMagic32_)
1064                                                     >> PERL_deBruijnShift32_];
1065 #endif
1066
1067 }
1068
1069 #ifndef EBCDIC
1070
1071 PERL_STATIC_INLINE unsigned int
1072 Perl_variant_byte_number(PERL_UINTMAX_T word)
1073 {
1074     /* This returns the position in a word (0..7) of the first variant byte in
1075      * it.  This is a helper function.  Note that there are no branches */
1076
1077     /* Get just the msb bits of each byte */
1078     word &= PERL_VARIANTS_WORD_MASK;
1079
1080     /* This should only be called if we know there is a variant byte in the
1081      * word */
1082     assert(word);
1083
1084 #  if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
1085
1086     /* Bytes are stored like
1087      *  Byte8 ... Byte2 Byte1
1088      *  63..56...15...8 7...0
1089      * so getting the lsb of the whole modified word is getting the msb of the
1090      * first byte that has its msb set */
1091     word = lsbit_pos_uintmax_(word);
1092
1093     /* Here, word contains the position 7,15,23,...55,63 of that bit.  Convert
1094      * to 0..7 */
1095     return (unsigned int) ((word + 1) >> 3) - 1;
1096
1097 #  elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
1098
1099     /* Bytes are stored like
1100      *  Byte1 Byte2  ... Byte8
1101      * 63..56 55..47 ... 7...0
1102      * so getting the msb of the whole modified word is getting the msb of the
1103      * first byte that has its msb set */
1104     word = msbit_pos_uintmax_(word);
1105
1106     /* Here, word contains the position 63,55,...,23,15,7 of that bit.  Convert
1107      * to 0..7 */
1108     word = ((word + 1) >> 3) - 1;
1109
1110     /* And invert the result because of the reversed byte order on this
1111      * platform */
1112     word = CHARBITS - word - 1;
1113
1114     return (unsigned int) word;
1115
1116 #  else
1117 #    error Unexpected byte order
1118 #  endif
1119
1120 }
1121
1122 #endif
1123 #if defined(PERL_CORE) || defined(PERL_EXT)
1124
1125 /*
1126 =for apidoc variant_under_utf8_count
1127
1128 This function looks at the sequence of bytes between C<s> and C<e>, which are
1129 assumed to be encoded in ASCII/Latin1, and returns how many of them would
1130 change should the string be translated into UTF-8.  Due to the nature of UTF-8,
1131 each of these would occupy two bytes instead of the single one in the input
1132 string.  Thus, this function returns the precise number of bytes the string
1133 would expand by when translated to UTF-8.
1134
1135 Unlike most of the other functions that have C<utf8> in their name, the input
1136 to this function is NOT a UTF-8-encoded string.  The function name is slightly
1137 I<odd> to emphasize this.
1138
1139 This function is internal to Perl because khw thinks that any XS code that
1140 would want this is probably operating too close to the internals.  Presenting a
1141 valid use case could change that.
1142
1143 See also
1144 C<L<perlapi/is_utf8_invariant_string>>
1145 and
1146 C<L<perlapi/is_utf8_invariant_string_loc>>,
1147
1148 =cut
1149
1150 */
1151
1152 PERL_STATIC_INLINE Size_t
1153 S_variant_under_utf8_count(const U8* const s, const U8* const e)
1154 {
1155     const U8* x = s;
1156     Size_t count = 0;
1157
1158     PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
1159
1160 #  ifndef EBCDIC
1161
1162     /* Test if the string is long enough to use word-at-a-time.  (Logic is the
1163      * same as for is_utf8_invariant_string()) */
1164     if ((STRLEN) (e - x) >= PERL_WORDSIZE
1165                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
1166                           - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
1167     {
1168
1169         /* Process per-byte until reach word boundary.  XXX This loop could be
1170          * eliminated if we knew that this platform had fast unaligned reads */
1171         while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
1172             count += ! UTF8_IS_INVARIANT(*x++);
1173         }
1174
1175         /* Process per-word as long as we have at least a full word left */
1176         do {    /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
1177                    explanation of how this works */
1178             PERL_UINTMAX_T increment
1179                 = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
1180                       * PERL_COUNT_MULTIPLIER)
1181                     >> ((PERL_WORDSIZE - 1) * CHARBITS);
1182             count += (Size_t) increment;
1183             x += PERL_WORDSIZE;
1184         } while (x + PERL_WORDSIZE <= e);
1185     }
1186
1187 #  endif
1188
1189     /* Process per-byte */
1190     while (x < e) {
1191         if (! UTF8_IS_INVARIANT(*x)) {
1192             count++;
1193         }
1194
1195         x++;
1196     }
1197
1198     return count;
1199 }
1200
1201 #endif
1202
1203 #ifndef PERL_IN_REGEXEC_C   /* Keep  these around for that file */
1204 #  undef PERL_WORDSIZE
1205 #  undef PERL_COUNT_MULTIPLIER
1206 #  undef PERL_WORD_BOUNDARY_MASK
1207 #  undef PERL_VARIANTS_WORD_MASK
1208 #endif
1209
1210 /*
1211 =for apidoc is_utf8_string
1212
1213 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1214 Perl-extended-UTF-8 string; returns FALSE otherwise.  If C<len> is 0, it will
1215 be calculated using C<strlen(s)> (which means if you use this option, that C<s>
1216 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
1217 byte).  Note that all characters being ASCII constitute 'a valid UTF-8 string'.
1218
1219 This function considers Perl's extended UTF-8 to be valid.  That means that
1220 code points above Unicode, surrogates, and non-character code points are
1221 considered valid by this function.  Use C<L</is_strict_utf8_string>>,
1222 C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
1223 code points are considered valid.
1224
1225 See also
1226 C<L</is_utf8_invariant_string>>,
1227 C<L</is_utf8_invariant_string_loc>>,
1228 C<L</is_utf8_string_loc>>,
1229 C<L</is_utf8_string_loclen>>,
1230 C<L</is_utf8_fixed_width_buf_flags>>,
1231 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1232 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1233
1234 =cut
1235 */
1236
1237 #define is_utf8_string(s, len)  is_utf8_string_loclen(s, len, NULL, NULL)
1238
1239 #if defined(PERL_CORE) || defined (PERL_EXT)
1240
1241 /*
1242 =for apidoc is_utf8_non_invariant_string
1243
1244 Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
1245 C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
1246 UTF-8; otherwise returns FALSE.
1247
1248 A TRUE return means that at least one code point represented by the sequence
1249 either is a wide character not representable as a single byte, or the
1250 representation differs depending on whether the sequence is encoded in UTF-8 or
1251 not.
1252
1253 See also
1254 C<L<perlapi/is_utf8_invariant_string>>,
1255 C<L<perlapi/is_utf8_string>>
1256
1257 =cut
1258
1259 This is commonly used to determine if a SV's UTF-8 flag should be turned on.
1260 It generally needn't be if its string is entirely UTF-8 invariant, and it
1261 shouldn't be if it otherwise contains invalid UTF-8.
1262
1263 It is an internal function because khw thinks that XS code shouldn't be working
1264 at this low a level.  A valid use case could change that.
1265
1266 */
1267
1268 PERL_STATIC_INLINE bool
1269 Perl_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
1270 {
1271     const U8 * first_variant;
1272
1273     PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
1274
1275     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1276         return FALSE;
1277     }
1278
1279     return is_utf8_string(first_variant, len - (first_variant - s));
1280 }
1281
1282 #endif
1283
1284 /*
1285 =for apidoc is_strict_utf8_string
1286
1287 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1288 UTF-8-encoded string that is fully interchangeable by any application using
1289 Unicode rules; otherwise it returns FALSE.  If C<len> is 0, it will be
1290 calculated using C<strlen(s)> (which means if you use this option, that C<s>
1291 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
1292 byte).  Note that all characters being ASCII constitute 'a valid UTF-8 string'.
1293
1294 This function returns FALSE for strings containing any
1295 code points above the Unicode max of 0x10FFFF, surrogate code points, or
1296 non-character code points.
1297
1298 See also
1299 C<L</is_utf8_invariant_string>>,
1300 C<L</is_utf8_invariant_string_loc>>,
1301 C<L</is_utf8_string>>,
1302 C<L</is_utf8_string_flags>>,
1303 C<L</is_utf8_string_loc>>,
1304 C<L</is_utf8_string_loc_flags>>,
1305 C<L</is_utf8_string_loclen>>,
1306 C<L</is_utf8_string_loclen_flags>>,
1307 C<L</is_utf8_fixed_width_buf_flags>>,
1308 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1309 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1310 C<L</is_strict_utf8_string_loc>>,
1311 C<L</is_strict_utf8_string_loclen>>,
1312 C<L</is_c9strict_utf8_string>>,
1313 C<L</is_c9strict_utf8_string_loc>>,
1314 and
1315 C<L</is_c9strict_utf8_string_loclen>>.
1316
1317 =cut
1318 */
1319
1320 #define is_strict_utf8_string(s, len)  is_strict_utf8_string_loclen(s, len, NULL, NULL)
1321
1322 /*
1323 =for apidoc is_c9strict_utf8_string
1324
1325 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1326 UTF-8-encoded string that conforms to
1327 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
1328 otherwise it returns FALSE.  If C<len> is 0, it will be calculated using
1329 C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
1330 C<NUL> characters and has to have a terminating C<NUL> byte).  Note that all
1331 characters being ASCII constitute 'a valid UTF-8 string'.
1332
1333 This function returns FALSE for strings containing any code points above the
1334 Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
1335 code points per
1336 L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1337
1338 See also
1339 C<L</is_utf8_invariant_string>>,
1340 C<L</is_utf8_invariant_string_loc>>,
1341 C<L</is_utf8_string>>,
1342 C<L</is_utf8_string_flags>>,
1343 C<L</is_utf8_string_loc>>,
1344 C<L</is_utf8_string_loc_flags>>,
1345 C<L</is_utf8_string_loclen>>,
1346 C<L</is_utf8_string_loclen_flags>>,
1347 C<L</is_utf8_fixed_width_buf_flags>>,
1348 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1349 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1350 C<L</is_strict_utf8_string>>,
1351 C<L</is_strict_utf8_string_loc>>,
1352 C<L</is_strict_utf8_string_loclen>>,
1353 C<L</is_c9strict_utf8_string_loc>>,
1354 and
1355 C<L</is_c9strict_utf8_string_loclen>>.
1356
1357 =cut
1358 */
1359
1360 #define is_c9strict_utf8_string(s, len)  is_c9strict_utf8_string_loclen(s, len, NULL, 0)
1361
1362 /*
1363 =for apidoc is_utf8_string_flags
1364
1365 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1366 UTF-8 string, subject to the restrictions imposed by C<flags>;
1367 returns FALSE otherwise.  If C<len> is 0, it will be calculated
1368 using C<strlen(s)> (which means if you use this option, that C<s> can't have
1369 embedded C<NUL> characters and has to have a terminating C<NUL> byte).  Note
1370 that all characters being ASCII constitute 'a valid UTF-8 string'.
1371
1372 If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
1373 C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
1374 as C<L</is_strict_utf8_string>>; and if C<flags> is
1375 C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
1376 C<L</is_c9strict_utf8_string>>.  Otherwise C<flags> may be any
1377 combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
1378 C<L</utf8n_to_uvchr>>, with the same meanings.
1379
1380 See also
1381 C<L</is_utf8_invariant_string>>,
1382 C<L</is_utf8_invariant_string_loc>>,
1383 C<L</is_utf8_string>>,
1384 C<L</is_utf8_string_loc>>,
1385 C<L</is_utf8_string_loc_flags>>,
1386 C<L</is_utf8_string_loclen>>,
1387 C<L</is_utf8_string_loclen_flags>>,
1388 C<L</is_utf8_fixed_width_buf_flags>>,
1389 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1390 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1391 C<L</is_strict_utf8_string>>,
1392 C<L</is_strict_utf8_string_loc>>,
1393 C<L</is_strict_utf8_string_loclen>>,
1394 C<L</is_c9strict_utf8_string>>,
1395 C<L</is_c9strict_utf8_string_loc>>,
1396 and
1397 C<L</is_c9strict_utf8_string_loclen>>.
1398
1399 =cut
1400 */
1401
1402 PERL_STATIC_INLINE bool
1403 Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
1404 {
1405     const U8 * first_variant;
1406
1407     PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
1408     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1409                           |UTF8_DISALLOW_PERL_EXTENDED)));
1410
1411     if (len == 0) {
1412         len = strlen((const char *)s);
1413     }
1414
1415     if (flags == 0) {
1416         return is_utf8_string(s, len);
1417     }
1418
1419     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1420                                         == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1421     {
1422         return is_strict_utf8_string(s, len);
1423     }
1424
1425     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1426                                        == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1427     {
1428         return is_c9strict_utf8_string(s, len);
1429     }
1430
1431     if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
1432         const U8* const send = s + len;
1433         const U8* x = first_variant;
1434
1435         while (x < send) {
1436             STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1437             if (UNLIKELY(! cur_len)) {
1438                 return FALSE;
1439             }
1440             x += cur_len;
1441         }
1442     }
1443
1444     return TRUE;
1445 }
1446
1447 /*
1448
1449 =for apidoc is_utf8_string_loc
1450
1451 Like C<L</is_utf8_string>> but stores the location of the failure (in the
1452 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1453 "utf8ness success") in the C<ep> pointer.
1454
1455 See also C<L</is_utf8_string_loclen>>.
1456
1457 =cut
1458 */
1459
1460 #define is_utf8_string_loc(s, len, ep)  is_utf8_string_loclen(s, len, ep, 0)
1461
1462 /*
1463
1464 =for apidoc is_utf8_string_loclen
1465
1466 Like C<L</is_utf8_string>> but stores the location of the failure (in the
1467 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1468 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1469 encoded characters in the C<el> pointer.
1470
1471 See also C<L</is_utf8_string_loc>>.
1472
1473 =cut
1474 */
1475
1476 PERL_STATIC_INLINE bool
1477 Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1478 {
1479     const U8 * first_variant;
1480
1481     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
1482
1483     if (len == 0) {
1484         len = strlen((const char *) s);
1485     }
1486
1487     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1488         if (el)
1489             *el = len;
1490
1491         if (ep) {
1492             *ep = s + len;
1493         }
1494
1495         return TRUE;
1496     }
1497
1498     {
1499         const U8* const send = s + len;
1500         const U8* x = first_variant;
1501         STRLEN outlen = first_variant - s;
1502
1503         while (x < send) {
1504             const STRLEN cur_len = isUTF8_CHAR(x, send);
1505             if (UNLIKELY(! cur_len)) {
1506                 break;
1507             }
1508             x += cur_len;
1509             outlen++;
1510         }
1511
1512         if (el)
1513             *el = outlen;
1514
1515         if (ep) {
1516             *ep = x;
1517         }
1518
1519         return (x == send);
1520     }
1521 }
1522
1523 /* The perl core arranges to never call the DFA below without there being at
1524  * least one byte available to look at.  This allows the DFA to use a do {}
1525  * while loop which means that calling it with a UTF-8 invariant has a single
1526  * conditional, same as the calling code checking for invariance ahead of time.
1527  * And having the calling code remove that conditional speeds up by that
1528  * conditional, the case where it wasn't invariant.  So there's no reason to
1529  * check before caling this.
1530  *
1531  * But we don't know this for non-core calls, so have to retain the check for
1532  * them. */
1533 #ifdef PERL_CORE
1534 #  define PERL_NON_CORE_CHECK_EMPTY(s,e)  assert((e) > (s))
1535 #else
1536 #  define PERL_NON_CORE_CHECK_EMPTY(s,e)  if ((e) <= (s)) return FALSE
1537 #endif
1538
1539 /*
1540  * DFA for checking input is valid UTF-8 syntax.
1541  *
1542  * This uses adaptations of the table and algorithm given in
1543  * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1544  * documentation of the original version.  A copyright notice for the original
1545  * version is given at the beginning of this file.  The Perl adapations are
1546  * documented at the definition of PL_extended_utf8_dfa_tab[].
1547  *
1548  * This dfa is fast.  There are three exit conditions:
1549  *  1) a well-formed code point, acceptable to the table
1550  *  2) the beginning bytes of an incomplete character, whose completion might
1551  *     or might not be acceptable
1552  *  3) unacceptable to the table.  Some of the adaptations have certain,
1553  *     hopefully less likely to occur, legal inputs be unacceptable to the
1554  *     table, so these must be sorted out afterwards.
1555  *
1556  * This macro is a complete implementation of the code executing the DFA.  It
1557  * is passed the input sequence bounds and the table to use, and what to do
1558  * for each of the exit conditions.  There are three canned actions, likely to
1559  * be the ones you want:
1560  *      DFA_RETURN_SUCCESS_
1561  *      DFA_RETURN_FAILURE_
1562  *      DFA_GOTO_TEASE_APART_FF_
1563  *
1564  * You pass a parameter giving the action to take for each of the three
1565  * possible exit conditions:
1566  *
1567  * 'accept_action'  This is executed when the DFA accepts the input.
1568  *                  DFA_RETURN_SUCCESS_ is the most likely candidate.
1569  * 'reject_action'  This is executed when the DFA rejects the input.
1570  *                  DFA_RETURN_FAILURE_ is a candidate, or 'goto label' where
1571  *                  you have written code to distinguish the rejecting state
1572  *                  results.  Because it happens in several places, and
1573  *                  involves #ifdefs, the special action
1574  *                  DFA_GOTO_TEASE_APART_FF_ is what you want with
1575  *                  PL_extended_utf8_dfa_tab.  On platforms without
1576  *                  EXTRA_LONG_UTF8, there is no need to tease anything apart,
1577  *                  so this evaluates to DFA_RETURN_FAILURE_; otherwise you
1578  *                  need to have a label 'tease_apart_FF' that it will transfer
1579  *                  to.
1580  * 'incomplete_char_action'  This is executed when the DFA ran off the end
1581  *                  before accepting or rejecting the input.
1582  *                  DFA_RETURN_FAILURE_ is the likely action, but you could
1583  *                  have a 'goto', or NOOP.  In the latter case the DFA drops
1584  *                  off the end, and you place your code to handle this case
1585  *                  immediately after it.
1586  */
1587
1588 #define DFA_RETURN_SUCCESS_      return s - s0
1589 #define DFA_RETURN_FAILURE_      return 0
1590 #ifdef HAS_EXTRA_LONG_UTF8
1591 #  define DFA_TEASE_APART_FF_  goto tease_apart_FF
1592 #else
1593 #  define DFA_TEASE_APART_FF_  DFA_RETURN_FAILURE_
1594 #endif
1595
1596 #define PERL_IS_UTF8_CHAR_DFA(s0, e, dfa_tab,                               \
1597                               accept_action,                                \
1598                               reject_action,                                \
1599                               incomplete_char_action)                       \
1600     STMT_START {                                                            \
1601         const U8 * s = s0;                                                  \
1602         UV state = 0;                                                       \
1603                                                                             \
1604         PERL_NON_CORE_CHECK_EMPTY(s,e);                                     \
1605                                                                             \
1606         do {                                                                \
1607             state = dfa_tab[256 + state + dfa_tab[*s]];                     \
1608             s++;                                                            \
1609                                                                             \
1610             if (state == 0) {   /* Accepting state */                       \
1611                 accept_action;                                              \
1612             }                                                               \
1613                                                                             \
1614             if (UNLIKELY(state == 1)) { /* Rejecting state */               \
1615                 reject_action;                                              \
1616             }                                                               \
1617         } while (s < e);                                                    \
1618                                                                             \
1619         /* Here, dropped out of loop before end-of-char */                  \
1620         incomplete_char_action;                                             \
1621     } STMT_END
1622
1623
1624 /*
1625
1626 =for apidoc isUTF8_CHAR
1627
1628 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1629 looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
1630 that represents some code point; otherwise it evaluates to 0.  If non-zero, the
1631 value gives how many bytes starting at C<s> comprise the code point's
1632 representation.  Any bytes remaining before C<e>, but beyond the ones needed to
1633 form the first code point in C<s>, are not examined.
1634
1635 The code point can be any that will fit in an IV on this machine, using Perl's
1636 extension to official UTF-8 to represent those higher than the Unicode maximum
1637 of 0x10FFFF.  That means that this macro is used to efficiently decide if the
1638 next few bytes in C<s> is legal UTF-8 for a single character.
1639
1640 Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
1641 defined by Unicode to be fully interchangeable across applications;
1642 C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1643 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1644 code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1645
1646 Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
1647 C<L</is_utf8_string_loclen>> to check entire strings.
1648
1649 Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
1650 machines) is a valid UTF-8 character.
1651
1652 =cut
1653
1654 This uses an adaptation of the table and algorithm given in
1655 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1656 documentation of the original version.  A copyright notice for the original
1657 version is given at the beginning of this file.  The Perl adapation is
1658 documented at the definition of PL_extended_utf8_dfa_tab[].
1659 */
1660
1661 PERL_STATIC_INLINE Size_t
1662 Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
1663 {
1664     PERL_ARGS_ASSERT_ISUTF8_CHAR;
1665
1666     PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
1667                           DFA_RETURN_SUCCESS_,
1668                           DFA_TEASE_APART_FF_,
1669                           DFA_RETURN_FAILURE_);
1670
1671     /* Here, we didn't return success, but dropped out of the loop.  In the
1672      * case of PL_extended_utf8_dfa_tab, this means the input is either
1673      * malformed, or the start byte was FF on a platform that the dfa doesn't
1674      * handle FF's.  Call a helper function. */
1675
1676 #ifdef HAS_EXTRA_LONG_UTF8
1677
1678   tease_apart_FF:
1679
1680     /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
1681      * either malformed, or was for the largest possible start byte, which we
1682      * now check, not inline */
1683     if (*s0 != I8_TO_NATIVE_UTF8(0xFF)) {
1684         return 0;
1685     }
1686
1687     return is_utf8_FF_helper_(s0, e,
1688                               FALSE /* require full, not partial char */
1689                              );
1690 #endif
1691
1692 }
1693
1694 /*
1695
1696 =for apidoc isSTRICT_UTF8_CHAR
1697
1698 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1699 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1700 Unicode code point completely acceptable for open interchange between all
1701 applications; otherwise it evaluates to 0.  If non-zero, the value gives how
1702 many bytes starting at C<s> comprise the code point's representation.  Any
1703 bytes remaining before C<e>, but beyond the ones needed to form the first code
1704 point in C<s>, are not examined.
1705
1706 The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
1707 be a surrogate nor a non-character code point.  Thus this excludes any code
1708 point from Perl's extended UTF-8.
1709
1710 This is used to efficiently decide if the next few bytes in C<s> is
1711 legal Unicode-acceptable UTF-8 for a single character.
1712
1713 Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1714 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1715 code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
1716 and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1717
1718 Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
1719 C<L</is_strict_utf8_string_loclen>> to check entire strings.
1720
1721 =cut
1722
1723 This uses an adaptation of the tables and algorithm given in
1724 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1725 documentation of the original version.  A copyright notice for the original
1726 version is given at the beginning of this file.  The Perl adapation is
1727 documented at the definition of strict_extended_utf8_dfa_tab[].
1728
1729 */
1730
1731 PERL_STATIC_INLINE Size_t
1732 Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
1733 {
1734     PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
1735
1736     PERL_IS_UTF8_CHAR_DFA(s0, e, PL_strict_utf8_dfa_tab,
1737                           DFA_RETURN_SUCCESS_,
1738                           goto check_hanguls,
1739                           DFA_RETURN_FAILURE_);
1740   check_hanguls:
1741
1742     /* Here, we didn't return success, but dropped out of the loop.  In the
1743      * case of PL_strict_utf8_dfa_tab, this means the input is either
1744      * malformed, or was for certain Hanguls; handle them specially */
1745
1746     /* The dfa above drops out for incomplete or illegal inputs, and certain
1747      * legal Hanguls; check and return accordingly */
1748     return is_HANGUL_ED_utf8_safe(s0, e);
1749 }
1750
1751 /*
1752
1753 =for apidoc isC9_STRICT_UTF8_CHAR
1754
1755 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1756 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1757 Unicode non-surrogate code point; otherwise it evaluates to 0.  If non-zero,
1758 the value gives how many bytes starting at C<s> comprise the code point's
1759 representation.  Any bytes remaining before C<e>, but beyond the ones needed to
1760 form the first code point in C<s>, are not examined.
1761
1762 The largest acceptable code point is the Unicode maximum 0x10FFFF.  This
1763 differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
1764 code points.  This corresponds to
1765 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1766 which said that non-character code points are merely discouraged rather than
1767 completely forbidden in open interchange.  See
1768 L<perlunicode/Noncharacter code points>.
1769
1770 Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
1771 C<L</isUTF8_CHAR_flags>> for a more customized definition.
1772
1773 Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
1774 C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
1775
1776 =cut
1777
1778 This uses an adaptation of the tables and algorithm given in
1779 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1780 documentation of the original version.  A copyright notice for the original
1781 version is given at the beginning of this file.  The Perl adapation is
1782 documented at the definition of PL_c9_utf8_dfa_tab[].
1783
1784 */
1785
1786 PERL_STATIC_INLINE Size_t
1787 Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
1788 {
1789     PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
1790
1791     PERL_IS_UTF8_CHAR_DFA(s0, e, PL_c9_utf8_dfa_tab,
1792                           DFA_RETURN_SUCCESS_,
1793                           DFA_RETURN_FAILURE_,
1794                           DFA_RETURN_FAILURE_);
1795 }
1796
1797 /*
1798
1799 =for apidoc is_strict_utf8_string_loc
1800
1801 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1802 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1803 "utf8ness success") in the C<ep> pointer.
1804
1805 See also C<L</is_strict_utf8_string_loclen>>.
1806
1807 =cut
1808 */
1809
1810 #define is_strict_utf8_string_loc(s, len, ep)                               \
1811                                 is_strict_utf8_string_loclen(s, len, ep, 0)
1812
1813 /*
1814
1815 =for apidoc is_strict_utf8_string_loclen
1816
1817 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1818 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1819 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1820 encoded characters in the C<el> pointer.
1821
1822 See also C<L</is_strict_utf8_string_loc>>.
1823
1824 =cut
1825 */
1826
1827 PERL_STATIC_INLINE bool
1828 Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1829 {
1830     const U8 * first_variant;
1831
1832     PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
1833
1834     if (len == 0) {
1835         len = strlen((const char *) s);
1836     }
1837
1838     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1839         if (el)
1840             *el = len;
1841
1842         if (ep) {
1843             *ep = s + len;
1844         }
1845
1846         return TRUE;
1847     }
1848
1849     {
1850         const U8* const send = s + len;
1851         const U8* x = first_variant;
1852         STRLEN outlen = first_variant - s;
1853
1854         while (x < send) {
1855             const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
1856             if (UNLIKELY(! cur_len)) {
1857                 break;
1858             }
1859             x += cur_len;
1860             outlen++;
1861         }
1862
1863         if (el)
1864             *el = outlen;
1865
1866         if (ep) {
1867             *ep = x;
1868         }
1869
1870         return (x == send);
1871     }
1872 }
1873
1874 /*
1875
1876 =for apidoc is_c9strict_utf8_string_loc
1877
1878 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1879 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1880 "utf8ness success") in the C<ep> pointer.
1881
1882 See also C<L</is_c9strict_utf8_string_loclen>>.
1883
1884 =cut
1885 */
1886
1887 #define is_c9strict_utf8_string_loc(s, len, ep)                             \
1888                             is_c9strict_utf8_string_loclen(s, len, ep, 0)
1889
1890 /*
1891
1892 =for apidoc is_c9strict_utf8_string_loclen
1893
1894 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1895 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1896 "utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
1897 characters in the C<el> pointer.
1898
1899 See also C<L</is_c9strict_utf8_string_loc>>.
1900
1901 =cut
1902 */
1903
1904 PERL_STATIC_INLINE bool
1905 Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1906 {
1907     const U8 * first_variant;
1908
1909     PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
1910
1911     if (len == 0) {
1912         len = strlen((const char *) s);
1913     }
1914
1915     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1916         if (el)
1917             *el = len;
1918
1919         if (ep) {
1920             *ep = s + len;
1921         }
1922
1923         return TRUE;
1924     }
1925
1926     {
1927         const U8* const send = s + len;
1928         const U8* x = first_variant;
1929         STRLEN outlen = first_variant - s;
1930
1931         while (x < send) {
1932             const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
1933             if (UNLIKELY(! cur_len)) {
1934                 break;
1935             }
1936             x += cur_len;
1937             outlen++;
1938         }
1939
1940         if (el)
1941             *el = outlen;
1942
1943         if (ep) {
1944             *ep = x;
1945         }
1946
1947         return (x == send);
1948     }
1949 }
1950
1951 /*
1952
1953 =for apidoc is_utf8_string_loc_flags
1954
1955 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1956 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1957 "utf8ness success") in the C<ep> pointer.
1958
1959 See also C<L</is_utf8_string_loclen_flags>>.
1960
1961 =cut
1962 */
1963
1964 #define is_utf8_string_loc_flags(s, len, ep, flags)                         \
1965                         is_utf8_string_loclen_flags(s, len, ep, 0, flags)
1966
1967
1968 /* The above 3 actual functions could have been moved into the more general one
1969  * just below, and made #defines that call it with the right 'flags'.  They are
1970  * currently kept separate to increase their chances of getting inlined */
1971
1972 /*
1973
1974 =for apidoc is_utf8_string_loclen_flags
1975
1976 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1977 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1978 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1979 encoded characters in the C<el> pointer.
1980
1981 See also C<L</is_utf8_string_loc_flags>>.
1982
1983 =cut
1984 */
1985
1986 PERL_STATIC_INLINE bool
1987 Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
1988 {
1989     const U8 * first_variant;
1990
1991     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
1992     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1993                           |UTF8_DISALLOW_PERL_EXTENDED)));
1994
1995     if (len == 0) {
1996         len = strlen((const char *) s);
1997     }
1998
1999     if (flags == 0) {
2000         return is_utf8_string_loclen(s, len, ep, el);
2001     }
2002
2003     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
2004                                         == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
2005     {
2006         return is_strict_utf8_string_loclen(s, len, ep, el);
2007     }
2008
2009     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
2010                                     == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
2011     {
2012         return is_c9strict_utf8_string_loclen(s, len, ep, el);
2013     }
2014
2015     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
2016         if (el)
2017             *el = len;
2018
2019         if (ep) {
2020             *ep = s + len;
2021         }
2022
2023         return TRUE;
2024     }
2025
2026     {
2027         const U8* send = s + len;
2028         const U8* x = first_variant;
2029         STRLEN outlen = first_variant - s;
2030
2031         while (x < send) {
2032             const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
2033             if (UNLIKELY(! cur_len)) {
2034                 break;
2035             }
2036             x += cur_len;
2037             outlen++;
2038         }
2039
2040         if (el)
2041             *el = outlen;
2042
2043         if (ep) {
2044             *ep = x;
2045         }
2046
2047         return (x == send);
2048     }
2049 }
2050
2051 /*
2052 =for apidoc utf8_distance
2053
2054 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
2055 and C<b>.
2056
2057 WARNING: use only if you *know* that the pointers point inside the
2058 same UTF-8 buffer.
2059
2060 =cut
2061 */
2062
2063 PERL_STATIC_INLINE IV
2064 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
2065 {
2066     PERL_ARGS_ASSERT_UTF8_DISTANCE;
2067
2068     return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
2069 }
2070
2071 /*
2072 =for apidoc utf8_hop
2073
2074 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
2075 forward or backward.
2076
2077 WARNING: do not use the following unless you *know* C<off> is within
2078 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
2079 on the first byte of character or just after the last byte of a character.
2080
2081 =cut
2082 */
2083
2084 PERL_STATIC_INLINE U8 *
2085 Perl_utf8_hop(const U8 *s, SSize_t off)
2086 {
2087     PERL_ARGS_ASSERT_UTF8_HOP;
2088
2089     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2090      * the bitops (especially ~) can create illegal UTF-8.
2091      * In other words: in Perl UTF-8 is not just for Unicode. */
2092
2093     if (off >= 0) {
2094         while (off--)
2095             s += UTF8SKIP(s);
2096     }
2097     else {
2098         while (off++) {
2099             s--;
2100             while (UTF8_IS_CONTINUATION(*s))
2101                 s--;
2102         }
2103     }
2104     GCC_DIAG_IGNORE(-Wcast-qual)
2105     return (U8 *)s;
2106     GCC_DIAG_RESTORE
2107 }
2108
2109 /*
2110 =for apidoc utf8_hop_forward
2111
2112 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2113 forward.
2114
2115 C<off> must be non-negative.
2116
2117 C<s> must be before or equal to C<end>.
2118
2119 When moving forward it will not move beyond C<end>.
2120
2121 Will not exceed this limit even if the string is not valid "UTF-8".
2122
2123 =cut
2124 */
2125
2126 PERL_STATIC_INLINE U8 *
2127 Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
2128 {
2129     PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
2130
2131     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2132      * the bitops (especially ~) can create illegal UTF-8.
2133      * In other words: in Perl UTF-8 is not just for Unicode. */
2134
2135     assert(s <= end);
2136     assert(off >= 0);
2137
2138     while (off--) {
2139         STRLEN skip = UTF8SKIP(s);
2140         if ((STRLEN)(end - s) <= skip) {
2141             GCC_DIAG_IGNORE(-Wcast-qual)
2142             return (U8 *)end;
2143             GCC_DIAG_RESTORE
2144         }
2145         s += skip;
2146     }
2147
2148     GCC_DIAG_IGNORE(-Wcast-qual)
2149     return (U8 *)s;
2150     GCC_DIAG_RESTORE
2151 }
2152
2153 /*
2154 =for apidoc utf8_hop_back
2155
2156 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2157 backward.
2158
2159 C<off> must be non-positive.
2160
2161 C<s> must be after or equal to C<start>.
2162
2163 When moving backward it will not move before C<start>.
2164
2165 Will not exceed this limit even if the string is not valid "UTF-8".
2166
2167 =cut
2168 */
2169
2170 PERL_STATIC_INLINE U8 *
2171 Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
2172 {
2173     PERL_ARGS_ASSERT_UTF8_HOP_BACK;
2174
2175     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2176      * the bitops (especially ~) can create illegal UTF-8.
2177      * In other words: in Perl UTF-8 is not just for Unicode. */
2178
2179     assert(start <= s);
2180     assert(off <= 0);
2181
2182     while (off++ && s > start) {
2183         do {
2184             s--;
2185         } while (UTF8_IS_CONTINUATION(*s) && s > start);
2186     }
2187
2188     GCC_DIAG_IGNORE(-Wcast-qual)
2189     return (U8 *)s;
2190     GCC_DIAG_RESTORE
2191 }
2192
2193 /*
2194 =for apidoc utf8_hop_safe
2195
2196 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2197 either forward or backward.
2198
2199 When moving backward it will not move before C<start>.
2200
2201 When moving forward it will not move beyond C<end>.
2202
2203 Will not exceed those limits even if the string is not valid "UTF-8".
2204
2205 =cut
2206 */
2207
2208 PERL_STATIC_INLINE U8 *
2209 Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
2210 {
2211     PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
2212
2213     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2214      * the bitops (especially ~) can create illegal UTF-8.
2215      * In other words: in Perl UTF-8 is not just for Unicode. */
2216
2217     assert(start <= s && s <= end);
2218
2219     if (off >= 0) {
2220         return utf8_hop_forward(s, off, end);
2221     }
2222     else {
2223         return utf8_hop_back(s, off, start);
2224     }
2225 }
2226
2227 /*
2228
2229 =for apidoc isUTF8_CHAR_flags
2230
2231 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
2232 looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
2233 that represents some code point, subject to the restrictions given by C<flags>;
2234 otherwise it evaluates to 0.  If non-zero, the value gives how many bytes
2235 starting at C<s> comprise the code point's representation.  Any bytes remaining
2236 before C<e>, but beyond the ones needed to form the first code point in C<s>,
2237 are not examined.
2238
2239 If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>;
2240 if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
2241 as C<L</isSTRICT_UTF8_CHAR>>;
2242 and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives
2243 the same results as C<L</isC9_STRICT_UTF8_CHAR>>.
2244 Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags
2245 understood by C<L</utf8n_to_uvchr>>, with the same meanings.
2246
2247 The three alternative macros are for the most commonly needed validations; they
2248 are likely to run somewhat faster than this more general one, as they can be
2249 inlined into your code.
2250
2251 Use L</is_utf8_string_flags>, L</is_utf8_string_loc_flags>, and
2252 L</is_utf8_string_loclen_flags> to check entire strings.
2253
2254 =cut
2255 */
2256
2257 PERL_STATIC_INLINE STRLEN
2258 Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags)
2259 {
2260     PERL_ARGS_ASSERT_ISUTF8_CHAR_FLAGS;
2261     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
2262                           |UTF8_DISALLOW_PERL_EXTENDED)));
2263
2264     PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
2265                           goto check_success,
2266                           DFA_TEASE_APART_FF_,
2267                           DFA_RETURN_FAILURE_);
2268
2269   check_success:
2270
2271     return is_utf8_char_helper_(s0, e, flags);
2272
2273 #ifdef HAS_EXTRA_LONG_UTF8
2274
2275   tease_apart_FF:
2276
2277     /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
2278      * either malformed, or was for the largest possible start byte, which
2279      * indicates perl extended UTF-8, well above the Unicode maximum */
2280     if (   *s0 != I8_TO_NATIVE_UTF8(0xFF)
2281         || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
2282     {
2283         return 0;
2284     }
2285
2286     /* Otherwise examine the sequence not inline */
2287     return is_utf8_FF_helper_(s0, e,
2288                               FALSE /* require full, not partial char */
2289                              );
2290 #endif
2291
2292 }
2293
2294 /*
2295
2296 =for apidoc is_utf8_valid_partial_char
2297
2298 Returns 0 if the sequence of bytes starting at C<s> and looking no further than
2299 S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
2300 points.  Otherwise, it returns 1 if there exists at least one non-empty
2301 sequence of bytes that when appended to sequence C<s>, starting at position
2302 C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
2303 otherwise returns 0.
2304
2305 In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
2306 point.
2307
2308 This is useful when a fixed-length buffer is being tested for being well-formed
2309 UTF-8, but the final few bytes in it don't comprise a full character; that is,
2310 it is split somewhere in the middle of the final code point's UTF-8
2311 representation.  (Presumably when the buffer is refreshed with the next chunk
2312 of data, the new first bytes will complete the partial code point.)   This
2313 function is used to verify that the final bytes in the current buffer are in
2314 fact the legal beginning of some code point, so that if they aren't, the
2315 failure can be signalled without having to wait for the next read.
2316
2317 =cut
2318 */
2319 #define is_utf8_valid_partial_char(s, e)                                    \
2320                                 is_utf8_valid_partial_char_flags(s, e, 0)
2321
2322 /*
2323
2324 =for apidoc is_utf8_valid_partial_char_flags
2325
2326 Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
2327 or not the input is a valid UTF-8 encoded partial character, but it takes an
2328 extra parameter, C<flags>, which can further restrict which code points are
2329 considered valid.
2330
2331 If C<flags> is 0, this behaves identically to
2332 C<L</is_utf8_valid_partial_char>>.  Otherwise C<flags> can be any combination
2333 of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>.  If
2334 there is any sequence of bytes that can complete the input partial character in
2335 such a way that a non-prohibited character is formed, the function returns
2336 TRUE; otherwise FALSE.  Non character code points cannot be determined based on
2337 partial character input.  But many  of the other possible excluded types can be
2338 determined from just the first one or two bytes.
2339
2340 =cut
2341  */
2342
2343 PERL_STATIC_INLINE bool
2344 Perl_is_utf8_valid_partial_char_flags(const U8 * const s0, const U8 * const e, const U32 flags)
2345 {
2346     PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
2347     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
2348                           |UTF8_DISALLOW_PERL_EXTENDED)));
2349
2350     PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
2351                           DFA_RETURN_FAILURE_,
2352                           DFA_TEASE_APART_FF_,
2353                           NOOP);
2354
2355     /* The NOOP above causes the DFA to drop down here iff the input was a
2356      * partial character.  flags=0 => can return TRUE immediately; otherwise we
2357      * need to check (not inline) if the partial character is the beginning of
2358      * a disallowed one */
2359     if (flags == 0) {
2360         return TRUE;
2361     }
2362
2363     return cBOOL(is_utf8_char_helper_(s0, e, flags));
2364
2365 #ifdef HAS_EXTRA_LONG_UTF8
2366
2367   tease_apart_FF:
2368
2369     /* Getting here means the input is either malformed, or, in the case of
2370      * PL_extended_utf8_dfa_tab, was for the largest possible start byte.  The
2371      * latter case has to be extended UTF-8, so can fail immediately if that is
2372      * forbidden */
2373
2374     if (   *s0 != I8_TO_NATIVE_UTF8(0xFF)
2375         || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
2376     {
2377         return 0;
2378     }
2379
2380     return is_utf8_FF_helper_(s0, e,
2381                               TRUE /* Require to be a partial character */
2382                              );
2383 #endif
2384
2385 }
2386
2387 /*
2388
2389 =for apidoc is_utf8_fixed_width_buf_flags
2390
2391 Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
2392 is entirely valid UTF-8, subject to the restrictions given by C<flags>;
2393 otherwise it returns FALSE.
2394
2395 If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
2396 without restriction.  If the final few bytes of the buffer do not form a
2397 complete code point, this will return TRUE anyway, provided that
2398 C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
2399
2400 If C<flags> in non-zero, it can be any combination of the
2401 C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
2402 same meanings.
2403
2404 This function differs from C<L</is_utf8_string_flags>> only in that the latter
2405 returns FALSE if the final few bytes of the string don't form a complete code
2406 point.
2407
2408 =cut
2409  */
2410 #define is_utf8_fixed_width_buf_flags(s, len, flags)                        \
2411                 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
2412
2413 /*
2414
2415 =for apidoc is_utf8_fixed_width_buf_loc_flags
2416
2417 Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
2418 failure in the C<ep> pointer.  If the function returns TRUE, C<*ep> will point
2419 to the beginning of any partial character at the end of the buffer; if there is
2420 no partial character C<*ep> will contain C<s>+C<len>.
2421
2422 See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
2423
2424 =cut
2425 */
2426
2427 #define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags)               \
2428                 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
2429
2430 /*
2431
2432 =for apidoc is_utf8_fixed_width_buf_loclen_flags
2433
2434 Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
2435 complete, valid characters found in the C<el> pointer.
2436
2437 =cut
2438 */
2439
2440 PERL_STATIC_INLINE bool
2441 Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
2442                                        STRLEN len,
2443                                        const U8 **ep,
2444                                        STRLEN *el,
2445                                        const U32 flags)
2446 {
2447     const U8 * maybe_partial;
2448
2449     PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
2450
2451     if (! ep) {
2452         ep  = &maybe_partial;
2453     }
2454
2455     /* If it's entirely valid, return that; otherwise see if the only error is
2456      * that the final few bytes are for a partial character */
2457     return    is_utf8_string_loclen_flags(s, len, ep, el, flags)
2458            || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
2459 }
2460
2461 PERL_STATIC_INLINE UV
2462 Perl_utf8n_to_uvchr_msgs(const U8 *s,
2463                       STRLEN curlen,
2464                       STRLEN *retlen,
2465                       const U32 flags,
2466                       U32 * errors,
2467                       AV ** msgs)
2468 {
2469     /* This is the inlined portion of utf8n_to_uvchr_msgs.  It handles the
2470      * simple cases, and, if necessary calls a helper function to deal with the
2471      * more complex ones.  Almost all well-formed non-problematic code points
2472      * are considered simple, so that it's unlikely that the helper function
2473      * will need to be called.
2474      *
2475      * This is an adaptation of the tables and algorithm given in
2476      * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
2477      * comprehensive documentation of the original version.  A copyright notice
2478      * for the original version is given at the beginning of this file.  The
2479      * Perl adapation is documented at the definition of PL_strict_utf8_dfa_tab[].
2480      */
2481
2482     const U8 * const s0 = s;
2483     const U8 * send = s0 + curlen;
2484     UV type;
2485     UV uv;
2486
2487     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
2488
2489     /* This dfa is fast.  If it accepts the input, it was for a well-formed,
2490      * non-problematic code point, which can be returned immediately.
2491      * Otherwise we call a helper function to figure out the more complicated
2492      * cases. */
2493
2494     /* No calls from core pass in an empty string; non-core need a check */
2495 #ifdef PERL_CORE
2496     assert(curlen > 0);
2497 #else
2498     if (curlen == 0) return _utf8n_to_uvchr_msgs_helper(s0, 0, retlen,
2499                                                         flags, errors, msgs);
2500 #endif
2501
2502     type = PL_strict_utf8_dfa_tab[*s];
2503
2504     /* The table is structured so that 'type' is 0 iff the input byte is
2505      * represented identically regardless of the UTF-8ness of the string */
2506     if (type == 0) {   /* UTF-8 invariants are returned unchanged */
2507         uv = *s;
2508     }
2509     else {
2510         UV state = PL_strict_utf8_dfa_tab[256 + type];
2511         uv = (0xff >> type) & NATIVE_UTF8_TO_I8(*s);
2512
2513         while (++s < send) {
2514             type  = PL_strict_utf8_dfa_tab[*s];
2515             state = PL_strict_utf8_dfa_tab[256 + state + type];
2516
2517             uv = UTF8_ACCUMULATE(uv, *s);
2518
2519             if (state == 0) {
2520 #ifdef EBCDIC
2521                 uv = UNI_TO_NATIVE(uv);
2522 #endif
2523                 goto success;
2524             }
2525
2526             if (UNLIKELY(state == 1)) {
2527                 break;
2528             }
2529         }
2530
2531         /* Here is potentially problematic.  Use the full mechanism */
2532         return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags,
2533                                            errors, msgs);
2534     }
2535
2536   success:
2537     if (retlen) {
2538         *retlen = s - s0 + 1;
2539     }
2540     if (errors) {
2541         *errors = 0;
2542     }
2543     if (msgs) {
2544         *msgs = NULL;
2545     }
2546
2547     return uv;
2548 }
2549
2550 PERL_STATIC_INLINE UV
2551 Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
2552 {
2553     PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER;
2554
2555     assert(s < send);
2556
2557     if (! ckWARN_d(WARN_UTF8)) {
2558
2559         /* EMPTY is not really allowed, and asserts on debugging builds.  But
2560          * on non-debugging we have to deal with it, and this causes it to
2561          * return the REPLACEMENT CHARACTER, as the documentation indicates */
2562         return utf8n_to_uvchr(s, send - s, retlen,
2563                               (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY));
2564     }
2565     else {
2566         UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
2567         if (retlen && ret == 0 && (send <= s || *s != '\0')) {
2568             *retlen = (STRLEN) -1;
2569         }
2570
2571         return ret;
2572     }
2573 }
2574
2575 /* ------------------------------- perl.h ----------------------------- */
2576
2577 /*
2578 =for apidoc_section $utility
2579
2580 =for apidoc is_safe_syscall
2581
2582 Test that the given C<pv> (with length C<len>) doesn't contain any internal
2583 C<NUL> characters.
2584 If it does, set C<errno> to C<ENOENT>, optionally warn using the C<syscalls>
2585 category, and return FALSE.
2586
2587 Return TRUE if the name is safe.
2588
2589 C<what> and C<op_name> are used in any warning.
2590
2591 Used by the C<IS_SAFE_SYSCALL()> macro.
2592
2593 =cut
2594 */
2595
2596 PERL_STATIC_INLINE bool
2597 Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name)
2598 {
2599     /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
2600      * perl itself uses xce*() functions which accept 8-bit strings.
2601      */
2602
2603     PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
2604
2605     if (len > 1) {
2606         char *null_at;
2607         if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
2608                 SETERRNO(ENOENT, LIB_INVARG);
2609                 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
2610                                    "Invalid \\0 character in %s for %s: %s\\0%s",
2611                                    what, op_name, pv, null_at+1);
2612                 return FALSE;
2613         }
2614     }
2615
2616     return TRUE;
2617 }
2618
2619 /*
2620
2621 Return true if the supplied filename has a newline character
2622 immediately before the first (hopefully only) NUL.
2623
2624 My original look at this incorrectly used the len from SvPV(), but
2625 that's incorrect, since we allow for a NUL in pv[len-1].
2626
2627 So instead, strlen() and work from there.
2628
2629 This allow for the user reading a filename, forgetting to chomp it,
2630 then calling:
2631
2632   open my $foo, "$file\0";
2633
2634 */
2635
2636 #ifdef PERL_CORE
2637
2638 PERL_STATIC_INLINE bool
2639 S_should_warn_nl(const char *pv)
2640 {
2641     STRLEN len;
2642
2643     PERL_ARGS_ASSERT_SHOULD_WARN_NL;
2644
2645     len = strlen(pv);
2646
2647     return len > 0 && pv[len-1] == '\n';
2648 }
2649
2650 #endif
2651
2652 #if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C)
2653
2654 PERL_STATIC_INLINE bool
2655 S_lossless_NV_to_IV(const NV nv, IV *ivp)
2656 {
2657     /* This function determines if the input NV 'nv' may be converted without
2658      * loss of data to an IV.  If not, it returns FALSE taking no other action.
2659      * But if it is possible, it does the conversion, returning TRUE, and
2660      * storing the converted result in '*ivp' */
2661
2662     PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV;
2663
2664 #  if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan)
2665     /* Normally any comparison with a NaN returns false; if we can't rely
2666      * on that behaviour, check explicitly */
2667     if (UNLIKELY(Perl_isnan(nv))) {
2668         return FALSE;
2669     }
2670 #  endif
2671
2672     /* Written this way so that with an always-false NaN comparison we
2673      * return false */
2674     if (!(LIKELY(nv >= (NV) IV_MIN) && LIKELY(nv < IV_MAX_P1))) {
2675         return FALSE;
2676     }
2677
2678     if ((IV) nv != nv) {
2679         return FALSE;
2680     }
2681
2682     *ivp = (IV) nv;
2683     return TRUE;
2684 }
2685
2686 #endif
2687
2688 /* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
2689
2690 #if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C)
2691
2692 #define MAX_CHARSET_NAME_LENGTH 2
2693
2694 PERL_STATIC_INLINE const char *
2695 S_get_regex_charset_name(const U32 flags, STRLEN* const lenp)
2696 {
2697     PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME;
2698
2699     /* Returns a string that corresponds to the name of the regex character set
2700      * given by 'flags', and *lenp is set the length of that string, which
2701      * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
2702
2703     *lenp = 1;
2704     switch (get_regex_charset(flags)) {
2705         case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
2706         case REGEX_LOCALE_CHARSET:  return LOCALE_PAT_MODS;
2707         case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
2708         case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
2709         case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
2710             *lenp = 2;
2711             return ASCII_MORE_RESTRICT_PAT_MODS;
2712     }
2713     /* The NOT_REACHED; hides an assert() which has a rather complex
2714      * definition in perl.h. */
2715     NOT_REACHED; /* NOTREACHED */
2716     return "?";     /* Unknown */
2717 }
2718
2719 #endif
2720
2721 /*
2722
2723 Return false if any get magic is on the SV other than taint magic.
2724
2725 */
2726
2727 PERL_STATIC_INLINE bool
2728 Perl_sv_only_taint_gmagic(SV *sv)
2729 {
2730     MAGIC *mg = SvMAGIC(sv);
2731
2732     PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
2733
2734     while (mg) {
2735         if (mg->mg_type != PERL_MAGIC_taint
2736             && !(mg->mg_flags & MGf_GSKIP)
2737             && mg->mg_virtual->svt_get) {
2738             return FALSE;
2739         }
2740         mg = mg->mg_moremagic;
2741     }
2742
2743     return TRUE;
2744 }
2745
2746 /* ------------------ cop.h ------------------------------------------- */
2747
2748 /* implement GIMME_V() macro */
2749
2750 PERL_STATIC_INLINE U8
2751 Perl_gimme_V(pTHX)
2752 {
2753     I32 cxix;
2754     U8  gimme = (PL_op->op_flags & OPf_WANT);
2755
2756     if (gimme)
2757         return gimme;
2758     cxix = PL_curstackinfo->si_cxsubix;
2759     if (cxix < 0)
2760         return PL_curstackinfo->si_type == PERLSI_SORT ? G_SCALAR: G_VOID;
2761     assert(cxstack[cxix].blk_gimme & G_WANT);
2762     return (cxstack[cxix].blk_gimme & G_WANT);
2763 }
2764
2765
2766 /* Enter a block. Push a new base context and return its address. */
2767
2768 PERL_STATIC_INLINE PERL_CONTEXT *
2769 Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
2770 {
2771     PERL_CONTEXT * cx;
2772
2773     PERL_ARGS_ASSERT_CX_PUSHBLOCK;
2774
2775     CXINC;
2776     cx = CX_CUR();
2777     cx->cx_type        = type;
2778     cx->blk_gimme      = gimme;
2779     cx->blk_oldsaveix  = saveix;
2780     cx->blk_oldsp      = (I32)(sp - PL_stack_base);
2781     cx->blk_oldcop     = PL_curcop;
2782     cx->blk_oldmarksp  = (I32)(PL_markstack_ptr - PL_markstack);
2783     cx->blk_oldscopesp = PL_scopestack_ix;
2784     cx->blk_oldpm      = PL_curpm;
2785     cx->blk_old_tmpsfloor = PL_tmps_floor;
2786
2787     PL_tmps_floor        = PL_tmps_ix;
2788     CX_DEBUG(cx, "PUSH");
2789     return cx;
2790 }
2791
2792
2793 /* Exit a block (RETURN and LAST). */
2794
2795 PERL_STATIC_INLINE void
2796 Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx)
2797 {
2798     PERL_ARGS_ASSERT_CX_POPBLOCK;
2799
2800     CX_DEBUG(cx, "POP");
2801     /* these 3 are common to cx_popblock and cx_topblock */
2802     PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2803     PL_scopestack_ix = cx->blk_oldscopesp;
2804     PL_curpm         = cx->blk_oldpm;
2805
2806     /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
2807      * and leaves a CX entry lying around for repeated use, so
2808      * skip for multicall */                  \
2809     assert(   (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
2810             || PL_savestack_ix == cx->blk_oldsaveix);
2811     PL_curcop     = cx->blk_oldcop;
2812     PL_tmps_floor = cx->blk_old_tmpsfloor;
2813 }
2814
2815 /* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
2816  * Whereas cx_popblock() restores the state to the point just before
2817  * cx_pushblock() was called,  cx_topblock() restores it to the point just
2818  * *after* cx_pushblock() was called. */
2819
2820 PERL_STATIC_INLINE void
2821 Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx)
2822 {
2823     PERL_ARGS_ASSERT_CX_TOPBLOCK;
2824
2825     CX_DEBUG(cx, "TOP");
2826     /* these 3 are common to cx_popblock and cx_topblock */
2827     PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2828     PL_scopestack_ix = cx->blk_oldscopesp;
2829     PL_curpm         = cx->blk_oldpm;
2830
2831     PL_stack_sp      = PL_stack_base + cx->blk_oldsp;
2832 }
2833
2834
2835 PERL_STATIC_INLINE void
2836 Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
2837 {
2838     U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
2839
2840     PERL_ARGS_ASSERT_CX_PUSHSUB;
2841
2842     PERL_DTRACE_PROBE_ENTRY(cv);
2843     cx->blk_sub.old_cxsubix     = PL_curstackinfo->si_cxsubix;
2844     PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
2845     cx->blk_sub.cv = cv;
2846     cx->blk_sub.olddepth = CvDEPTH(cv);
2847     cx->blk_sub.prevcomppad = PL_comppad;
2848     cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
2849     cx->blk_sub.retop = retop;
2850     SvREFCNT_inc_simple_void_NN(cv);
2851     cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
2852 }
2853
2854
2855 /* subsets of cx_popsub() */
2856
2857 PERL_STATIC_INLINE void
2858 Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
2859 {
2860     CV *cv;
2861
2862     PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
2863     assert(CxTYPE(cx) == CXt_SUB);
2864
2865     PL_comppad = cx->blk_sub.prevcomppad;
2866     PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2867     cv = cx->blk_sub.cv;
2868     CvDEPTH(cv) = cx->blk_sub.olddepth;
2869     cx->blk_sub.cv = NULL;
2870     SvREFCNT_dec(cv);
2871     PL_curstackinfo->si_cxsubix = cx->blk_sub.old_cxsubix;
2872 }
2873
2874
2875 /* handle the @_ part of leaving a sub */
2876
2877 PERL_STATIC_INLINE void
2878 Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
2879 {
2880     AV *av;
2881
2882     PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
2883     assert(CxTYPE(cx) == CXt_SUB);
2884     assert(AvARRAY(MUTABLE_AV(
2885         PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
2886                 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
2887
2888     CX_POP_SAVEARRAY(cx);
2889     av = MUTABLE_AV(PAD_SVl(0));
2890     if (UNLIKELY(AvREAL(av)))
2891         /* abandon @_ if it got reified */
2892         clear_defarray(av, 0);
2893     else {
2894         CLEAR_ARGARRAY(av);
2895     }
2896 }
2897
2898
2899 PERL_STATIC_INLINE void
2900 Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx)
2901 {
2902     PERL_ARGS_ASSERT_CX_POPSUB;
2903     assert(CxTYPE(cx) == CXt_SUB);
2904
2905     PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
2906
2907     if (CxHASARGS(cx))
2908         cx_popsub_args(cx);
2909     cx_popsub_common(cx);
2910 }
2911
2912
2913 PERL_STATIC_INLINE void
2914 Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
2915 {
2916     PERL_ARGS_ASSERT_CX_PUSHFORMAT;
2917
2918     cx->blk_format.old_cxsubix = PL_curstackinfo->si_cxsubix;
2919     PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
2920     cx->blk_format.cv          = cv;
2921     cx->blk_format.retop       = retop;
2922     cx->blk_format.gv          = gv;
2923     cx->blk_format.dfoutgv     = PL_defoutgv;
2924     cx->blk_format.prevcomppad = PL_comppad;
2925     cx->blk_u16                = 0;
2926
2927     SvREFCNT_inc_simple_void_NN(cv);
2928     CvDEPTH(cv)++;
2929     SvREFCNT_inc_void(cx->blk_format.dfoutgv);
2930 }
2931
2932
2933 PERL_STATIC_INLINE void
2934 Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx)
2935 {
2936     CV *cv;
2937     GV *dfout;
2938
2939     PERL_ARGS_ASSERT_CX_POPFORMAT;
2940     assert(CxTYPE(cx) == CXt_FORMAT);
2941
2942     dfout = cx->blk_format.dfoutgv;
2943     setdefout(dfout);
2944     cx->blk_format.dfoutgv = NULL;
2945     SvREFCNT_dec_NN(dfout);
2946
2947     PL_comppad = cx->blk_format.prevcomppad;
2948     PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2949     cv = cx->blk_format.cv;
2950     cx->blk_format.cv = NULL;
2951     --CvDEPTH(cv);
2952     SvREFCNT_dec_NN(cv);
2953     PL_curstackinfo->si_cxsubix = cx->blk_format.old_cxsubix;
2954 }
2955
2956
2957 PERL_STATIC_INLINE void
2958 Perl_push_evalortry_common(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
2959 {
2960     cx->blk_eval.retop         = retop;
2961     cx->blk_eval.old_namesv    = namesv;
2962     cx->blk_eval.old_eval_root = PL_eval_root;
2963     cx->blk_eval.cur_text      = PL_parser ? PL_parser->linestr : NULL;
2964     cx->blk_eval.cv            = NULL; /* later set by doeval_compile() */
2965     cx->blk_eval.cur_top_env   = PL_top_env;
2966
2967     assert(!(PL_in_eval     & ~ 0x3F));
2968     assert(!(PL_op->op_type & ~0x1FF));
2969     cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
2970 }
2971
2972 PERL_STATIC_INLINE void
2973 Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
2974 {
2975     PERL_ARGS_ASSERT_CX_PUSHEVAL;
2976
2977     Perl_push_evalortry_common(aTHX_ cx, retop, namesv);
2978
2979     cx->blk_eval.old_cxsubix    = PL_curstackinfo->si_cxsubix;
2980     PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
2981 }
2982
2983 PERL_STATIC_INLINE void
2984 Perl_cx_pushtry(pTHX_ PERL_CONTEXT *cx, OP *retop)
2985 {
2986     PERL_ARGS_ASSERT_CX_PUSHTRY;
2987
2988     Perl_push_evalortry_common(aTHX_ cx, retop, NULL);
2989
2990     /* Don't actually change it, just store the current value so it's restored
2991      * by the common popeval */
2992     cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
2993 }
2994
2995
2996 PERL_STATIC_INLINE void
2997 Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx)
2998 {
2999     SV *sv;
3000
3001     PERL_ARGS_ASSERT_CX_POPEVAL;
3002     assert(CxTYPE(cx) == CXt_EVAL);
3003
3004     PL_in_eval = CxOLD_IN_EVAL(cx);
3005     assert(!(PL_in_eval & 0xc0));
3006     PL_eval_root = cx->blk_eval.old_eval_root;
3007     sv = cx->blk_eval.cur_text;
3008     if (sv && CxEVAL_TXT_REFCNTED(cx)) {
3009         cx->blk_eval.cur_text = NULL;
3010         SvREFCNT_dec_NN(sv);
3011     }
3012
3013     sv = cx->blk_eval.old_namesv;
3014     if (sv) {
3015         cx->blk_eval.old_namesv = NULL;
3016         SvREFCNT_dec_NN(sv);
3017     }
3018     PL_curstackinfo->si_cxsubix = cx->blk_eval.old_cxsubix;
3019 }
3020
3021
3022 /* push a plain loop, i.e.
3023  *     { block }
3024  *     while (cond) { block }
3025  *     for (init;cond;continue) { block }
3026  * This loop can be last/redo'ed etc.
3027  */
3028
3029 PERL_STATIC_INLINE void
3030 Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
3031 {
3032     PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
3033     cx->blk_loop.my_op = cLOOP;
3034 }
3035
3036
3037 /* push a true for loop, i.e.
3038  *     for var (list) { block }
3039  */
3040
3041 PERL_STATIC_INLINE void
3042 Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
3043 {
3044     PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
3045
3046     /* this one line is common with cx_pushloop_plain */
3047     cx->blk_loop.my_op = cLOOP;
3048
3049     cx->blk_loop.itervar_u.svp = (SV**)itervarp;
3050     cx->blk_loop.itersave      = itersave;
3051 #ifdef USE_ITHREADS
3052     cx->blk_loop.oldcomppad = PL_comppad;
3053 #endif
3054 }
3055
3056
3057 /* pop all loop types, including plain */
3058
3059 PERL_STATIC_INLINE void
3060 Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx)
3061 {
3062     PERL_ARGS_ASSERT_CX_POPLOOP;
3063
3064     assert(CxTYPE_is_LOOP(cx));
3065     if (  CxTYPE(cx) == CXt_LOOP_ARY
3066        || CxTYPE(cx) == CXt_LOOP_LAZYSV)
3067     {
3068         /* Free ary or cur. This assumes that state_u.ary.ary
3069          * aligns with state_u.lazysv.cur. See cx_dup() */
3070         SV *sv = cx->blk_loop.state_u.lazysv.cur;
3071         cx->blk_loop.state_u.lazysv.cur = NULL;
3072         SvREFCNT_dec_NN(sv);
3073         if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
3074             sv = cx->blk_loop.state_u.lazysv.end;
3075             cx->blk_loop.state_u.lazysv.end = NULL;
3076             SvREFCNT_dec_NN(sv);
3077         }
3078     }
3079     if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
3080         SV *cursv;
3081         SV **svp = (cx)->blk_loop.itervar_u.svp;
3082         if ((cx->cx_type & CXp_FOR_GV))
3083             svp = &GvSV((GV*)svp);
3084         cursv = *svp;
3085         *svp = cx->blk_loop.itersave;
3086         cx->blk_loop.itersave = NULL;
3087         SvREFCNT_dec(cursv);
3088     }
3089 }
3090
3091
3092 PERL_STATIC_INLINE void
3093 Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
3094 {
3095     PERL_ARGS_ASSERT_CX_PUSHWHEN;
3096
3097     cx->blk_givwhen.leave_op = cLOGOP->op_other;
3098 }
3099
3100
3101 PERL_STATIC_INLINE void
3102 Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
3103 {
3104     PERL_ARGS_ASSERT_CX_POPWHEN;
3105     assert(CxTYPE(cx) == CXt_WHEN);
3106
3107     PERL_UNUSED_ARG(cx);
3108     PERL_UNUSED_CONTEXT;
3109     /* currently NOOP */
3110 }
3111
3112
3113 PERL_STATIC_INLINE void
3114 Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
3115 {
3116     PERL_ARGS_ASSERT_CX_PUSHGIVEN;
3117
3118     cx->blk_givwhen.leave_op = cLOGOP->op_other;
3119     cx->blk_givwhen.defsv_save = orig_defsv;
3120 }
3121
3122
3123 PERL_STATIC_INLINE void
3124 Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
3125 {
3126     SV *sv;
3127
3128     PERL_ARGS_ASSERT_CX_POPGIVEN;
3129     assert(CxTYPE(cx) == CXt_GIVEN);
3130
3131     sv = GvSV(PL_defgv);
3132     GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
3133     cx->blk_givwhen.defsv_save = NULL;
3134     SvREFCNT_dec(sv);
3135 }
3136
3137 /* ------------------ util.h ------------------------------------------- */
3138
3139 /*
3140 =for apidoc_section $string
3141
3142 =for apidoc foldEQ
3143
3144 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
3145 same
3146 case-insensitively; false otherwise.  Uppercase and lowercase ASCII range bytes
3147 match themselves and their opposite case counterparts.  Non-cased and non-ASCII
3148 range bytes match only themselves.
3149
3150 =cut
3151 */
3152
3153 PERL_STATIC_INLINE I32
3154 Perl_foldEQ(const char *s1, const char *s2, I32 len)
3155 {
3156     const U8 *a = (const U8 *)s1;
3157     const U8 *b = (const U8 *)s2;
3158
3159     PERL_ARGS_ASSERT_FOLDEQ;
3160
3161     assert(len >= 0);
3162
3163     while (len--) {
3164         if (*a != *b && *a != PL_fold[*b])
3165             return 0;
3166         a++,b++;
3167     }
3168     return 1;
3169 }
3170
3171 PERL_STATIC_INLINE I32
3172 Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len)
3173 {
3174     /* Compare non-UTF-8 using Unicode (Latin1) semantics.  Works on all folds
3175      * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and
3176      * does not check for this.  Nor does it check that the strings each have
3177      * at least 'len' characters. */
3178
3179     const U8 *a = (const U8 *)s1;
3180     const U8 *b = (const U8 *)s2;
3181
3182     PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
3183
3184     assert(len >= 0);
3185
3186     while (len--) {
3187         if (*a != *b && *a != PL_fold_latin1[*b]) {
3188             return 0;
3189         }
3190         a++, b++;
3191     }
3192     return 1;
3193 }
3194
3195 /*
3196 =for apidoc_section $locale
3197 =for apidoc foldEQ_locale
3198
3199 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
3200 same case-insensitively in the current locale; false otherwise.
3201
3202 =cut
3203 */
3204
3205 PERL_STATIC_INLINE I32
3206 Perl_foldEQ_locale(const char *s1, const char *s2, I32 len)
3207 {
3208     const U8 *a = (const U8 *)s1;
3209     const U8 *b = (const U8 *)s2;
3210
3211     PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
3212
3213     assert(len >= 0);
3214
3215     while (len--) {
3216         if (*a != *b && *a != PL_fold_locale[*b])
3217             return 0;
3218         a++,b++;
3219     }
3220     return 1;
3221 }
3222
3223 /*
3224 =for apidoc_section $string
3225 =for apidoc my_strnlen
3226
3227 The C library C<strnlen> if available, or a Perl implementation of it.
3228
3229 C<my_strnlen()> computes the length of the string, up to C<maxlen>
3230 characters.  It will never attempt to address more than C<maxlen>
3231 characters, making it suitable for use with strings that are not
3232 guaranteed to be NUL-terminated.
3233
3234 =cut
3235
3236 Description stolen from http://man.openbsd.org/strnlen.3,
3237 implementation stolen from PostgreSQL.
3238 */
3239 #ifndef HAS_STRNLEN
3240
3241 PERL_STATIC_INLINE Size_t
3242 Perl_my_strnlen(const char *str, Size_t maxlen)
3243 {
3244     const char *end = (char *) memchr(str, '\0', maxlen);
3245
3246     PERL_ARGS_ASSERT_MY_STRNLEN;
3247
3248     if (end == NULL) return maxlen;
3249     return end - str;
3250 }
3251
3252 #endif
3253
3254 #if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
3255
3256 PERL_STATIC_INLINE void *
3257 S_my_memrchr(const char * s, const char c, const STRLEN len)
3258 {
3259     /* memrchr(), since many platforms lack it */
3260
3261     const char * t = s + len - 1;
3262
3263     PERL_ARGS_ASSERT_MY_MEMRCHR;
3264
3265     while (t >= s) {
3266         if (*t == c) {
3267             return (void *) t;
3268         }
3269         t--;
3270     }
3271
3272     return NULL;
3273 }
3274
3275 #endif
3276
3277 PERL_STATIC_INLINE char *
3278 Perl_mortal_getenv(const char * str)
3279 {
3280     /* This implements a (mostly) thread-safe, sequential-call-safe getenv().
3281      *
3282      * It's (mostly) thread-safe because it uses a mutex to prevent other
3283      * threads (that look at this mutex) from destroying the result before this
3284      * routine has a chance to copy the result to a place that won't be
3285      * destroyed before the caller gets a chance to handle it.  That place is a
3286      * mortal SV.  khw chose this over SAVEFREEPV because he is under the
3287      * impression that the SV will hang around longer under more circumstances
3288      *
3289      * The reason it isn't completely thread-safe is that other code could
3290      * simply not pay attention to the mutex.  All of the Perl core uses the
3291      * mutex, but it is possible for code from, say XS, to not use this mutex,
3292      * defeating the safety.
3293      *
3294      * getenv() returns, in some implementations, a pointer to a spot in the
3295      * **environ array, which could be invalidated at any time by this or
3296      * another thread changing the environment.  Other implementations copy the
3297      * **environ value to a static buffer, returning a pointer to that.  That
3298      * buffer might or might not be invalidated by a getenv() call in another
3299      * thread.  If it does get zapped, we need an exclusive lock.  Otherwise,
3300      * many getenv() calls can safely be running simultaneously, so a
3301      * many-reader (but no simultaneous writers) lock is ok.  There is a
3302      * Configure probe to see if another thread destroys the buffer, and the
3303      * mutex is defined accordingly.
3304      *
3305      * But in all cases, using the mutex prevents these problems, as long as
3306      * all code uses the same mutex.
3307      *
3308      * A complication is that this can be called during phases where the
3309      * mortalization process isn't available.  These are in interpreter
3310      * destruction or early in construction.  khw believes that at these times
3311      * there shouldn't be anything else going on, so plain getenv is safe AS
3312      * LONG AS the caller acts on the return before calling it again. */
3313
3314     char * ret;
3315     dTHX;
3316
3317     PERL_ARGS_ASSERT_MORTAL_GETENV;
3318
3319     /* Can't mortalize without stacks.  khw believes that no other threads
3320      * should be running, so no need to lock things, and this may be during a
3321      * phase when locking isn't even available */
3322     if (UNLIKELY(PL_scopestack_ix == 0)) {
3323         return getenv(str);
3324     }
3325
3326 #ifdef PERL_MEM_LOG
3327
3328     /* A major complication arises under PERL_MEM_LOG.  When that is active,
3329      * every memory allocation may result in logging, depending on the value of
3330      * ENV{PERL_MEM_LOG} at the moment.  That means, as we create the SV for
3331      * saving ENV{foo}'s value (but before saving it), the logging code will
3332      * call us recursively to find out what ENV{PERL_MEM_LOG} is.  Without some
3333      * care that could lead to: 1) infinite recursion; or 2) deadlock (trying to
3334      * lock a boolean mutex recursively); 3) destroying the getenv() static
3335      * buffer; or 4) destroying the temporary created by this for the copy
3336      * causes a log entry to be made which could cause a new temporary to be
3337      * created, which will need to be destroyed at some point, leading to an
3338      * infinite loop.
3339      *
3340      * The solution adopted here (after some gnashing of teeth) is to detect
3341      * the recursive calls and calls from the logger, and treat them specially.
3342      * Let's say we want to do getenv("foo").  We first find
3343      * getenv(PERL_MEM_LOG) and save it to a fixed-length per-interpreter
3344      * variable, so no temporary is required.  Then we do getenv(foo}, and in
3345      * the process of creating a temporary to save it, this function will be
3346      * called recursively to do a getenv(PERL_MEM_LOG).  On the recursed call,
3347      * we detect that it is such a call and return our saved value instead of
3348      * locking and doing a new getenv().  This solves all of problems 1), 2),
3349      * and 3).  Because all the getenv()s are done while the mutex is locked,
3350      * the state cannot have changed.  To solve 4), we don't create a temporary
3351      * when this is called from the logging code.  That code disposes of the
3352      * return value while the mutex is still locked.
3353      *
3354      * The value of getenv(PERL_MEM_LOG) can be anything, but only initial
3355      * digits and 3 particular letters are significant; the rest are ignored by
3356      * the memory logging code.  Thus the per-interpreter variable only needs
3357      * to be large enough to save the significant information, the size of
3358      * which is known at compile time.  The first byte is extra, reserved for
3359      * flags for our use.  To protect against overflowing, only the reserved
3360      * byte, as many digits as don't overflow, and the three letters are
3361      * stored.
3362      *
3363      * The reserved byte has two bits:
3364      *      0x1 if set indicates that if we get here, it is a recursive call of
3365      *          getenv()
3366      *      0x2 if set indicates that the call is from the logging code.
3367      *
3368      * If the flag indicates this is a recursive call, just return the stored
3369      * value of PL_mem_log;  An empty value gets turned into NULL. */
3370     if (strEQ(str, "PERL_MEM_LOG") && PL_mem_log[0] & 0x1) {
3371         if (PL_mem_log[1] == '\0') {
3372             return NULL;
3373         } else {
3374             return PL_mem_log + 1;
3375         }
3376     }
3377
3378 #endif
3379
3380     GETENV_LOCK;
3381
3382 #ifdef PERL_MEM_LOG
3383
3384     /* Here we are in a critical section.  As explained above, we do our own
3385      * getenv(PERL_MEM_LOG), saving the result safely. */
3386     ret = getenv("PERL_MEM_LOG");
3387     if (ret == NULL) {  /* No logging active */
3388
3389         /* Return that immediately if called from the logging code */
3390         if (PL_mem_log[0] & 0x2) {
3391             GETENV_UNLOCK;
3392             return NULL;
3393         }
3394
3395         PL_mem_log[1] = '\0';
3396     }
3397     else {
3398         char *mem_log_meat = PL_mem_log + 1;    /* first byte reserved */
3399
3400         /* There is nothing to prevent the value of PERL_MEM_LOG from being an
3401          * extremely long string.  But we want only a few characters from it.
3402          * PL_mem_log has been made large enough to hold just the ones we need.
3403          * First the file descriptor. */
3404         if (isDIGIT(*ret)) {
3405             const char * s = ret;
3406             if (UNLIKELY(*s == '0')) {
3407
3408                 /* Reduce multiple leading zeros to a single one.  This is to
3409                  * allow the caller to change what to do with leading zeros. */
3410                 *mem_log_meat++ = '0';
3411                 s++;
3412                 while (*s == '0') {
3413                     s++;
3414                 }
3415             }
3416
3417             /* If the input overflows, copy just enough for the result to also
3418              * overflow, plus 1 to make sure */
3419             while (isDIGIT(*s) && s < ret + TYPE_DIGITS(UV) + 1) {
3420                 *mem_log_meat++ = *s++;
3421             }
3422         }
3423
3424         /* Then each of the three significant characters */
3425         if (strchr(ret, 'm')) {
3426             *mem_log_meat++ = 'm';
3427         }
3428         if (strchr(ret, 's')) {
3429             *mem_log_meat++ = 's';
3430         }
3431         if (strchr(ret, 't')) {
3432             *mem_log_meat++ = 't';
3433         }
3434         *mem_log_meat = '\0';
3435
3436         assert(mem_log_meat < PL_mem_log + sizeof(PL_mem_log));
3437     }
3438
3439     /* If we are being called from the logger, it only needs the significant
3440      * portion of PERL_MEM_LOG, and doesn't need a safe copy */
3441     if (PL_mem_log[0] & 0x2) {
3442         assert(strEQ(str, "PERL_MEM_LOG"));
3443         GETENV_UNLOCK;
3444         return PL_mem_log + 1;
3445     }
3446
3447     /* Here is a generic getenv().  This could be a getenv("PERL_MEM_LOG") that
3448      * is coming from other than the logging code, so it should be treated the
3449      * same as any other getenv(), returning the full value, not just the
3450      * significant part, and having its value saved.  Set the flag that
3451      * indicates any call to this routine will be a recursion from here */
3452     PL_mem_log[0] = 0x1;
3453
3454 #endif
3455
3456     /* Now get the value of the real desired variable, and save a copy */
3457     ret = getenv(str);
3458
3459     if (ret != NULL) {
3460         ret = SvPVX( newSVpvn_flags(ret, strlen(ret) ,SVs_TEMP) );
3461     }
3462
3463     GETENV_UNLOCK;
3464
3465 #ifdef PERL_MEM_LOG
3466
3467     /* Clear the buffer */
3468     Zero(PL_mem_log, sizeof(PL_mem_log), char);
3469
3470 #endif
3471
3472     return ret;
3473 }
3474
3475 PERL_STATIC_INLINE bool
3476 Perl_sv_isbool(pTHX_ const SV *sv)
3477 {
3478     return SvBoolFlagsOK(sv) && BOOL_INTERNALS_sv_isbool(sv);
3479 }
3480
3481 #ifdef USE_ITHREADS
3482
3483 PERL_STATIC_INLINE AV *
3484 Perl_cop_file_avn(pTHX_ const COP *cop) {
3485
3486     PERL_ARGS_ASSERT_COP_FILE_AVN;
3487
3488     const char *file = CopFILE(cop);
3489     if (file) {
3490         GV *gv = gv_fetchfile_flags(file, strlen(file), GVF_NOADD);
3491         if (gv) {
3492             return GvAVn(gv);
3493         }
3494         else
3495             return NULL;
3496      }
3497      else
3498          return NULL;
3499 }
3500
3501 #endif
3502
3503 /*
3504  * ex: set ts=8 sts=4 sw=4 et:
3505  */