inline.h

   1 /*    inline.h
   2  *
   3  *    Copyright (C) 2012 by Larry Wall and others
   4  *
   5  *    You may distribute under the terms of either the GNU General Public
   6  *    License or the Artistic License, as specified in the README file.
   7  *
   8  *    This file contains tables and code adapted from
   9  *    https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this
  10  *    copyright notice:
  11
  12 Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
  13
  14 Permission is hereby granted, free of charge, to any person obtaining a copy of
  15 this software and associated documentation files (the "Software"), to deal in
  16 the Software without restriction, including without limitation the rights to
  17 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  18 of the Software, and to permit persons to whom the Software is furnished to do
  19 so, subject to the following conditions:
  20
  21 The above copyright notice and this permission notice shall be included in all
  22 copies or substantial portions of the Software.
  23
  24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  25 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  26 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  27 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  28 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  29 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  30 SOFTWARE.
  31
  32  *
  33  * This file is a home for static inline functions that cannot go in other
  34  * header files, because they depend on proto.h (included after most other
  35  * headers) or struct definitions.
  36  *
  37  * Each section names the header file that the functions "belong" to.
  38  */
  39
  40 /* ------------------------------- av.h ------------------------------- */
  41
  42 /*
  43 =for apidoc_section $AV
  44 =for apidoc av_count
  45 Returns the number of elements in the array C<av>.  This is the true length of
  46 the array, including any undefined elements.  It is always the same as
  47 S<C<av_top_index(av) + 1>>.
  48
  49 =cut
  50 */
  51 PERL_STATIC_INLINE Size_t
  52 Perl_av_count(pTHX_ AV *av)
  53 {
  54     PERL_ARGS_ASSERT_AV_COUNT;
  55     assert(SvTYPE(av) == SVt_PVAV);
  56
  57     return AvFILL(av) + 1;
  58 }
  59
  60 /* ------------------------------- av.c ------------------------------- */
  61
  62 /*
  63 =for apidoc av_store_simple
  64
  65 This is a cut-down version of av_store that assumes that the array is
  66 very straightforward - no magic, not readonly, and AvREAL - and that
  67 C<key> is not negative. This function MUST NOT be used in situations
  68 where any of those assumptions may not hold.
  69
  70 Stores an SV in an array.  The array index is specified as C<key>. It
  71 can be dereferenced to get the C<SV*> that was stored there (= C<val>)).
  72
  73 Note that the caller is responsible for suitably incrementing the reference
  74 count of C<val> before the call.
  75
  76 Approximate Perl equivalent: C<splice(@myarray, $key, 1, $val)>.
  77
  78 =cut
  79 */
  80
  81 PERL_STATIC_INLINE SV**
  82 Perl_av_store_simple(pTHX_ AV *av, SSize_t key, SV *val)
  83 {
  84     SV** ary;
  85
  86     PERL_ARGS_ASSERT_AV_STORE_SIMPLE;
  87     assert(SvTYPE(av) == SVt_PVAV);
  88     assert(!SvMAGICAL(av));
  89     assert(!SvREADONLY(av));
  90     assert(AvREAL(av));
  91     assert(key > -1);
  92
  93     ary = AvARRAY(av);
  94
  95     if (AvFILLp(av) < key) {
  96         if (key > AvMAX(av)) {
  97             av_extend(av,key);
  98             ary = AvARRAY(av);
  99         }
 100         AvFILLp(av) = key;
 101     } else
 102         SvREFCNT_dec(ary[key]);
 103
 104     ary[key] = val;
 105     return &ary[key];
 106 }
 107
 108 /*
 109 =for apidoc av_fetch_simple
 110
 111 This is a cut-down version of av_fetch that assumes that the array is
 112 very straightforward - no magic, not readonly, and AvREAL - and that
 113 C<key> is not negative. This function MUST NOT be used in situations
 114 where any of those assumptions may not hold.
 115
 116 Returns the SV at the specified index in the array.  The C<key> is the
 117 index.  If lval is true, you are guaranteed to get a real SV back (in case
 118 it wasn't real before), which you can then modify.  Check that the return
 119 value is non-null before dereferencing it to a C<SV*>.
 120
 121 The rough perl equivalent is C<$myarray[$key]>.
 122
 123 =cut
 124 */
 125
 126 PERL_STATIC_INLINE SV**
 127 Perl_av_fetch_simple(pTHX_ AV *av, SSize_t key, I32 lval)
 128 {
 129     PERL_ARGS_ASSERT_AV_FETCH_SIMPLE;
 130     assert(SvTYPE(av) == SVt_PVAV);
 131     assert(!SvMAGICAL(av));
 132     assert(!SvREADONLY(av));
 133     assert(AvREAL(av));
 134     assert(key > -1);
 135
 136     if ( (key > AvFILLp(av)) || !AvARRAY(av)[key]) {
 137         return lval ? av_store_simple(av,key,newSV(0)) : NULL;
 138     } else {
 139         return &AvARRAY(av)[key];
 140     }
 141 }
 142
 143 /* ------------------------------- cv.h ------------------------------- */
 144
 145 /*
 146 =for apidoc_section $CV
 147 =for apidoc CvGV
 148 Returns the GV associated with the CV C<sv>, reifying it if necessary.
 149
 150 =cut
 151 */
 152 PERL_STATIC_INLINE GV *
 153 Perl_CvGV(pTHX_ CV *sv)
 154 {
 155     PERL_ARGS_ASSERT_CVGV;
 156
 157     return CvNAMED(sv)
 158         ? Perl_cvgv_from_hek(aTHX_ sv)
 159         : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
 160 }
 161
 162 PERL_STATIC_INLINE I32 *
 163 Perl_CvDEPTH(const CV * const sv)
 164 {
 165     PERL_ARGS_ASSERT_CVDEPTH;
 166     assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
 167
 168     return &((XPVCV*)SvANY(sv))->xcv_depth;
 169 }
 170
 171 /*
 172  CvPROTO returns the prototype as stored, which is not necessarily what
 173  the interpreter should be using. Specifically, the interpreter assumes
 174  that spaces have been stripped, which has been the case if the prototype
 175  was added by toke.c, but is generally not the case if it was added elsewhere.
 176  Since we can't enforce the spacelessness at assignment time, this routine
 177  provides a temporary copy at parse time with spaces removed.
 178  I<orig> is the start of the original buffer, I<len> is the length of the
 179  prototype and will be updated when this returns.
 180  */
 181
 182 #ifdef PERL_CORE
 183 PERL_STATIC_INLINE char *
 184 S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
 185 {
 186     SV * tmpsv;
 187     char * tmps;
 188     tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
 189     tmps = SvPVX(tmpsv);
 190     while ((*len)--) {
 191         if (!isSPACE(*orig))
 192             *tmps++ = *orig;
 193         orig++;
 194     }
 195     *tmps = '\0';
 196     *len = tmps - SvPVX(tmpsv);
 197                 return SvPVX(tmpsv);
 198 }
 199 #endif
 200
 201 /* ------------------------------- mg.h ------------------------------- */
 202
 203 #if defined(PERL_CORE) || defined(PERL_EXT)
 204 /* assumes get-magic and stringification have already occurred */
 205 PERL_STATIC_INLINE STRLEN
 206 S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
 207 {
 208     assert(mg->mg_type == PERL_MAGIC_regex_global);
 209     assert(mg->mg_len != -1);
 210     if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv))
 211         return (STRLEN)mg->mg_len;
 212     else {
 213         const STRLEN pos = (STRLEN)mg->mg_len;
 214         /* Without this check, we may read past the end of the buffer: */
 215         if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
 216         return sv_or_pv_pos_u2b(sv, s, pos, NULL);
 217     }
 218 }
 219 #endif
 220
 221 /* ------------------------------- pad.h ------------------------------ */
 222
 223 #if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
 224 PERL_STATIC_INLINE bool
 225 S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
 226 {
 227     PERL_ARGS_ASSERT_PADNAMEIN_SCOPE;
 228
 229     /* is seq within the range _LOW to _HIGH ?
 230      * This is complicated by the fact that PL_cop_seqmax
 231      * may have wrapped around at some point */
 232     if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
 233         return FALSE; /* not yet introduced */
 234
 235     if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
 236     /* in compiling scope */
 237         if (
 238             (seq >  COP_SEQ_RANGE_LOW(pn))
 239             ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
 240             : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
 241         )
 242             return TRUE;
 243     }
 244     else if (
 245         (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
 246         ?
 247             (  seq >  COP_SEQ_RANGE_LOW(pn)
 248             || seq <= COP_SEQ_RANGE_HIGH(pn))
 249
 250         :    (  seq >  COP_SEQ_RANGE_LOW(pn)
 251              && seq <= COP_SEQ_RANGE_HIGH(pn))
 252     )
 253         return TRUE;
 254     return FALSE;
 255 }
 256 #endif
 257
 258 /* ------------------------------- pp.h ------------------------------- */
 259
 260 PERL_STATIC_INLINE I32
 261 Perl_TOPMARK(pTHX)
 262 {
 263     DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
 264                                  "MARK top  %p %" IVdf "\n",
 265                                   PL_markstack_ptr,
 266                                   (IV)*PL_markstack_ptr)));
 267     return *PL_markstack_ptr;
 268 }
 269
 270 PERL_STATIC_INLINE I32
 271 Perl_POPMARK(pTHX)
 272 {
 273     DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
 274                                  "MARK pop  %p %" IVdf "\n",
 275                                   (PL_markstack_ptr-1),
 276                                   (IV)*(PL_markstack_ptr-1))));
 277     assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
 278     return *PL_markstack_ptr--;
 279 }
 280
 281 /* ----------------------------- regexp.h ----------------------------- */
 282
 283 PERL_STATIC_INLINE struct regexp *
 284 Perl_ReANY(const REGEXP * const re)
 285 {
 286     XPV* const p = (XPV*)SvANY(re);
 287
 288     PERL_ARGS_ASSERT_REANY;
 289     assert(isREGEXP(re));
 290
 291     return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
 292                                    : (struct regexp *)p;
 293 }
 294
 295 /* ------------------------------- sv.h ------------------------------- */
 296
 297 PERL_STATIC_INLINE bool
 298 Perl_SvTRUE(pTHX_ SV *sv)
 299 {
 300     PERL_ARGS_ASSERT_SVTRUE;
 301
 302     if (UNLIKELY(sv == NULL))
 303         return FALSE;
 304     SvGETMAGIC(sv);
 305     return SvTRUE_nomg_NN(sv);
 306 }
 307
 308 PERL_STATIC_INLINE bool
 309 Perl_SvTRUE_nomg(pTHX_ SV *sv)
 310 {
 311     PERL_ARGS_ASSERT_SVTRUE_NOMG;
 312
 313     if (UNLIKELY(sv == NULL))
 314         return FALSE;
 315     return SvTRUE_nomg_NN(sv);
 316 }
 317
 318 PERL_STATIC_INLINE bool
 319 Perl_SvTRUE_NN(pTHX_ SV *sv)
 320 {
 321     PERL_ARGS_ASSERT_SVTRUE_NN;
 322
 323     SvGETMAGIC(sv);
 324     return SvTRUE_nomg_NN(sv);
 325 }
 326
 327 PERL_STATIC_INLINE bool
 328 Perl_SvTRUE_common(pTHX_ SV * sv, const bool sv_2bool_is_fallback)
 329 {
 330     PERL_ARGS_ASSERT_SVTRUE_COMMON;
 331
 332     if (UNLIKELY(SvIMMORTAL_INTERP(sv)))
 333         return SvIMMORTAL_TRUE(sv);
 334
 335     if (! SvOK(sv))
 336         return FALSE;
 337
 338     if (SvPOK(sv))
 339         return SvPVXtrue(sv);
 340
 341     if (SvIOK(sv))
 342         return SvIVX(sv) != 0; /* casts to bool */
 343
 344     if (SvROK(sv) && !(SvOBJECT(SvRV(sv)) && HvAMAGIC(SvSTASH(SvRV(sv)))))
 345         return TRUE;
 346
 347     if (sv_2bool_is_fallback)
 348         return sv_2bool_nomg(sv);
 349
 350     return isGV_with_GP(sv);
 351 }
 352
 353
 354 PERL_STATIC_INLINE SV *
 355 Perl_SvREFCNT_inc(SV *sv)
 356 {
 357     if (LIKELY(sv != NULL))
 358         SvREFCNT(sv)++;
 359     return sv;
 360 }
 361 PERL_STATIC_INLINE SV *
 362 Perl_SvREFCNT_inc_NN(SV *sv)
 363 {
 364     PERL_ARGS_ASSERT_SVREFCNT_INC_NN;
 365
 366     SvREFCNT(sv)++;
 367     return sv;
 368 }
 369 PERL_STATIC_INLINE void
 370 Perl_SvREFCNT_inc_void(SV *sv)
 371 {
 372     if (LIKELY(sv != NULL))
 373         SvREFCNT(sv)++;
 374 }
 375 PERL_STATIC_INLINE void
 376 Perl_SvREFCNT_dec(pTHX_ SV *sv)
 377 {
 378     if (LIKELY(sv != NULL)) {
 379         U32 rc = SvREFCNT(sv);
 380         if (LIKELY(rc > 1))
 381             SvREFCNT(sv) = rc - 1;
 382         else
 383             Perl_sv_free2(aTHX_ sv, rc);
 384     }
 385 }
 386
 387 PERL_STATIC_INLINE void
 388 Perl_SvREFCNT_dec_NN(pTHX_ SV *sv)
 389 {
 390     U32 rc = SvREFCNT(sv);
 391
 392     PERL_ARGS_ASSERT_SVREFCNT_DEC_NN;
 393
 394     if (LIKELY(rc > 1))
 395         SvREFCNT(sv) = rc - 1;
 396     else
 397         Perl_sv_free2(aTHX_ sv, rc);
 398 }
 399
 400 PERL_STATIC_INLINE void
 401 Perl_SvAMAGIC_on(SV *sv)
 402 {
 403     PERL_ARGS_ASSERT_SVAMAGIC_ON;
 404     assert(SvROK(sv));
 405
 406     if (SvOBJECT(SvRV(sv))) HvAMAGIC_on(SvSTASH(SvRV(sv)));
 407 }
 408 PERL_STATIC_INLINE void
 409 Perl_SvAMAGIC_off(SV *sv)
 410 {
 411     PERL_ARGS_ASSERT_SVAMAGIC_OFF;
 412
 413     if (SvROK(sv) && SvOBJECT(SvRV(sv)))
 414         HvAMAGIC_off(SvSTASH(SvRV(sv)));
 415 }
 416
 417 PERL_STATIC_INLINE U32
 418 Perl_SvPADSTALE_on(SV *sv)
 419 {
 420     assert(!(SvFLAGS(sv) & SVs_PADTMP));
 421     return SvFLAGS(sv) |= SVs_PADSTALE;
 422 }
 423 PERL_STATIC_INLINE U32
 424 Perl_SvPADSTALE_off(SV *sv)
 425 {
 426     assert(!(SvFLAGS(sv) & SVs_PADTMP));
 427     return SvFLAGS(sv) &= ~SVs_PADSTALE;
 428 }
 429 #if defined(PERL_CORE) || defined (PERL_EXT)
 430 PERL_STATIC_INLINE STRLEN
 431 S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp)
 432 {
 433     PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B;
 434     if (SvGAMAGIC(sv)) {
 435         U8 *hopped = utf8_hop((U8 *)pv, pos);
 436         if (lenp) *lenp = (STRLEN)(utf8_hop(hopped, *lenp) - hopped);
 437         return (STRLEN)(hopped - (U8 *)pv);
 438     }
 439     return sv_pos_u2b_flags(sv,pos,lenp,SV_CONST_RETURN);
 440 }
 441 #endif
 442
 443 /* ------------------------------- utf8.h ------------------------------- */
 444
 445 /*
 446 =for apidoc_section $unicode
 447 */
 448
 449 PERL_STATIC_INLINE void
 450 Perl_append_utf8_from_native_byte(const U8 byte, U8** dest)
 451 {
 452     /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
 453      * encoded string at '*dest', updating '*dest' to include it */
 454
 455     PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
 456
 457     if (NATIVE_BYTE_IS_INVARIANT(byte))
 458         *((*dest)++) = byte;
 459     else {
 460         *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
 461         *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
 462     }
 463 }
 464
 465 /*
 466 =for apidoc valid_utf8_to_uvchr
 467 Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is
 468 known that the next character in the input UTF-8 string C<s> is well-formed
 469 (I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>.  Surrogates, non-character code
 470 points, and non-Unicode code points are allowed.
 471
 472 =cut
 473
 474  */
 475
 476 PERL_STATIC_INLINE UV
 477 Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
 478 {
 479     const UV expectlen = UTF8SKIP(s);
 480     const U8* send = s + expectlen;
 481     UV uv = *s;
 482
 483     PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
 484
 485     if (retlen) {
 486         *retlen = expectlen;
 487     }
 488
 489     /* An invariant is trivially returned */
 490     if (expectlen == 1) {
 491         return uv;
 492     }
 493
 494     /* Remove the leading bits that indicate the number of bytes, leaving just
 495      * the bits that are part of the value */
 496     uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
 497
 498     /* Now, loop through the remaining bytes, accumulating each into the
 499      * working total as we go.  (I khw tried unrolling the loop for up to 4
 500      * bytes, but there was no performance improvement) */
 501     for (++s; s < send; s++) {
 502         uv = UTF8_ACCUMULATE(uv, *s);
 503     }
 504
 505     return UNI_TO_NATIVE(uv);
 506
 507 }
 508
 509 /*
 510 =for apidoc is_utf8_invariant_string
 511
 512 Returns TRUE if the first C<len> bytes of the string C<s> are the same
 513 regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
 514 EBCDIC machines); otherwise it returns FALSE.  That is, it returns TRUE if they
 515 are UTF-8 invariant.  On ASCII-ish machines, all the ASCII characters and only
 516 the ASCII characters fit this definition.  On EBCDIC machines, the ASCII-range
 517 characters are invariant, but so also are the C1 controls.
 518
 519 If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
 520 use this option, that C<s> can't have embedded C<NUL> characters and has to
 521 have a terminating C<NUL> byte).
 522
 523 See also
 524 C<L</is_utf8_string>>,
 525 C<L</is_utf8_string_flags>>,
 526 C<L</is_utf8_string_loc>>,
 527 C<L</is_utf8_string_loc_flags>>,
 528 C<L</is_utf8_string_loclen>>,
 529 C<L</is_utf8_string_loclen_flags>>,
 530 C<L</is_utf8_fixed_width_buf_flags>>,
 531 C<L</is_utf8_fixed_width_buf_loc_flags>>,
 532 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
 533 C<L</is_strict_utf8_string>>,
 534 C<L</is_strict_utf8_string_loc>>,
 535 C<L</is_strict_utf8_string_loclen>>,
 536 C<L</is_c9strict_utf8_string>>,
 537 C<L</is_c9strict_utf8_string_loc>>,
 538 and
 539 C<L</is_c9strict_utf8_string_loclen>>.
 540
 541 =cut
 542
 543 */
 544
 545 #define is_utf8_invariant_string(s, len)                                    \
 546                                 is_utf8_invariant_string_loc(s, len, NULL)
 547
 548 /*
 549 =for apidoc is_utf8_invariant_string_loc
 550
 551 Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
 552 the first UTF-8 variant character in the C<ep> pointer; if all characters are
 553 UTF-8 invariant, this function does not change the contents of C<*ep>.
 554
 555 =cut
 556
 557 */
 558
 559 PERL_STATIC_INLINE bool
 560 Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
 561 {
 562     const U8* send;
 563     const U8* x = s;
 564
 565     PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
 566
 567     if (len == 0) {
 568         len = strlen((const char *)s);
 569     }
 570
 571     send = s + len;
 572
 573 /* This looks like 0x010101... */
 574 #  define PERL_COUNT_MULTIPLIER   (~ (UINTMAX_C(0)) / 0xFF)
 575
 576 /* This looks like 0x808080... */
 577 #  define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
 578 #  define PERL_WORDSIZE            sizeof(PERL_UINTMAX_T)
 579 #  define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
 580
 581 /* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
 582  * or'ing together the lowest bits of 'x'.  Hopefully the final term gets
 583  * optimized out completely on a 32-bit system, and its mask gets optimized out
 584  * on a 64-bit system */
 585 #  define PERL_IS_SUBWORD_ADDR(x) (1 & (       PTR2nat(x)                     \
 586                                       |   (  PTR2nat(x) >> 1)                 \
 587                                       | ( ( (PTR2nat(x)                       \
 588                                            & PERL_WORD_BOUNDARY_MASK) >> 2))))
 589
 590 #ifndef EBCDIC
 591
 592     /* Do the word-at-a-time iff there is at least one usable full word.  That
 593      * means that after advancing to a word boundary, there still is at least a
 594      * full word left.  The number of bytes needed to advance is 'wordsize -
 595      * offset' unless offset is 0. */
 596     if ((STRLEN) (send - x) >= PERL_WORDSIZE
 597
 598                             /* This term is wordsize if subword; 0 if not */
 599                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
 600
 601                             /* 'offset' */
 602                           - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
 603     {
 604
 605         /* Process per-byte until reach word boundary.  XXX This loop could be
 606          * eliminated if we knew that this platform had fast unaligned reads */
 607         while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
 608             if (! UTF8_IS_INVARIANT(*x)) {
 609                 if (ep) {
 610                     *ep = x;
 611                 }
 612
 613                 return FALSE;
 614             }
 615             x++;
 616         }
 617
 618         /* Here, we know we have at least one full word to process.  Process
 619          * per-word as long as we have at least a full word left */
 620         do {
 621             if ((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK)  {
 622
 623                 /* Found a variant.  Just return if caller doesn't want its
 624                  * exact position */
 625                 if (! ep) {
 626                     return FALSE;
 627                 }
 628
 629 #  if   BYTEORDER == 0x1234 || BYTEORDER == 0x12345678    \
 630      || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
 631
 632                 *ep = x + variant_byte_number(* (PERL_UINTMAX_T *) x);
 633                 assert(*ep >= s && *ep < send);
 634
 635                 return FALSE;
 636
 637 #  else   /* If weird byte order, drop into next loop to do byte-at-a-time
 638            checks. */
 639
 640                 break;
 641 #  endif
 642             }
 643
 644             x += PERL_WORDSIZE;
 645
 646         } while (x + PERL_WORDSIZE <= send);
 647     }
 648
 649 #endif      /* End of ! EBCDIC */
 650
 651     /* Process per-byte */
 652     while (x < send) {
 653         if (! UTF8_IS_INVARIANT(*x)) {
 654             if (ep) {
 655                 *ep = x;
 656             }
 657
 658             return FALSE;
 659         }
 660
 661         x++;
 662     }
 663
 664     return TRUE;
 665 }
 666
 667 /* See if the platform has builtins for finding the most/least significant bit,
 668  * and which one is right for using on 32 and 64 bit operands */
 669 #if (__has_builtin(__builtin_clz) || PERL_GCC_VERSION_GE(3,4,0))
 670 #  if U32SIZE == INTSIZE
 671 #    define PERL_CLZ_32 __builtin_clz
 672 #  endif
 673 #  if defined(U64TYPE) && U64SIZE == INTSIZE
 674 #    define PERL_CLZ_64 __builtin_clz
 675 #  endif
 676 #endif
 677 #if (__has_builtin(__builtin_ctz) || PERL_GCC_VERSION_GE(3,4,0))
 678 #  if U32SIZE == INTSIZE
 679 #    define PERL_CTZ_32 __builtin_ctz
 680 #  endif
 681 #  if defined(U64TYPE) && U64SIZE == INTSIZE
 682 #    define PERL_CTZ_64 __builtin_ctz
 683 #  endif
 684 #endif
 685
 686 #if (__has_builtin(__builtin_clzl) || PERL_GCC_VERSION_GE(3,4,0))
 687 #  if U32SIZE == LONGSIZE && ! defined(PERL_CLZ_32)
 688 #    define PERL_CLZ_32 __builtin_clzl
 689 #  endif
 690 #  if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CLZ_64)
 691 #    define PERL_CLZ_64 __builtin_clzl
 692 #  endif
 693 #endif
 694 #if (__has_builtin(__builtin_ctzl) || PERL_GCC_VERSION_GE(3,4,0))
 695 #  if U32SIZE == LONGSIZE && ! defined(PERL_CTZ_32)
 696 #    define PERL_CTZ_32 __builtin_ctzl
 697 #  endif
 698 #  if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CTZ_64)
 699 #    define PERL_CTZ_64 __builtin_ctzl
 700 #  endif
 701 #endif
 702
 703 #if (__has_builtin(__builtin_clzll) || PERL_GCC_VERSION_GE(3,4,0))
 704 #  if U32SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_32)
 705 #    define PERL_CLZ_32 __builtin_clzll
 706 #  endif
 707 #  if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_64)
 708 #    define PERL_CLZ_64 __builtin_clzll
 709 #  endif
 710 #endif
 711 #if (__has_builtin(__builtin_ctzll) || PERL_GCC_VERSION_GE(3,4,0))
 712 #  if U32SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_32)
 713 #    define PERL_CTZ_32 __builtin_ctzll
 714 #  endif
 715 #  if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_64)
 716 #    define PERL_CTZ_64 __builtin_ctzll
 717 #  endif
 718 #endif
 719
 720 #if defined(_MSC_VER) && _MSC_VER >= 1400
 721 #  include <intrin.h>
 722 #  pragma intrinsic(_BitScanForward)
 723 #  pragma intrinsic(_BitScanReverse)
 724 #  ifdef _WIN64
 725 #    pragma intrinsic(_BitScanForward64)
 726 #    pragma intrinsic(_BitScanReverse64)
 727 #  endif
 728 #endif
 729
 730 /* The reason there are not checks to see if ffs() and ffsl() are available for
 731  * determining the lsb, is because these don't improve on the deBruijn method
 732  * fallback, which is just a branchless integer multiply, array element
 733  * retrieval, and shift.  The others, even if the function call overhead is
 734  * optimized out, have to cope with the possibility of the input being all
 735  * zeroes, and almost certainly will have conditionals for this eventuality.
 736  * khw, at the time of this commit, looked at the source for both gcc and clang
 737  * to verify this.  (gcc used a method inferior to deBruijn.) */
 738
 739 /* Below are functions to find the first, last, or only set bit in a word.  On
 740  * platforms with 64-bit capability, there is a pair for each operation; the
 741  * first taking a 64 bit operand, and the second a 32 bit one.  The logic is
 742  * the same in each pair, so the second is stripped of most comments. */
 743
 744 #ifdef U64TYPE  /* HAS_QUAD not usable outside the core */
 745
 746 PERL_STATIC_INLINE unsigned
 747 Perl_lsbit_pos64(U64 word)
 748 {
 749     /* Find the position (0..63) of the least significant set bit in the input
 750      * word */
 751
 752     ASSUME(word != 0);
 753
 754     /* If we can determine that the platform has a usable fast method to get
 755      * this info, use that */
 756
 757 #  if defined(PERL_CTZ_64)
 758 #    define PERL_HAS_FAST_GET_LSB_POS64
 759
 760     return (unsigned) PERL_CTZ_64(word);
 761
 762 #  elif U64SIZE == 8 && defined(_MSC_VER) && _MSC_VER >= 1400
 763 #    define PERL_HAS_FAST_GET_LSB_POS64
 764
 765     {
 766         unsigned long index;
 767         _BitScanForward64(&index, word);
 768         return (unsigned)index;
 769     }
 770
 771 #  else
 772
 773     /* Here, we didn't find a fast method for finding the lsb.  Fall back to
 774      * making the lsb the only set bit in the word, and use our function that
 775      * works on words with a single bit set.
 776      *
 777      * Isolate the lsb;
 778      * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
 779      *
 780      * The word will look like this, with a rightmost set bit in position 's':
 781      * ('x's are don't cares, and 'y's are their complements)
 782      *      s
 783      *  x..x100..00
 784      *  y..y011..11      Complement
 785      *  y..y100..00      Add 1
 786      *  0..0100..00      And with the original
 787      *
 788      *  (Yes, complementing and adding 1 is just taking the negative on 2's
 789      *  complement machines, but not on 1's complement ones, and some compilers
 790      *  complain about negating an unsigned.)
 791      */
 792     return single_1bit_pos64(word & (~word + 1));
 793
 794 #  endif
 795
 796 }
 797
 798 #  define lsbit_pos_uintmax_(word) lsbit_pos64(word)
 799 #else   /* ! QUAD */
 800 #  define lsbit_pos_uintmax_(word) lsbit_pos32(word)
 801 #endif
 802
 803 PERL_STATIC_INLINE unsigned     /* Like above for 32 bit word */
 804 Perl_lsbit_pos32(U32 word)
 805 {
 806     /* Find the position (0..31) of the least significant set bit in the input
 807      * word */
 808
 809     ASSUME(word != 0);
 810
 811 #if defined(PERL_CTZ_32)
 812 #  define PERL_HAS_FAST_GET_LSB_POS32
 813
 814     return (unsigned) PERL_CTZ_32(word);
 815
 816 #elif U32SIZE == 4 && defined(_MSC_VER) && _MSC_VER >= 1400
 817 #  define PERL_HAS_FAST_GET_LSB_POS32
 818
 819     {
 820         unsigned long index;
 821         _BitScanForward(&index, word);
 822         return (unsigned)index;
 823     }
 824
 825 #else
 826
 827     return single_1bit_pos32(word & (~word + 1));
 828
 829 #endif
 830
 831 }
 832
 833
 834 /* Convert the leading zeros count to the bit position of the first set bit.
 835  * This just subtracts from the highest position, 31 or 63.  But some compilers
 836  * don't optimize this optimally, and so a bit of bit twiddling encourages them
 837  * to do the right thing.  It turns out that subtracting a smaller non-negative
 838  * number 'x' from 2**n-1 for any n is the same as taking the exclusive-or of
 839  * the two numbers.  To see why, first note that the sum of any number, x, and
 840  * its complement, x', is all ones.  So all ones minus x is x'.  Then note that
 841  * the xor of x and all ones is x'. */
 842 #define LZC_TO_MSBIT_POS_(size, lzc)  ((size##SIZE * CHARBITS - 1) ^ (lzc))
 843
 844 #ifdef U64TYPE  /* HAS_QUAD not usable outside the core */
 845
 846 PERL_STATIC_INLINE unsigned
 847 Perl_msbit_pos64(U64 word)
 848 {
 849     /* Find the position (0..63) of the most significant set bit in the input
 850      * word */
 851
 852     ASSUME(word != 0);
 853
 854     /* If we can determine that the platform has a usable fast method to get
 855      * this, use that */
 856
 857 #  if defined(PERL_CLZ_64)
 858 #    define PERL_HAS_FAST_GET_MSB_POS64
 859
 860     return (unsigned) LZC_TO_MSBIT_POS_(U64, PERL_CLZ_64(word));
 861
 862 #  elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER) && _MSC_VER >= 1400
 863 #    define PERL_HAS_FAST_GET_MSB_POS64
 864
 865     {
 866         unsigned long index;
 867         _BitScanReverse64(&index, word);
 868         return (unsigned)index;
 869     }
 870
 871 #  else
 872
 873     /* Here, we didn't find a fast method for finding the msb.  Fall back to
 874      * making the msb the only set bit in the word, and use our function that
 875      * works on words with a single bit set.
 876      *
 877      * Isolate the msb; http://codeforces.com/blog/entry/10330
 878      *
 879      * Only the most significant set bit matters.  Or'ing word with its right
 880      * shift of 1 makes that bit and the next one to its right both 1.
 881      * Repeating that with the right shift of 2 makes for 4 1-bits in a row.
 882      * ...  We end with the msb and all to the right being 1. */
 883     word |= (word >>  1);
 884     word |= (word >>  2);
 885     word |= (word >>  4);
 886     word |= (word >>  8);
 887     word |= (word >> 16);
 888     word |= (word >> 32);
 889
 890     /* Then subtracting the right shift by 1 clears all but the left-most of
 891      * the 1 bits, which is our desired result */
 892     word -= (word >> 1);
 893
 894     /* Now we have a single bit set */
 895     return single_1bit_pos64(word);
 896
 897 #  endif
 898
 899 }
 900
 901 #  define msbit_pos_uintmax_(word) msbit_pos64(word)
 902 #else   /* ! QUAD */
 903 #  define msbit_pos_uintmax_(word) msbit_pos32(word)
 904 #endif
 905
 906 PERL_STATIC_INLINE unsigned
 907 Perl_msbit_pos32(U32 word)
 908 {
 909     /* Find the position (0..31) of the most significant set bit in the input
 910      * word */
 911
 912     ASSUME(word != 0);
 913
 914 #if defined(PERL_CLZ_32)
 915 #  define PERL_HAS_FAST_GET_MSB_POS32
 916
 917     return (unsigned) LZC_TO_MSBIT_POS_(U32, PERL_CLZ_32(word));
 918
 919 #elif U32SIZE == 4 && defined(_MSC_VER) && _MSC_VER >= 1400
 920 #  define PERL_HAS_FAST_GET_MSB_POS32
 921
 922     {
 923         unsigned long index;
 924         _BitScanReverse(&index, word);
 925         return (unsigned)index;
 926     }
 927
 928 #else
 929
 930     word |= (word >>  1);
 931     word |= (word >>  2);
 932     word |= (word >>  4);
 933     word |= (word >>  8);
 934     word |= (word >> 16);
 935     word -= (word >> 1);
 936     return single_1bit_pos32(word);
 937
 938 #endif
 939
 940 }
 941
 942 #if UVSIZE == U64SIZE
 943 #  define msbit_pos(word)  msbit_pos64(word)
 944 #  define lsbit_pos(word)  lsbit_pos64(word)
 945 #elif UVSIZE == U32SIZE
 946 #  define msbit_pos(word)  msbit_pos32(word)
 947 #  define lsbit_pos(word)  lsbit_pos32(word)
 948 #endif
 949
 950 #ifdef U64TYPE  /* HAS_QUAD not usable outside the core */
 951
 952 PERL_STATIC_INLINE unsigned
 953 Perl_single_1bit_pos64(U64 word)
 954 {
 955     /* Given a 64-bit word known to contain all zero bits except one 1 bit,
 956      * find and return the 1's position: 0..63 */
 957
 958 #  ifdef PERL_CORE    /* macro not exported */
 959     ASSUME(isPOWER_OF_2(word));
 960 #  else
 961     ASSUME(word && (word & (word-1)) == 0);
 962 #  endif
 963
 964     /* The only set bit is both the most and least significant bit.  If we have
 965      * a fast way of finding either one, use that.
 966      *
 967      * It may appear at first glance that those functions call this one, but
 968      * they don't if the corresponding #define is set */
 969
 970 #  ifdef PERL_HAS_FAST_GET_MSB_POS64
 971
 972     return msbit_pos64(word);
 973
 974 #  elif defined(PERL_HAS_FAST_GET_LSB_POS64)
 975
 976     return lsbit_pos64(word);
 977
 978 #  else
 979
 980     /* The position of the only set bit in a word can be quickly calculated
 981      * using deBruijn sequences.  See for example
 982      * https://en.wikipedia.org/wiki/De_Bruijn_sequence */
 983     return PL_deBruijn_bitpos_tab64[(word * PERL_deBruijnMagic64_)
 984                                                     >> PERL_deBruijnShift64_];
 985 #  endif
 986
 987 }
 988
 989 #endif
 990
 991 PERL_STATIC_INLINE unsigned
 992 Perl_single_1bit_pos32(U32 word)
 993 {
 994     /* Given a 32-bit word known to contain all zero bits except one 1 bit,
 995      * find and return the 1's position: 0..31 */
 996
 997 #ifdef PERL_CORE    /* macro not exported */
 998     ASSUME(isPOWER_OF_2(word));
 999 #else
1000     ASSUME(word && (word & (word-1)) == 0);
1001 #endif
1002 #ifdef PERL_HAS_FAST_GET_MSB_POS32
1003
1004     return msbit_pos32(word);
1005
1006 #elif defined(PERL_HAS_FAST_GET_LSB_POS32)
1007
1008     return lsbit_pos32(word);
1009
1010 /* Unlikely, but possible for the platform to have a wider fast operation but
1011  * not a narrower one.  But easy enough to handle the case by widening the
1012  * parameter size.  (Going the other way, emulating 64 bit by two 32 bit ops
1013  * would be slower than the deBruijn method.) */
1014 #elif defined(PERL_HAS_FAST_GET_MSB_POS64)
1015
1016     return msbit_pos64(word);
1017
1018 #elif defined(PERL_HAS_FAST_GET_LSB_POS64)
1019
1020     return lsbit_pos64(word);
1021
1022 #else
1023
1024     return PL_deBruijn_bitpos_tab32[(word * PERL_deBruijnMagic32_)
1025                                                     >> PERL_deBruijnShift32_];
1026 #endif
1027
1028 }
1029
1030 #ifndef EBCDIC
1031
1032 PERL_STATIC_INLINE unsigned int
1033 Perl_variant_byte_number(PERL_UINTMAX_T word)
1034 {
1035     /* This returns the position in a word (0..7) of the first variant byte in
1036      * it.  This is a helper function.  Note that there are no branches */
1037
1038     /* Get just the msb bits of each byte */
1039     word &= PERL_VARIANTS_WORD_MASK;
1040
1041     /* This should only be called if we know there is a variant byte in the
1042      * word */
1043     assert(word);
1044
1045 #  if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
1046
1047     /* Bytes are stored like
1048      *  Byte8 ... Byte2 Byte1
1049      *  63..56...15...8 7...0
1050      * so getting the lsb of the whole modified word is getting the msb of the
1051      * first byte that has its msb set */
1052     word = lsbit_pos_uintmax_(word);
1053
1054     /* Here, word contains the position 7,15,23,...55,63 of that bit.  Convert
1055      * to 0..7 */
1056     return (unsigned int) ((word + 1) >> 3) - 1;
1057
1058 #  elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
1059
1060     /* Bytes are stored like
1061      *  Byte1 Byte2  ... Byte8
1062      * 63..56 55..47 ... 7...0
1063      * so getting the msb of the whole modified word is getting the msb of the
1064      * first byte that has its msb set */
1065     word = msbit_pos_uintmax_(word);
1066
1067     /* Here, word contains the position 63,55,...,23,15,7 of that bit.  Convert
1068      * to 0..7 */
1069     word = ((word + 1) >> 3) - 1;
1070
1071     /* And invert the result because of the reversed byte order on this
1072      * platform */
1073     word = CHARBITS - word - 1;
1074
1075     return (unsigned int) word;
1076
1077 #  else
1078 #    error Unexpected byte order
1079 #  endif
1080
1081 }
1082
1083 #endif
1084 #if defined(PERL_CORE) || defined(PERL_EXT)
1085
1086 /*
1087 =for apidoc variant_under_utf8_count
1088
1089 This function looks at the sequence of bytes between C<s> and C<e>, which are
1090 assumed to be encoded in ASCII/Latin1, and returns how many of them would
1091 change should the string be translated into UTF-8.  Due to the nature of UTF-8,
1092 each of these would occupy two bytes instead of the single one in the input
1093 string.  Thus, this function returns the precise number of bytes the string
1094 would expand by when translated to UTF-8.
1095
1096 Unlike most of the other functions that have C<utf8> in their name, the input
1097 to this function is NOT a UTF-8-encoded string.  The function name is slightly
1098 I<odd> to emphasize this.
1099
1100 This function is internal to Perl because khw thinks that any XS code that
1101 would want this is probably operating too close to the internals.  Presenting a
1102 valid use case could change that.
1103
1104 See also
1105 C<L<perlapi/is_utf8_invariant_string>>
1106 and
1107 C<L<perlapi/is_utf8_invariant_string_loc>>,
1108
1109 =cut
1110
1111 */
1112
1113 PERL_STATIC_INLINE Size_t
1114 S_variant_under_utf8_count(const U8* const s, const U8* const e)
1115 {
1116     const U8* x = s;
1117     Size_t count = 0;
1118
1119     PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
1120
1121 #  ifndef EBCDIC
1122
1123     /* Test if the string is long enough to use word-at-a-time.  (Logic is the
1124      * same as for is_utf8_invariant_string()) */
1125     if ((STRLEN) (e - x) >= PERL_WORDSIZE
1126                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
1127                           - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
1128     {
1129
1130         /* Process per-byte until reach word boundary.  XXX This loop could be
1131          * eliminated if we knew that this platform had fast unaligned reads */
1132         while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
1133             count += ! UTF8_IS_INVARIANT(*x++);
1134         }
1135
1136         /* Process per-word as long as we have at least a full word left */
1137         do {    /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
1138                    explanation of how this works */
1139             PERL_UINTMAX_T increment
1140                 = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
1141                       * PERL_COUNT_MULTIPLIER)
1142                     >> ((PERL_WORDSIZE - 1) * CHARBITS);
1143             count += (Size_t) increment;
1144             x += PERL_WORDSIZE;
1145         } while (x + PERL_WORDSIZE <= e);
1146     }
1147
1148 #  endif
1149
1150     /* Process per-byte */
1151     while (x < e) {
1152         if (! UTF8_IS_INVARIANT(*x)) {
1153             count++;
1154         }
1155
1156         x++;
1157     }
1158
1159     return count;
1160 }
1161
1162 #endif
1163
1164 #ifndef PERL_IN_REGEXEC_C   /* Keep  these around for that file */
1165 #  undef PERL_WORDSIZE
1166 #  undef PERL_COUNT_MULTIPLIER
1167 #  undef PERL_WORD_BOUNDARY_MASK
1168 #  undef PERL_VARIANTS_WORD_MASK
1169 #endif
1170
1171 /*
1172 =for apidoc is_utf8_string
1173
1174 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1175 Perl-extended-UTF-8 string; returns FALSE otherwise.  If C<len> is 0, it will
1176 be calculated using C<strlen(s)> (which means if you use this option, that C<s>
1177 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
1178 byte).  Note that all characters being ASCII constitute 'a valid UTF-8 string'.
1179
1180 This function considers Perl's extended UTF-8 to be valid.  That means that
1181 code points above Unicode, surrogates, and non-character code points are
1182 considered valid by this function.  Use C<L</is_strict_utf8_string>>,
1183 C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
1184 code points are considered valid.
1185
1186 See also
1187 C<L</is_utf8_invariant_string>>,
1188 C<L</is_utf8_invariant_string_loc>>,
1189 C<L</is_utf8_string_loc>>,
1190 C<L</is_utf8_string_loclen>>,
1191 C<L</is_utf8_fixed_width_buf_flags>>,
1192 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1193 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1194
1195 =cut
1196 */
1197
1198 #define is_utf8_string(s, len)  is_utf8_string_loclen(s, len, NULL, NULL)
1199
1200 #if defined(PERL_CORE) || defined (PERL_EXT)
1201
1202 /*
1203 =for apidoc is_utf8_non_invariant_string
1204
1205 Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
1206 C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
1207 UTF-8; otherwise returns FALSE.
1208
1209 A TRUE return means that at least one code point represented by the sequence
1210 either is a wide character not representable as a single byte, or the
1211 representation differs depending on whether the sequence is encoded in UTF-8 or
1212 not.
1213
1214 See also
1215 C<L<perlapi/is_utf8_invariant_string>>,
1216 C<L<perlapi/is_utf8_string>>
1217
1218 =cut
1219
1220 This is commonly used to determine if a SV's UTF-8 flag should be turned on.
1221 It generally needn't be if its string is entirely UTF-8 invariant, and it
1222 shouldn't be if it otherwise contains invalid UTF-8.
1223
1224 It is an internal function because khw thinks that XS code shouldn't be working
1225 at this low a level.  A valid use case could change that.
1226
1227 */
1228
1229 PERL_STATIC_INLINE bool
1230 Perl_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
1231 {
1232     const U8 * first_variant;
1233
1234     PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
1235
1236     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1237         return FALSE;
1238     }
1239
1240     return is_utf8_string(first_variant, len - (first_variant - s));
1241 }
1242
1243 #endif
1244
1245 /*
1246 =for apidoc is_strict_utf8_string
1247
1248 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1249 UTF-8-encoded string that is fully interchangeable by any application using
1250 Unicode rules; otherwise it returns FALSE.  If C<len> is 0, it will be
1251 calculated using C<strlen(s)> (which means if you use this option, that C<s>
1252 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
1253 byte).  Note that all characters being ASCII constitute 'a valid UTF-8 string'.
1254
1255 This function returns FALSE for strings containing any
1256 code points above the Unicode max of 0x10FFFF, surrogate code points, or
1257 non-character code points.
1258
1259 See also
1260 C<L</is_utf8_invariant_string>>,
1261 C<L</is_utf8_invariant_string_loc>>,
1262 C<L</is_utf8_string>>,
1263 C<L</is_utf8_string_flags>>,
1264 C<L</is_utf8_string_loc>>,
1265 C<L</is_utf8_string_loc_flags>>,
1266 C<L</is_utf8_string_loclen>>,
1267 C<L</is_utf8_string_loclen_flags>>,
1268 C<L</is_utf8_fixed_width_buf_flags>>,
1269 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1270 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1271 C<L</is_strict_utf8_string_loc>>,
1272 C<L</is_strict_utf8_string_loclen>>,
1273 C<L</is_c9strict_utf8_string>>,
1274 C<L</is_c9strict_utf8_string_loc>>,
1275 and
1276 C<L</is_c9strict_utf8_string_loclen>>.
1277
1278 =cut
1279 */
1280
1281 #define is_strict_utf8_string(s, len)  is_strict_utf8_string_loclen(s, len, NULL, NULL)
1282
1283 /*
1284 =for apidoc is_c9strict_utf8_string
1285
1286 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1287 UTF-8-encoded string that conforms to
1288 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
1289 otherwise it returns FALSE.  If C<len> is 0, it will be calculated using
1290 C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
1291 C<NUL> characters and has to have a terminating C<NUL> byte).  Note that all
1292 characters being ASCII constitute 'a valid UTF-8 string'.
1293
1294 This function returns FALSE for strings containing any code points above the
1295 Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
1296 code points per
1297 L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1298
1299 See also
1300 C<L</is_utf8_invariant_string>>,
1301 C<L</is_utf8_invariant_string_loc>>,
1302 C<L</is_utf8_string>>,
1303 C<L</is_utf8_string_flags>>,
1304 C<L</is_utf8_string_loc>>,
1305 C<L</is_utf8_string_loc_flags>>,
1306 C<L</is_utf8_string_loclen>>,
1307 C<L</is_utf8_string_loclen_flags>>,
1308 C<L</is_utf8_fixed_width_buf_flags>>,
1309 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1310 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1311 C<L</is_strict_utf8_string>>,
1312 C<L</is_strict_utf8_string_loc>>,
1313 C<L</is_strict_utf8_string_loclen>>,
1314 C<L</is_c9strict_utf8_string_loc>>,
1315 and
1316 C<L</is_c9strict_utf8_string_loclen>>.
1317
1318 =cut
1319 */
1320
1321 #define is_c9strict_utf8_string(s, len)  is_c9strict_utf8_string_loclen(s, len, NULL, 0)
1322
1323 /*
1324 =for apidoc is_utf8_string_flags
1325
1326 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1327 UTF-8 string, subject to the restrictions imposed by C<flags>;
1328 returns FALSE otherwise.  If C<len> is 0, it will be calculated
1329 using C<strlen(s)> (which means if you use this option, that C<s> can't have
1330 embedded C<NUL> characters and has to have a terminating C<NUL> byte).  Note
1331 that all characters being ASCII constitute 'a valid UTF-8 string'.
1332
1333 If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
1334 C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
1335 as C<L</is_strict_utf8_string>>; and if C<flags> is
1336 C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
1337 C<L</is_c9strict_utf8_string>>.  Otherwise C<flags> may be any
1338 combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
1339 C<L</utf8n_to_uvchr>>, with the same meanings.
1340
1341 See also
1342 C<L</is_utf8_invariant_string>>,
1343 C<L</is_utf8_invariant_string_loc>>,
1344 C<L</is_utf8_string>>,
1345 C<L</is_utf8_string_loc>>,
1346 C<L</is_utf8_string_loc_flags>>,
1347 C<L</is_utf8_string_loclen>>,
1348 C<L</is_utf8_string_loclen_flags>>,
1349 C<L</is_utf8_fixed_width_buf_flags>>,
1350 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1351 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1352 C<L</is_strict_utf8_string>>,
1353 C<L</is_strict_utf8_string_loc>>,
1354 C<L</is_strict_utf8_string_loclen>>,
1355 C<L</is_c9strict_utf8_string>>,
1356 C<L</is_c9strict_utf8_string_loc>>,
1357 and
1358 C<L</is_c9strict_utf8_string_loclen>>.
1359
1360 =cut
1361 */
1362
1363 PERL_STATIC_INLINE bool
1364 Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
1365 {
1366     const U8 * first_variant;
1367
1368     PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
1369     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1370                           |UTF8_DISALLOW_PERL_EXTENDED)));
1371
1372     if (len == 0) {
1373         len = strlen((const char *)s);
1374     }
1375
1376     if (flags == 0) {
1377         return is_utf8_string(s, len);
1378     }
1379
1380     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1381                                         == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1382     {
1383         return is_strict_utf8_string(s, len);
1384     }
1385
1386     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1387                                        == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1388     {
1389         return is_c9strict_utf8_string(s, len);
1390     }
1391
1392     if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
1393         const U8* const send = s + len;
1394         const U8* x = first_variant;
1395
1396         while (x < send) {
1397             STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1398             if (UNLIKELY(! cur_len)) {
1399                 return FALSE;
1400             }
1401             x += cur_len;
1402         }
1403     }
1404
1405     return TRUE;
1406 }
1407
1408 /*
1409
1410 =for apidoc is_utf8_string_loc
1411
1412 Like C<L</is_utf8_string>> but stores the location of the failure (in the
1413 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1414 "utf8ness success") in the C<ep> pointer.
1415
1416 See also C<L</is_utf8_string_loclen>>.
1417
1418 =cut
1419 */
1420
1421 #define is_utf8_string_loc(s, len, ep)  is_utf8_string_loclen(s, len, ep, 0)
1422
1423 /*
1424
1425 =for apidoc is_utf8_string_loclen
1426
1427 Like C<L</is_utf8_string>> but stores the location of the failure (in the
1428 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1429 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1430 encoded characters in the C<el> pointer.
1431
1432 See also C<L</is_utf8_string_loc>>.
1433
1434 =cut
1435 */
1436
1437 PERL_STATIC_INLINE bool
1438 Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1439 {
1440     const U8 * first_variant;
1441
1442     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
1443
1444     if (len == 0) {
1445         len = strlen((const char *) s);
1446     }
1447
1448     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1449         if (el)
1450             *el = len;
1451
1452         if (ep) {
1453             *ep = s + len;
1454         }
1455
1456         return TRUE;
1457     }
1458
1459     {
1460         const U8* const send = s + len;
1461         const U8* x = first_variant;
1462         STRLEN outlen = first_variant - s;
1463
1464         while (x < send) {
1465             const STRLEN cur_len = isUTF8_CHAR(x, send);
1466             if (UNLIKELY(! cur_len)) {
1467                 break;
1468             }
1469             x += cur_len;
1470             outlen++;
1471         }
1472
1473         if (el)
1474             *el = outlen;
1475
1476         if (ep) {
1477             *ep = x;
1478         }
1479
1480         return (x == send);
1481     }
1482 }
1483
1484 /*
1485  * DFA for checking input is valid UTF-8 syntax.
1486  *
1487  * This uses adaptations of the table and algorithm given in
1488  * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1489  * documentation of the original version.  A copyright notice for the original
1490  * version is given at the beginning of this file.  The Perl adapations are
1491  * documented at the definition of PL_extended_utf8_dfa_tab[].
1492  *
1493  * This dfa is fast.  There are three exit conditions:
1494  *  1) a well-formed code point, acceptable to the table
1495  *  2) the beginning bytes of an incomplete character, whose completion might
1496  *     or might not be acceptable
1497  *  3) unacceptable to the table.  Some of the adaptations have certain,
1498  *     hopefully less likely to occur, legal inputs be unacceptable to the
1499  *     table, so these must be sorted out afterwards.
1500  *
1501  * This macro is a complete implementation of the code executing the DFA.  It
1502  * is passed the input sequence bounds and the table to use, and what to do
1503  * for each of the exit conditions.  There are three canned actions, likely to
1504  * be the ones you want:
1505  *      DFA_RETURN_SUCCESS_
1506  *      DFA_RETURN_FAILURE_
1507  *      DFA_GOTO_TEASE_APART_FF_
1508  *
1509  * You pass a parameter giving the action to take for each of the three
1510  * possible exit conditions:
1511  *
1512  * 'accept_action'  This is executed when the DFA accepts the input.
1513  *                  DFA_RETURN_SUCCESS_ is the most likely candidate.
1514  * 'reject_action'  This is executed when the DFA rejects the input.
1515  *                  DFA_RETURN_FAILURE_ is a candidate, or 'goto label' where
1516  *                  you have written code to distinguish the rejecting state
1517  *                  results.  Because it happens in several places, and
1518  *                  involves #ifdefs, the special action
1519  *                  DFA_GOTO_TEASE_APART_FF_ is what you want with
1520  *                  PL_extended_utf8_dfa_tab.  On platforms without
1521  *                  EXTRA_LONG_UTF8, there is no need to tease anything apart,
1522  *                  so this evaluates to DFA_RETURN_FAILURE_; otherwise you
1523  *                  need to have a label 'tease_apart_FF' that it will transfer
1524  *                  to.
1525  * 'incomplete_char_action'  This is executed when the DFA ran off the end
1526  *                  before accepting or rejecting the input.
1527  *                  DFA_RETURN_FAILURE_ is the likely action, but you could
1528  *                  have a 'goto', or NOOP.  In the latter case the DFA drops
1529  *                  off the end, and you place your code to handle this case
1530  *                  immediately after it.
1531  */
1532
1533 #define DFA_RETURN_SUCCESS_      return s - s0
1534 #define DFA_RETURN_FAILURE_      return 0
1535 #ifdef HAS_EXTRA_LONG_UTF8
1536 #  define DFA_TEASE_APART_FF_  goto tease_apart_FF
1537 #else
1538 #  define DFA_TEASE_APART_FF_  DFA_RETURN_FAILURE_
1539 #endif
1540
1541 #define PERL_IS_UTF8_CHAR_DFA(s0, e, dfa_tab,                               \
1542                               accept_action,                                \
1543                               reject_action,                                \
1544                               incomplete_char_action)                       \
1545     STMT_START {                                                            \
1546         const U8 * s = s0;                                                  \
1547         UV state = 0;                                                       \
1548                                                                             \
1549         while (s < e) {                                                     \
1550             state = dfa_tab[256 + state + dfa_tab[*s]];                     \
1551             s++;                                                            \
1552                                                                             \
1553             if (state == 0) {   /* Accepting state */                       \
1554                 accept_action;                                              \
1555             }                                                               \
1556                                                                             \
1557             if (UNLIKELY(state == 1)) { /* Rejecting state */               \
1558                 reject_action;                                              \
1559             }                                                               \
1560         }                                                                   \
1561                                                                             \
1562         /* Here, dropped out of loop before end-of-char */                  \
1563         incomplete_char_action;                                             \
1564     } STMT_END
1565
1566
1567 /*
1568
1569 =for apidoc isUTF8_CHAR
1570
1571 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1572 looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
1573 that represents some code point; otherwise it evaluates to 0.  If non-zero, the
1574 value gives how many bytes starting at C<s> comprise the code point's
1575 representation.  Any bytes remaining before C<e>, but beyond the ones needed to
1576 form the first code point in C<s>, are not examined.
1577
1578 The code point can be any that will fit in an IV on this machine, using Perl's
1579 extension to official UTF-8 to represent those higher than the Unicode maximum
1580 of 0x10FFFF.  That means that this macro is used to efficiently decide if the
1581 next few bytes in C<s> is legal UTF-8 for a single character.
1582
1583 Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
1584 defined by Unicode to be fully interchangeable across applications;
1585 C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1586 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1587 code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1588
1589 Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
1590 C<L</is_utf8_string_loclen>> to check entire strings.
1591
1592 Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
1593 machines) is a valid UTF-8 character.
1594
1595 =cut
1596
1597 This uses an adaptation of the table and algorithm given in
1598 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1599 documentation of the original version.  A copyright notice for the original
1600 version is given at the beginning of this file.  The Perl adapation is
1601 documented at the definition of PL_extended_utf8_dfa_tab[].
1602 */
1603
1604 PERL_STATIC_INLINE Size_t
1605 Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
1606 {
1607     PERL_ARGS_ASSERT_ISUTF8_CHAR;
1608
1609     PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
1610                           DFA_RETURN_SUCCESS_,
1611                           DFA_TEASE_APART_FF_,
1612                           DFA_RETURN_FAILURE_);
1613
1614     /* Here, we didn't return success, but dropped out of the loop.  In the
1615      * case of PL_extended_utf8_dfa_tab, this means the input is either
1616      * malformed, or the start byte was FF on a platform that the dfa doesn't
1617      * handle FF's.  Call a helper function. */
1618
1619 #ifdef HAS_EXTRA_LONG_UTF8
1620
1621   tease_apart_FF:
1622
1623     /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
1624      * either malformed, or was for the largest possible start byte, which we
1625      * now check, not inline */
1626     if (*s0 != I8_TO_NATIVE_UTF8(0xFF)) {
1627         return 0;
1628     }
1629
1630     return is_utf8_FF_helper_(s0, e,
1631                               FALSE /* require full, not partial char */
1632                              );
1633 #endif
1634
1635 }
1636
1637 /*
1638
1639 =for apidoc isSTRICT_UTF8_CHAR
1640
1641 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1642 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1643 Unicode code point completely acceptable for open interchange between all
1644 applications; otherwise it evaluates to 0.  If non-zero, the value gives how
1645 many bytes starting at C<s> comprise the code point's representation.  Any
1646 bytes remaining before C<e>, but beyond the ones needed to form the first code
1647 point in C<s>, are not examined.
1648
1649 The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
1650 be a surrogate nor a non-character code point.  Thus this excludes any code
1651 point from Perl's extended UTF-8.
1652
1653 This is used to efficiently decide if the next few bytes in C<s> is
1654 legal Unicode-acceptable UTF-8 for a single character.
1655
1656 Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1657 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1658 code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
1659 and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1660
1661 Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
1662 C<L</is_strict_utf8_string_loclen>> to check entire strings.
1663
1664 =cut
1665
1666 This uses an adaptation of the tables and algorithm given in
1667 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1668 documentation of the original version.  A copyright notice for the original
1669 version is given at the beginning of this file.  The Perl adapation is
1670 documented at the definition of strict_extended_utf8_dfa_tab[].
1671
1672 */
1673
1674 PERL_STATIC_INLINE Size_t
1675 Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
1676 {
1677     PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
1678
1679     PERL_IS_UTF8_CHAR_DFA(s0, e, PL_strict_utf8_dfa_tab,
1680                           DFA_RETURN_SUCCESS_,
1681                           goto check_hanguls,
1682                           DFA_RETURN_FAILURE_);
1683   check_hanguls:
1684
1685     /* Here, we didn't return success, but dropped out of the loop.  In the
1686      * case of PL_strict_utf8_dfa_tab, this means the input is either
1687      * malformed, or was for certain Hanguls; handle them specially */
1688
1689     /* The dfa above drops out for incomplete or illegal inputs, and certain
1690      * legal Hanguls; check and return accordingly */
1691     return is_HANGUL_ED_utf8_safe(s0, e);
1692 }
1693
1694 /*
1695
1696 =for apidoc isC9_STRICT_UTF8_CHAR
1697
1698 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1699 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1700 Unicode non-surrogate code point; otherwise it evaluates to 0.  If non-zero,
1701 the value gives how many bytes starting at C<s> comprise the code point's
1702 representation.  Any bytes remaining before C<e>, but beyond the ones needed to
1703 form the first code point in C<s>, are not examined.
1704
1705 The largest acceptable code point is the Unicode maximum 0x10FFFF.  This
1706 differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
1707 code points.  This corresponds to
1708 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1709 which said that non-character code points are merely discouraged rather than
1710 completely forbidden in open interchange.  See
1711 L<perlunicode/Noncharacter code points>.
1712
1713 Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
1714 C<L</isUTF8_CHAR_flags>> for a more customized definition.
1715
1716 Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
1717 C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
1718
1719 =cut
1720
1721 This uses an adaptation of the tables and algorithm given in
1722 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1723 documentation of the original version.  A copyright notice for the original
1724 version is given at the beginning of this file.  The Perl adapation is
1725 documented at the definition of PL_c9_utf8_dfa_tab[].
1726
1727 */
1728
1729 PERL_STATIC_INLINE Size_t
1730 Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
1731 {
1732     PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
1733
1734     PERL_IS_UTF8_CHAR_DFA(s0, e, PL_c9_utf8_dfa_tab,
1735                           DFA_RETURN_SUCCESS_,
1736                           DFA_RETURN_FAILURE_,
1737                           DFA_RETURN_FAILURE_);
1738 }
1739
1740 /*
1741
1742 =for apidoc is_strict_utf8_string_loc
1743
1744 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1745 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1746 "utf8ness success") in the C<ep> pointer.
1747
1748 See also C<L</is_strict_utf8_string_loclen>>.
1749
1750 =cut
1751 */
1752
1753 #define is_strict_utf8_string_loc(s, len, ep)                               \
1754                                 is_strict_utf8_string_loclen(s, len, ep, 0)
1755
1756 /*
1757
1758 =for apidoc is_strict_utf8_string_loclen
1759
1760 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1761 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1762 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1763 encoded characters in the C<el> pointer.
1764
1765 See also C<L</is_strict_utf8_string_loc>>.
1766
1767 =cut
1768 */
1769
1770 PERL_STATIC_INLINE bool
1771 Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1772 {
1773     const U8 * first_variant;
1774
1775     PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
1776
1777     if (len == 0) {
1778         len = strlen((const char *) s);
1779     }
1780
1781     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1782         if (el)
1783             *el = len;
1784
1785         if (ep) {
1786             *ep = s + len;
1787         }
1788
1789         return TRUE;
1790     }
1791
1792     {
1793         const U8* const send = s + len;
1794         const U8* x = first_variant;
1795         STRLEN outlen = first_variant - s;
1796
1797         while (x < send) {
1798             const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
1799             if (UNLIKELY(! cur_len)) {
1800                 break;
1801             }
1802             x += cur_len;
1803             outlen++;
1804         }
1805
1806         if (el)
1807             *el = outlen;
1808
1809         if (ep) {
1810             *ep = x;
1811         }
1812
1813         return (x == send);
1814     }
1815 }
1816
1817 /*
1818
1819 =for apidoc is_c9strict_utf8_string_loc
1820
1821 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1822 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1823 "utf8ness success") in the C<ep> pointer.
1824
1825 See also C<L</is_c9strict_utf8_string_loclen>>.
1826
1827 =cut
1828 */
1829
1830 #define is_c9strict_utf8_string_loc(s, len, ep)                             \
1831                             is_c9strict_utf8_string_loclen(s, len, ep, 0)
1832
1833 /*
1834
1835 =for apidoc is_c9strict_utf8_string_loclen
1836
1837 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1838 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1839 "utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
1840 characters in the C<el> pointer.
1841
1842 See also C<L</is_c9strict_utf8_string_loc>>.
1843
1844 =cut
1845 */
1846
1847 PERL_STATIC_INLINE bool
1848 Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1849 {
1850     const U8 * first_variant;
1851
1852     PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
1853
1854     if (len == 0) {
1855         len = strlen((const char *) s);
1856     }
1857
1858     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1859         if (el)
1860             *el = len;
1861
1862         if (ep) {
1863             *ep = s + len;
1864         }
1865
1866         return TRUE;
1867     }
1868
1869     {
1870         const U8* const send = s + len;
1871         const U8* x = first_variant;
1872         STRLEN outlen = first_variant - s;
1873
1874         while (x < send) {
1875             const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
1876             if (UNLIKELY(! cur_len)) {
1877                 break;
1878             }
1879             x += cur_len;
1880             outlen++;
1881         }
1882
1883         if (el)
1884             *el = outlen;
1885
1886         if (ep) {
1887             *ep = x;
1888         }
1889
1890         return (x == send);
1891     }
1892 }
1893
1894 /*
1895
1896 =for apidoc is_utf8_string_loc_flags
1897
1898 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1899 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1900 "utf8ness success") in the C<ep> pointer.
1901
1902 See also C<L</is_utf8_string_loclen_flags>>.
1903
1904 =cut
1905 */
1906
1907 #define is_utf8_string_loc_flags(s, len, ep, flags)                         \
1908                         is_utf8_string_loclen_flags(s, len, ep, 0, flags)
1909
1910
1911 /* The above 3 actual functions could have been moved into the more general one
1912  * just below, and made #defines that call it with the right 'flags'.  They are
1913  * currently kept separate to increase their chances of getting inlined */
1914
1915 /*
1916
1917 =for apidoc is_utf8_string_loclen_flags
1918
1919 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1920 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1921 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1922 encoded characters in the C<el> pointer.
1923
1924 See also C<L</is_utf8_string_loc_flags>>.
1925
1926 =cut
1927 */
1928
1929 PERL_STATIC_INLINE bool
1930 Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
1931 {
1932     const U8 * first_variant;
1933
1934     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
1935     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1936                           |UTF8_DISALLOW_PERL_EXTENDED)));
1937
1938     if (len == 0) {
1939         len = strlen((const char *) s);
1940     }
1941
1942     if (flags == 0) {
1943         return is_utf8_string_loclen(s, len, ep, el);
1944     }
1945
1946     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1947                                         == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1948     {
1949         return is_strict_utf8_string_loclen(s, len, ep, el);
1950     }
1951
1952     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1953                                     == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1954     {
1955         return is_c9strict_utf8_string_loclen(s, len, ep, el);
1956     }
1957
1958     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1959         if (el)
1960             *el = len;
1961
1962         if (ep) {
1963             *ep = s + len;
1964         }
1965
1966         return TRUE;
1967     }
1968
1969     {
1970         const U8* send = s + len;
1971         const U8* x = first_variant;
1972         STRLEN outlen = first_variant - s;
1973
1974         while (x < send) {
1975             const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1976             if (UNLIKELY(! cur_len)) {
1977                 break;
1978             }
1979             x += cur_len;
1980             outlen++;
1981         }
1982
1983         if (el)
1984             *el = outlen;
1985
1986         if (ep) {
1987             *ep = x;
1988         }
1989
1990         return (x == send);
1991     }
1992 }
1993
1994 /*
1995 =for apidoc utf8_distance
1996
1997 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
1998 and C<b>.
1999
2000 WARNING: use only if you *know* that the pointers point inside the
2001 same UTF-8 buffer.
2002
2003 =cut
2004 */
2005
2006 PERL_STATIC_INLINE IV
2007 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
2008 {
2009     PERL_ARGS_ASSERT_UTF8_DISTANCE;
2010
2011     return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
2012 }
2013
2014 /*
2015 =for apidoc utf8_hop
2016
2017 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
2018 forward or backward.
2019
2020 WARNING: do not use the following unless you *know* C<off> is within
2021 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
2022 on the first byte of character or just after the last byte of a character.
2023
2024 =cut
2025 */
2026
2027 PERL_STATIC_INLINE U8 *
2028 Perl_utf8_hop(const U8 *s, SSize_t off)
2029 {
2030     PERL_ARGS_ASSERT_UTF8_HOP;
2031
2032     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2033      * the bitops (especially ~) can create illegal UTF-8.
2034      * In other words: in Perl UTF-8 is not just for Unicode. */
2035
2036     if (off >= 0) {
2037         while (off--)
2038             s += UTF8SKIP(s);
2039     }
2040     else {
2041         while (off++) {
2042             s--;
2043             while (UTF8_IS_CONTINUATION(*s))
2044                 s--;
2045         }
2046     }
2047     GCC_DIAG_IGNORE(-Wcast-qual)
2048     return (U8 *)s;
2049     GCC_DIAG_RESTORE
2050 }
2051
2052 /*
2053 =for apidoc utf8_hop_forward
2054
2055 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2056 forward.
2057
2058 C<off> must be non-negative.
2059
2060 C<s> must be before or equal to C<end>.
2061
2062 When moving forward it will not move beyond C<end>.
2063
2064 Will not exceed this limit even if the string is not valid "UTF-8".
2065
2066 =cut
2067 */
2068
2069 PERL_STATIC_INLINE U8 *
2070 Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
2071 {
2072     PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
2073
2074     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2075      * the bitops (especially ~) can create illegal UTF-8.
2076      * In other words: in Perl UTF-8 is not just for Unicode. */
2077
2078     assert(s <= end);
2079     assert(off >= 0);
2080
2081     while (off--) {
2082         STRLEN skip = UTF8SKIP(s);
2083         if ((STRLEN)(end - s) <= skip) {
2084             GCC_DIAG_IGNORE(-Wcast-qual)
2085             return (U8 *)end;
2086             GCC_DIAG_RESTORE
2087         }
2088         s += skip;
2089     }
2090
2091     GCC_DIAG_IGNORE(-Wcast-qual)
2092     return (U8 *)s;
2093     GCC_DIAG_RESTORE
2094 }
2095
2096 /*
2097 =for apidoc utf8_hop_back
2098
2099 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2100 backward.
2101
2102 C<off> must be non-positive.
2103
2104 C<s> must be after or equal to C<start>.
2105
2106 When moving backward it will not move before C<start>.
2107
2108 Will not exceed this limit even if the string is not valid "UTF-8".
2109
2110 =cut
2111 */
2112
2113 PERL_STATIC_INLINE U8 *
2114 Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
2115 {
2116     PERL_ARGS_ASSERT_UTF8_HOP_BACK;
2117
2118     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2119      * the bitops (especially ~) can create illegal UTF-8.
2120      * In other words: in Perl UTF-8 is not just for Unicode. */
2121
2122     assert(start <= s);
2123     assert(off <= 0);
2124
2125     while (off++ && s > start) {
2126         do {
2127             s--;
2128         } while (UTF8_IS_CONTINUATION(*s) && s > start);
2129     }
2130
2131     GCC_DIAG_IGNORE(-Wcast-qual)
2132     return (U8 *)s;
2133     GCC_DIAG_RESTORE
2134 }
2135
2136 /*
2137 =for apidoc utf8_hop_safe
2138
2139 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2140 either forward or backward.
2141
2142 When moving backward it will not move before C<start>.
2143
2144 When moving forward it will not move beyond C<end>.
2145
2146 Will not exceed those limits even if the string is not valid "UTF-8".
2147
2148 =cut
2149 */
2150
2151 PERL_STATIC_INLINE U8 *
2152 Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
2153 {
2154     PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
2155
2156     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2157      * the bitops (especially ~) can create illegal UTF-8.
2158      * In other words: in Perl UTF-8 is not just for Unicode. */
2159
2160     assert(start <= s && s <= end);
2161
2162     if (off >= 0) {
2163         return utf8_hop_forward(s, off, end);
2164     }
2165     else {
2166         return utf8_hop_back(s, off, start);
2167     }
2168 }
2169
2170 /*
2171
2172 =for apidoc is_utf8_valid_partial_char
2173
2174 Returns 0 if the sequence of bytes starting at C<s> and looking no further than
2175 S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
2176 points.  Otherwise, it returns 1 if there exists at least one non-empty
2177 sequence of bytes that when appended to sequence C<s>, starting at position
2178 C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
2179 otherwise returns 0.
2180
2181 In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
2182 point.
2183
2184 This is useful when a fixed-length buffer is being tested for being well-formed
2185 UTF-8, but the final few bytes in it don't comprise a full character; that is,
2186 it is split somewhere in the middle of the final code point's UTF-8
2187 representation.  (Presumably when the buffer is refreshed with the next chunk
2188 of data, the new first bytes will complete the partial code point.)   This
2189 function is used to verify that the final bytes in the current buffer are in
2190 fact the legal beginning of some code point, so that if they aren't, the
2191 failure can be signalled without having to wait for the next read.
2192
2193 =cut
2194 */
2195 #define is_utf8_valid_partial_char(s, e)                                    \
2196                                 is_utf8_valid_partial_char_flags(s, e, 0)
2197
2198 /*
2199
2200 =for apidoc is_utf8_valid_partial_char_flags
2201
2202 Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
2203 or not the input is a valid UTF-8 encoded partial character, but it takes an
2204 extra parameter, C<flags>, which can further restrict which code points are
2205 considered valid.
2206
2207 If C<flags> is 0, this behaves identically to
2208 C<L</is_utf8_valid_partial_char>>.  Otherwise C<flags> can be any combination
2209 of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>.  If
2210 there is any sequence of bytes that can complete the input partial character in
2211 such a way that a non-prohibited character is formed, the function returns
2212 TRUE; otherwise FALSE.  Non character code points cannot be determined based on
2213 partial character input.  But many  of the other possible excluded types can be
2214 determined from just the first one or two bytes.
2215
2216 =cut
2217  */
2218
2219 PERL_STATIC_INLINE bool
2220 Perl_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
2221 {
2222     PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
2223
2224     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
2225                           |UTF8_DISALLOW_PERL_EXTENDED)));
2226
2227     if (s >= e || s + UTF8SKIP(s) <= e) {
2228         return FALSE;
2229     }
2230
2231     return cBOOL(is_utf8_char_helper(s, e, flags));
2232 }
2233
2234 /*
2235
2236 =for apidoc is_utf8_fixed_width_buf_flags
2237
2238 Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
2239 is entirely valid UTF-8, subject to the restrictions given by C<flags>;
2240 otherwise it returns FALSE.
2241
2242 If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
2243 without restriction.  If the final few bytes of the buffer do not form a
2244 complete code point, this will return TRUE anyway, provided that
2245 C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
2246
2247 If C<flags> in non-zero, it can be any combination of the
2248 C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
2249 same meanings.
2250
2251 This function differs from C<L</is_utf8_string_flags>> only in that the latter
2252 returns FALSE if the final few bytes of the string don't form a complete code
2253 point.
2254
2255 =cut
2256  */
2257 #define is_utf8_fixed_width_buf_flags(s, len, flags)                        \
2258                 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
2259
2260 /*
2261
2262 =for apidoc is_utf8_fixed_width_buf_loc_flags
2263
2264 Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
2265 failure in the C<ep> pointer.  If the function returns TRUE, C<*ep> will point
2266 to the beginning of any partial character at the end of the buffer; if there is
2267 no partial character C<*ep> will contain C<s>+C<len>.
2268
2269 See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
2270
2271 =cut
2272 */
2273
2274 #define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags)               \
2275                 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
2276
2277 /*
2278
2279 =for apidoc is_utf8_fixed_width_buf_loclen_flags
2280
2281 Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
2282 complete, valid characters found in the C<el> pointer.
2283
2284 =cut
2285 */
2286
2287 PERL_STATIC_INLINE bool
2288 Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
2289                                        STRLEN len,
2290                                        const U8 **ep,
2291                                        STRLEN *el,
2292                                        const U32 flags)
2293 {
2294     const U8 * maybe_partial;
2295
2296     PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
2297
2298     if (! ep) {
2299         ep  = &maybe_partial;
2300     }
2301
2302     /* If it's entirely valid, return that; otherwise see if the only error is
2303      * that the final few bytes are for a partial character */
2304     return    is_utf8_string_loclen_flags(s, len, ep, el, flags)
2305            || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
2306 }
2307
2308 PERL_STATIC_INLINE UV
2309 Perl_utf8n_to_uvchr_msgs(const U8 *s,
2310                       STRLEN curlen,
2311                       STRLEN *retlen,
2312                       const U32 flags,
2313                       U32 * errors,
2314                       AV ** msgs)
2315 {
2316     /* This is the inlined portion of utf8n_to_uvchr_msgs.  It handles the
2317      * simple cases, and, if necessary calls a helper function to deal with the
2318      * more complex ones.  Almost all well-formed non-problematic code points
2319      * are considered simple, so that it's unlikely that the helper function
2320      * will need to be called.
2321      *
2322      * This is an adaptation of the tables and algorithm given in
2323      * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
2324      * comprehensive documentation of the original version.  A copyright notice
2325      * for the original version is given at the beginning of this file.  The
2326      * Perl adapation is documented at the definition of PL_strict_utf8_dfa_tab[].
2327      */
2328
2329     const U8 * const s0 = s;
2330     const U8 * send = s0 + curlen;
2331     UV uv = 0;      /* The 0 silences some stupid compilers */
2332     UV state = 0;
2333
2334     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
2335
2336     /* This dfa is fast.  If it accepts the input, it was for a well-formed,
2337      * non-problematic code point, which can be returned immediately.
2338      * Otherwise we call a helper function to figure out the more complicated
2339      * cases. */
2340
2341     while (s < send && LIKELY(state != 1)) {
2342         UV type = PL_strict_utf8_dfa_tab[*s];
2343
2344         uv = (state == 0)
2345              ?  ((0xff >> type) & NATIVE_UTF8_TO_I8(*s))
2346              : UTF8_ACCUMULATE(uv, *s);
2347         state = PL_strict_utf8_dfa_tab[256 + state + type];
2348
2349         if (state != 0) {
2350             s++;
2351             continue;
2352         }
2353
2354         if (retlen) {
2355             *retlen = s - s0 + 1;
2356         }
2357         if (errors) {
2358             *errors = 0;
2359         }
2360         if (msgs) {
2361             *msgs = NULL;
2362         }
2363
2364         return UNI_TO_NATIVE(uv);
2365     }
2366
2367     /* Here is potentially problematic.  Use the full mechanism */
2368     return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags, errors, msgs);
2369 }
2370
2371 PERL_STATIC_INLINE UV
2372 Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
2373 {
2374     PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER;
2375
2376     assert(s < send);
2377
2378     if (! ckWARN_d(WARN_UTF8)) {
2379
2380         /* EMPTY is not really allowed, and asserts on debugging builds.  But
2381          * on non-debugging we have to deal with it, and this causes it to
2382          * return the REPLACEMENT CHARACTER, as the documentation indicates */
2383         return utf8n_to_uvchr(s, send - s, retlen,
2384                               (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY));
2385     }
2386     else {
2387         UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
2388         if (retlen && ret == 0 && *s != '\0') {
2389             *retlen = (STRLEN) -1;
2390         }
2391
2392         return ret;
2393     }
2394 }
2395
2396 /* ------------------------------- perl.h ----------------------------- */
2397
2398 /*
2399 =for apidoc_section $utility
2400
2401 =for apidoc is_safe_syscall
2402
2403 Test that the given C<pv> (with length C<len>) doesn't contain any internal
2404 C<NUL> characters.
2405 If it does, set C<errno> to C<ENOENT>, optionally warn using the C<syscalls>
2406 category, and return FALSE.
2407
2408 Return TRUE if the name is safe.
2409
2410 C<what> and C<op_name> are used in any warning.
2411
2412 Used by the C<IS_SAFE_SYSCALL()> macro.
2413
2414 =cut
2415 */
2416
2417 PERL_STATIC_INLINE bool
2418 Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name)
2419 {
2420     /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
2421      * perl itself uses xce*() functions which accept 8-bit strings.
2422      */
2423
2424     PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
2425
2426     if (len > 1) {
2427         char *null_at;
2428         if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
2429                 SETERRNO(ENOENT, LIB_INVARG);
2430                 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
2431                                    "Invalid \\0 character in %s for %s: %s\\0%s",
2432                                    what, op_name, pv, null_at+1);
2433                 return FALSE;
2434         }
2435     }
2436
2437     return TRUE;
2438 }
2439
2440 /*
2441
2442 Return true if the supplied filename has a newline character
2443 immediately before the first (hopefully only) NUL.
2444
2445 My original look at this incorrectly used the len from SvPV(), but
2446 that's incorrect, since we allow for a NUL in pv[len-1].
2447
2448 So instead, strlen() and work from there.
2449
2450 This allow for the user reading a filename, forgetting to chomp it,
2451 then calling:
2452
2453   open my $foo, "$file\0";
2454
2455 */
2456
2457 #ifdef PERL_CORE
2458
2459 PERL_STATIC_INLINE bool
2460 S_should_warn_nl(const char *pv)
2461 {
2462     STRLEN len;
2463
2464     PERL_ARGS_ASSERT_SHOULD_WARN_NL;
2465
2466     len = strlen(pv);
2467
2468     return len > 0 && pv[len-1] == '\n';
2469 }
2470
2471 #endif
2472
2473 #if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C)
2474
2475 PERL_STATIC_INLINE bool
2476 S_lossless_NV_to_IV(const NV nv, IV *ivp)
2477 {
2478     /* This function determines if the input NV 'nv' may be converted without
2479      * loss of data to an IV.  If not, it returns FALSE taking no other action.
2480      * But if it is possible, it does the conversion, returning TRUE, and
2481      * storing the converted result in '*ivp' */
2482
2483     PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV;
2484
2485 #  if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan)
2486     /* Normally any comparison with a NaN returns false; if we can't rely
2487      * on that behaviour, check explicitly */
2488     if (UNLIKELY(Perl_isnan(nv))) {
2489         return FALSE;
2490     }
2491 #  endif
2492
2493     /* Written this way so that with an always-false NaN comparison we
2494      * return false */
2495     if (!(LIKELY(nv >= (NV) IV_MIN) && LIKELY(nv < IV_MAX_P1))) {
2496         return FALSE;
2497     }
2498
2499     if ((IV) nv != nv) {
2500         return FALSE;
2501     }
2502
2503     *ivp = (IV) nv;
2504     return TRUE;
2505 }
2506
2507 #endif
2508
2509 /* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
2510
2511 #if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C)
2512
2513 #define MAX_CHARSET_NAME_LENGTH 2
2514
2515 PERL_STATIC_INLINE const char *
2516 S_get_regex_charset_name(const U32 flags, STRLEN* const lenp)
2517 {
2518     PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME;
2519
2520     /* Returns a string that corresponds to the name of the regex character set
2521      * given by 'flags', and *lenp is set the length of that string, which
2522      * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
2523
2524     *lenp = 1;
2525     switch (get_regex_charset(flags)) {
2526         case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
2527         case REGEX_LOCALE_CHARSET:  return LOCALE_PAT_MODS;
2528         case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
2529         case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
2530         case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
2531             *lenp = 2;
2532             return ASCII_MORE_RESTRICT_PAT_MODS;
2533     }
2534     /* The NOT_REACHED; hides an assert() which has a rather complex
2535      * definition in perl.h. */
2536     NOT_REACHED; /* NOTREACHED */
2537     return "?";     /* Unknown */
2538 }
2539
2540 #endif
2541
2542 /*
2543
2544 Return false if any get magic is on the SV other than taint magic.
2545
2546 */
2547
2548 PERL_STATIC_INLINE bool
2549 Perl_sv_only_taint_gmagic(SV *sv)
2550 {
2551     MAGIC *mg = SvMAGIC(sv);
2552
2553     PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
2554
2555     while (mg) {
2556         if (mg->mg_type != PERL_MAGIC_taint
2557             && !(mg->mg_flags & MGf_GSKIP)
2558             && mg->mg_virtual->svt_get) {
2559             return FALSE;
2560         }
2561         mg = mg->mg_moremagic;
2562     }
2563
2564     return TRUE;
2565 }
2566
2567 /* ------------------ cop.h ------------------------------------------- */
2568
2569 /* implement GIMME_V() macro */
2570
2571 PERL_STATIC_INLINE U8
2572 Perl_gimme_V(pTHX)
2573 {
2574     I32 cxix;
2575     U8  gimme = (PL_op->op_flags & OPf_WANT);
2576
2577     if (gimme)
2578         return gimme;
2579     cxix = PL_curstackinfo->si_cxsubix;
2580     if (cxix < 0)
2581         return PL_curstackinfo->si_type == PERLSI_SORT ? G_SCALAR: G_VOID;
2582     assert(cxstack[cxix].blk_gimme & G_WANT);
2583     return (cxstack[cxix].blk_gimme & G_WANT);
2584 }
2585
2586
2587 /* Enter a block. Push a new base context and return its address. */
2588
2589 PERL_STATIC_INLINE PERL_CONTEXT *
2590 Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
2591 {
2592     PERL_CONTEXT * cx;
2593
2594     PERL_ARGS_ASSERT_CX_PUSHBLOCK;
2595
2596     CXINC;
2597     cx = CX_CUR();
2598     cx->cx_type        = type;
2599     cx->blk_gimme      = gimme;
2600     cx->blk_oldsaveix  = saveix;
2601     cx->blk_oldsp      = (I32)(sp - PL_stack_base);
2602     cx->blk_oldcop     = PL_curcop;
2603     cx->blk_oldmarksp  = (I32)(PL_markstack_ptr - PL_markstack);
2604     cx->blk_oldscopesp = PL_scopestack_ix;
2605     cx->blk_oldpm      = PL_curpm;
2606     cx->blk_old_tmpsfloor = PL_tmps_floor;
2607
2608     PL_tmps_floor        = PL_tmps_ix;
2609     CX_DEBUG(cx, "PUSH");
2610     return cx;
2611 }
2612
2613
2614 /* Exit a block (RETURN and LAST). */
2615
2616 PERL_STATIC_INLINE void
2617 Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx)
2618 {
2619     PERL_ARGS_ASSERT_CX_POPBLOCK;
2620
2621     CX_DEBUG(cx, "POP");
2622     /* these 3 are common to cx_popblock and cx_topblock */
2623     PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2624     PL_scopestack_ix = cx->blk_oldscopesp;
2625     PL_curpm         = cx->blk_oldpm;
2626
2627     /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
2628      * and leaves a CX entry lying around for repeated use, so
2629      * skip for multicall */                  \
2630     assert(   (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
2631             || PL_savestack_ix == cx->blk_oldsaveix);
2632     PL_curcop     = cx->blk_oldcop;
2633     PL_tmps_floor = cx->blk_old_tmpsfloor;
2634 }
2635
2636 /* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
2637  * Whereas cx_popblock() restores the state to the point just before
2638  * cx_pushblock() was called,  cx_topblock() restores it to the point just
2639  * *after* cx_pushblock() was called. */
2640
2641 PERL_STATIC_INLINE void
2642 Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx)
2643 {
2644     PERL_ARGS_ASSERT_CX_TOPBLOCK;
2645
2646     CX_DEBUG(cx, "TOP");
2647     /* these 3 are common to cx_popblock and cx_topblock */
2648     PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2649     PL_scopestack_ix = cx->blk_oldscopesp;
2650     PL_curpm         = cx->blk_oldpm;
2651
2652     PL_stack_sp      = PL_stack_base + cx->blk_oldsp;
2653 }
2654
2655
2656 PERL_STATIC_INLINE void
2657 Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
2658 {
2659     U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
2660
2661     PERL_ARGS_ASSERT_CX_PUSHSUB;
2662
2663     PERL_DTRACE_PROBE_ENTRY(cv);
2664     cx->blk_sub.old_cxsubix     = PL_curstackinfo->si_cxsubix;
2665     PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
2666     cx->blk_sub.cv = cv;
2667     cx->blk_sub.olddepth = CvDEPTH(cv);
2668     cx->blk_sub.prevcomppad = PL_comppad;
2669     cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
2670     cx->blk_sub.retop = retop;
2671     SvREFCNT_inc_simple_void_NN(cv);
2672     cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
2673 }
2674
2675
2676 /* subsets of cx_popsub() */
2677
2678 PERL_STATIC_INLINE void
2679 Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
2680 {
2681     CV *cv;
2682
2683     PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
2684     assert(CxTYPE(cx) == CXt_SUB);
2685
2686     PL_comppad = cx->blk_sub.prevcomppad;
2687     PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2688     cv = cx->blk_sub.cv;
2689     CvDEPTH(cv) = cx->blk_sub.olddepth;
2690     cx->blk_sub.cv = NULL;
2691     SvREFCNT_dec(cv);
2692     PL_curstackinfo->si_cxsubix = cx->blk_sub.old_cxsubix;
2693 }
2694
2695
2696 /* handle the @_ part of leaving a sub */
2697
2698 PERL_STATIC_INLINE void
2699 Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
2700 {
2701     AV *av;
2702
2703     PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
2704     assert(CxTYPE(cx) == CXt_SUB);
2705     assert(AvARRAY(MUTABLE_AV(
2706         PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
2707                 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
2708
2709     CX_POP_SAVEARRAY(cx);
2710     av = MUTABLE_AV(PAD_SVl(0));
2711     if (UNLIKELY(AvREAL(av)))
2712         /* abandon @_ if it got reified */
2713         clear_defarray(av, 0);
2714     else {
2715         CLEAR_ARGARRAY(av);
2716     }
2717 }
2718
2719
2720 PERL_STATIC_INLINE void
2721 Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx)
2722 {
2723     PERL_ARGS_ASSERT_CX_POPSUB;
2724     assert(CxTYPE(cx) == CXt_SUB);
2725
2726     PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
2727
2728     if (CxHASARGS(cx))
2729         cx_popsub_args(cx);
2730     cx_popsub_common(cx);
2731 }
2732
2733
2734 PERL_STATIC_INLINE void
2735 Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
2736 {
2737     PERL_ARGS_ASSERT_CX_PUSHFORMAT;
2738
2739     cx->blk_format.old_cxsubix = PL_curstackinfo->si_cxsubix;
2740     PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
2741     cx->blk_format.cv          = cv;
2742     cx->blk_format.retop       = retop;
2743     cx->blk_format.gv          = gv;
2744     cx->blk_format.dfoutgv     = PL_defoutgv;
2745     cx->blk_format.prevcomppad = PL_comppad;
2746     cx->blk_u16                = 0;
2747
2748     SvREFCNT_inc_simple_void_NN(cv);
2749     CvDEPTH(cv)++;
2750     SvREFCNT_inc_void(cx->blk_format.dfoutgv);
2751 }
2752
2753
2754 PERL_STATIC_INLINE void
2755 Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx)
2756 {
2757     CV *cv;
2758     GV *dfout;
2759
2760     PERL_ARGS_ASSERT_CX_POPFORMAT;
2761     assert(CxTYPE(cx) == CXt_FORMAT);
2762
2763     dfout = cx->blk_format.dfoutgv;
2764     setdefout(dfout);
2765     cx->blk_format.dfoutgv = NULL;
2766     SvREFCNT_dec_NN(dfout);
2767
2768     PL_comppad = cx->blk_format.prevcomppad;
2769     PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2770     cv = cx->blk_format.cv;
2771     cx->blk_format.cv = NULL;
2772     --CvDEPTH(cv);
2773     SvREFCNT_dec_NN(cv);
2774     PL_curstackinfo->si_cxsubix = cx->blk_format.old_cxsubix;
2775 }
2776
2777
2778 PERL_STATIC_INLINE void
2779 Perl_push_evalortry_common(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
2780 {
2781     cx->blk_eval.retop         = retop;
2782     cx->blk_eval.old_namesv    = namesv;
2783     cx->blk_eval.old_eval_root = PL_eval_root;
2784     cx->blk_eval.cur_text      = PL_parser ? PL_parser->linestr : NULL;
2785     cx->blk_eval.cv            = NULL; /* later set by doeval_compile() */
2786     cx->blk_eval.cur_top_env   = PL_top_env;
2787
2788     assert(!(PL_in_eval     & ~ 0x3F));
2789     assert(!(PL_op->op_type & ~0x1FF));
2790     cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
2791 }
2792
2793 PERL_STATIC_INLINE void
2794 Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
2795 {
2796     PERL_ARGS_ASSERT_CX_PUSHEVAL;
2797
2798     Perl_push_evalortry_common(aTHX_ cx, retop, namesv);
2799
2800     cx->blk_eval.old_cxsubix    = PL_curstackinfo->si_cxsubix;
2801     PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
2802 }
2803
2804 PERL_STATIC_INLINE void
2805 Perl_cx_pushtry(pTHX_ PERL_CONTEXT *cx, OP *retop)
2806 {
2807     PERL_ARGS_ASSERT_CX_PUSHTRY;
2808
2809     Perl_push_evalortry_common(aTHX_ cx, retop, NULL);
2810
2811     /* Don't actually change it, just store the current value so it's restored
2812      * by the common popeval */
2813     cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
2814 }
2815
2816
2817 PERL_STATIC_INLINE void
2818 Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx)
2819 {
2820     SV *sv;
2821
2822     PERL_ARGS_ASSERT_CX_POPEVAL;
2823     assert(CxTYPE(cx) == CXt_EVAL);
2824
2825     PL_in_eval = CxOLD_IN_EVAL(cx);
2826     assert(!(PL_in_eval & 0xc0));
2827     PL_eval_root = cx->blk_eval.old_eval_root;
2828     sv = cx->blk_eval.cur_text;
2829     if (sv && CxEVAL_TXT_REFCNTED(cx)) {
2830         cx->blk_eval.cur_text = NULL;
2831         SvREFCNT_dec_NN(sv);
2832     }
2833
2834     sv = cx->blk_eval.old_namesv;
2835     if (sv) {
2836         cx->blk_eval.old_namesv = NULL;
2837         SvREFCNT_dec_NN(sv);
2838     }
2839     PL_curstackinfo->si_cxsubix = cx->blk_eval.old_cxsubix;
2840 }
2841
2842
2843 /* push a plain loop, i.e.
2844  *     { block }
2845  *     while (cond) { block }
2846  *     for (init;cond;continue) { block }
2847  * This loop can be last/redo'ed etc.
2848  */
2849
2850 PERL_STATIC_INLINE void
2851 Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
2852 {
2853     PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
2854     cx->blk_loop.my_op = cLOOP;
2855 }
2856
2857
2858 /* push a true for loop, i.e.
2859  *     for var (list) { block }
2860  */
2861
2862 PERL_STATIC_INLINE void
2863 Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
2864 {
2865     PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
2866
2867     /* this one line is common with cx_pushloop_plain */
2868     cx->blk_loop.my_op = cLOOP;
2869
2870     cx->blk_loop.itervar_u.svp = (SV**)itervarp;
2871     cx->blk_loop.itersave      = itersave;
2872 #ifdef USE_ITHREADS
2873     cx->blk_loop.oldcomppad = PL_comppad;
2874 #endif
2875 }
2876
2877
2878 /* pop all loop types, including plain */
2879
2880 PERL_STATIC_INLINE void
2881 Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx)
2882 {
2883     PERL_ARGS_ASSERT_CX_POPLOOP;
2884
2885     assert(CxTYPE_is_LOOP(cx));
2886     if (  CxTYPE(cx) == CXt_LOOP_ARY
2887        || CxTYPE(cx) == CXt_LOOP_LAZYSV)
2888     {
2889         /* Free ary or cur. This assumes that state_u.ary.ary
2890          * aligns with state_u.lazysv.cur. See cx_dup() */
2891         SV *sv = cx->blk_loop.state_u.lazysv.cur;
2892         cx->blk_loop.state_u.lazysv.cur = NULL;
2893         SvREFCNT_dec_NN(sv);
2894         if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
2895             sv = cx->blk_loop.state_u.lazysv.end;
2896             cx->blk_loop.state_u.lazysv.end = NULL;
2897             SvREFCNT_dec_NN(sv);
2898         }
2899     }
2900     if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
2901         SV *cursv;
2902         SV **svp = (cx)->blk_loop.itervar_u.svp;
2903         if ((cx->cx_type & CXp_FOR_GV))
2904             svp = &GvSV((GV*)svp);
2905         cursv = *svp;
2906         *svp = cx->blk_loop.itersave;
2907         cx->blk_loop.itersave = NULL;
2908         SvREFCNT_dec(cursv);
2909     }
2910 }
2911
2912
2913 PERL_STATIC_INLINE void
2914 Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
2915 {
2916     PERL_ARGS_ASSERT_CX_PUSHWHEN;
2917
2918     cx->blk_givwhen.leave_op = cLOGOP->op_other;
2919 }
2920
2921
2922 PERL_STATIC_INLINE void
2923 Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
2924 {
2925     PERL_ARGS_ASSERT_CX_POPWHEN;
2926     assert(CxTYPE(cx) == CXt_WHEN);
2927
2928     PERL_UNUSED_ARG(cx);
2929     PERL_UNUSED_CONTEXT;
2930     /* currently NOOP */
2931 }
2932
2933
2934 PERL_STATIC_INLINE void
2935 Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
2936 {
2937     PERL_ARGS_ASSERT_CX_PUSHGIVEN;
2938
2939     cx->blk_givwhen.leave_op = cLOGOP->op_other;
2940     cx->blk_givwhen.defsv_save = orig_defsv;
2941 }
2942
2943
2944 PERL_STATIC_INLINE void
2945 Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
2946 {
2947     SV *sv;
2948
2949     PERL_ARGS_ASSERT_CX_POPGIVEN;
2950     assert(CxTYPE(cx) == CXt_GIVEN);
2951
2952     sv = GvSV(PL_defgv);
2953     GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
2954     cx->blk_givwhen.defsv_save = NULL;
2955     SvREFCNT_dec(sv);
2956 }
2957
2958 /* ------------------ util.h ------------------------------------------- */
2959
2960 /*
2961 =for apidoc_section $string
2962
2963 =for apidoc foldEQ
2964
2965 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
2966 same
2967 case-insensitively; false otherwise.  Uppercase and lowercase ASCII range bytes
2968 match themselves and their opposite case counterparts.  Non-cased and non-ASCII
2969 range bytes match only themselves.
2970
2971 =cut
2972 */
2973
2974 PERL_STATIC_INLINE I32
2975 Perl_foldEQ(const char *s1, const char *s2, I32 len)
2976 {
2977     const U8 *a = (const U8 *)s1;
2978     const U8 *b = (const U8 *)s2;
2979
2980     PERL_ARGS_ASSERT_FOLDEQ;
2981
2982     assert(len >= 0);
2983
2984     while (len--) {
2985         if (*a != *b && *a != PL_fold[*b])
2986             return 0;
2987         a++,b++;
2988     }
2989     return 1;
2990 }
2991
2992 PERL_STATIC_INLINE I32
2993 Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len)
2994 {
2995     /* Compare non-UTF-8 using Unicode (Latin1) semantics.  Works on all folds
2996      * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and
2997      * does not check for this.  Nor does it check that the strings each have
2998      * at least 'len' characters. */
2999
3000     const U8 *a = (const U8 *)s1;
3001     const U8 *b = (const U8 *)s2;
3002
3003     PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
3004
3005     assert(len >= 0);
3006
3007     while (len--) {
3008         if (*a != *b && *a != PL_fold_latin1[*b]) {
3009             return 0;
3010         }
3011         a++, b++;
3012     }
3013     return 1;
3014 }
3015
3016 /*
3017 =for apidoc_section $locale
3018 =for apidoc foldEQ_locale
3019
3020 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
3021 same case-insensitively in the current locale; false otherwise.
3022
3023 =cut
3024 */
3025
3026 PERL_STATIC_INLINE I32
3027 Perl_foldEQ_locale(const char *s1, const char *s2, I32 len)
3028 {
3029     const U8 *a = (const U8 *)s1;
3030     const U8 *b = (const U8 *)s2;
3031
3032     PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
3033
3034     assert(len >= 0);
3035
3036     while (len--) {
3037         if (*a != *b && *a != PL_fold_locale[*b])
3038             return 0;
3039         a++,b++;
3040     }
3041     return 1;
3042 }
3043
3044 /*
3045 =for apidoc_section $string
3046 =for apidoc my_strnlen
3047
3048 The C library C<strnlen> if available, or a Perl implementation of it.
3049
3050 C<my_strnlen()> computes the length of the string, up to C<maxlen>
3051 characters.  It will never attempt to address more than C<maxlen>
3052 characters, making it suitable for use with strings that are not
3053 guaranteed to be NUL-terminated.
3054
3055 =cut
3056
3057 Description stolen from http://man.openbsd.org/strnlen.3,
3058 implementation stolen from PostgreSQL.
3059 */
3060 #ifndef HAS_STRNLEN
3061
3062 PERL_STATIC_INLINE Size_t
3063 Perl_my_strnlen(const char *str, Size_t maxlen)
3064 {
3065     const char *end = (char *) memchr(str, '\0', maxlen);
3066
3067     PERL_ARGS_ASSERT_MY_STRNLEN;
3068
3069     if (end == NULL) return maxlen;
3070     return end - str;
3071 }
3072
3073 #endif
3074
3075 #if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
3076
3077 PERL_STATIC_INLINE void *
3078 S_my_memrchr(const char * s, const char c, const STRLEN len)
3079 {
3080     /* memrchr(), since many platforms lack it */
3081
3082     const char * t = s + len - 1;
3083
3084     PERL_ARGS_ASSERT_MY_MEMRCHR;
3085
3086     while (t >= s) {
3087         if (*t == c) {
3088             return (void *) t;
3089         }
3090         t--;
3091     }
3092
3093     return NULL;
3094 }
3095
3096 #endif
3097
3098 PERL_STATIC_INLINE char *
3099 Perl_mortal_getenv(const char * str)
3100 {
3101     /* This implements a (mostly) thread-safe, sequential-call-safe getenv().
3102      *
3103      * It's (mostly) thread-safe because it uses a mutex to prevent other
3104      * threads (that look at this mutex) from destroying the result before this
3105      * routine has a chance to copy the result to a place that won't be
3106      * destroyed before the caller gets a chance to handle it.  That place is a
3107      * mortal SV.  khw chose this over SAVEFREEPV because he is under the
3108      * impression that the SV will hang around longer under more circumstances
3109      *
3110      * The reason it isn't completely thread-safe is that other code could
3111      * simply not pay attention to the mutex.  All of the Perl core uses the
3112      * mutex, but it is possible for code from, say XS, to not use this mutex,
3113      * defeating the safety.
3114      *
3115      * getenv() returns, in some implementations, a pointer to a spot in the
3116      * **environ array, which could be invalidated at any time by this or
3117      * another thread changing the environment.  Other implementations copy the
3118      * **environ value to a static buffer, returning a pointer to that.  That
3119      * buffer might or might not be invalidated by a getenv() call in another
3120      * thread.  If it does get zapped, we need an exclusive lock.  Otherwise,
3121      * many getenv() calls can safely be running simultaneously, so a
3122      * many-reader (but no simultaneous writers) lock is ok.  There is a
3123      * Configure probe to see if another thread destroys the buffer, and the
3124      * mutex is defined accordingly.
3125      *
3126      * But in all cases, using the mutex prevents these problems, as long as
3127      * all code uses the same mutex..
3128      *
3129      * A complication is that this can be called during phases where the
3130      * mortalization process isn't available.  These are in interpreter
3131      * destruction or early in construction.  khw believes that at these times
3132      * there shouldn't be anything else going on, so plain getenv is safe AS
3133      * LONG AS the caller acts on the return before calling it again. */
3134
3135     char * ret;
3136     dTHX;
3137
3138     PERL_ARGS_ASSERT_MORTAL_GETENV;
3139
3140     /* Can't mortalize without stacks.  khw believes that no other threads
3141      * should be running, so no need to lock things, and this may be during a
3142      * phase when locking isn't even available */
3143     if (UNLIKELY(PL_scopestack_ix == 0)) {
3144         return getenv(str);
3145     }
3146
3147 #ifdef PERL_MEM_LOG
3148
3149     /* A major complication arises under PERL_MEM_LOG.  When that is active,
3150      * every memory allocation may result in logging, depending on the value of
3151      * ENV{PERL_MEM_LOG} at the moment.  That means, as we create the SV for
3152      * saving ENV{foo}'s value (but before saving it), the logging code will
3153      * call us recursively to find out what ENV{PERL_MEM_LOG} is.  Without some
3154      * care that could lead to: 1) infinite recursion; or 2) deadlock (trying to
3155      * lock a boolean mutex recursively); 3) destroying the getenv() static
3156      * buffer; or 4) destroying the temporary created by this for the copy
3157      * causes a log entry to be made which could cause a new temporary to be
3158      * created, which will need to be destroyed at some point, leading to an
3159      * infinite loop.
3160      *
3161      * The solution adopted here (after some gnashing of teeth) is to detect
3162      * the recursive calls and calls from the logger, and treat them specially.
3163      * Let's say we want to do getenv("foo").  We first find
3164      * getenv(PERL_MEM_LOG) and save it to a fixed-length per-interpreter
3165      * variable, so no temporary is required.  Then we do getenv(foo}, and in
3166      * the process of creating a temporary to save it, this function will be
3167      * called recursively to do a getenv(PERL_MEM_LOG).  On the recursed call,
3168      * we detect that it is such a call and return our saved value instead of
3169      * locking and doing a new getenv().  This solves all of problems 1), 2),
3170      * and 3).  Because all the getenv()s are done while the mutex is locked,
3171      * the state cannot have changed.  To solve 4), we don't create a temporary
3172      * when this is called from the logging code.  That code disposes of the
3173      * return value while the mutex is still locked.
3174      *
3175      * The value of getenv(PERL_MEM_LOG) can be anything, but only initial
3176      * digits and 3 particular letters are significant; the rest are ignored by
3177      * the memory logging code.  Thus the per-interpreter variable only needs
3178      * to be large enough to save the significant information, the size of
3179      * which is known at compile time.  The first byte is extra, reserved for
3180      * flags for our use.  To protect against overflowing, only the reserved
3181      * byte, as many digits as don't overflow, and the three letters are
3182      * stored.
3183      *
3184      * The reserved byte has two bits:
3185      *      0x1 if set indicates that if we get here, it is a recursive call of
3186      *          getenv()
3187      *      0x2 if set indicates that the call is from the logging code.
3188      *
3189      * If the flag indicates this is a recursive call, just return the stored
3190      * value of PL_mem_log;  An empty value gets turned into NULL. */
3191     if (strEQ(str, "PERL_MEM_LOG") && PL_mem_log[0] & 0x1) {
3192         if (PL_mem_log[1] == '\0') {
3193             return NULL;
3194         } else {
3195             return PL_mem_log + 1;
3196         }
3197     }
3198
3199 #endif
3200
3201     GETENV_LOCK;
3202
3203 #ifdef PERL_MEM_LOG
3204
3205     /* Here we are in a critical section.  As explained above, we do our own
3206      * getenv(PERL_MEM_LOG), saving the result safely. */
3207     ret = getenv("PERL_MEM_LOG");
3208     if (ret == NULL) {  /* No logging active */
3209
3210         /* Return that immediately if called from the logging code */
3211         if (PL_mem_log[0] & 0x2) {
3212             GETENV_UNLOCK;
3213             return NULL;
3214         }
3215
3216         PL_mem_log[1] = '\0';
3217     }
3218     else {
3219         char *mem_log_meat = PL_mem_log + 1;    /* first byte reserved */
3220
3221         /* There is nothing to prevent the value of PERL_MEM_LOG from being an
3222          * extremely long string.  But we want only a few characters from it.
3223          * PL_mem_log has been made large enough to hold just the ones we need.
3224          * First the file descriptor. */
3225         if (isDIGIT(*ret)) {
3226             const char * s = ret;
3227             if (UNLIKELY(*s == '0')) {
3228
3229                 /* Reduce multiple leading zeros to a single one.  This is to
3230                  * allow the caller to change what to do with leading zeros. */
3231                 *mem_log_meat++ = '0';
3232                 s++;
3233                 while (*s == '0') {
3234                     s++;
3235                 }
3236             }
3237
3238             /* If the input overflows, copy just enough for the result to also
3239              * overflow, plus 1 to make sure */
3240             while (isDIGIT(*s) && s < ret + TYPE_DIGITS(UV) + 1) {
3241                 *mem_log_meat++ = *s++;
3242             }
3243         }
3244
3245         /* Then each of the three significant characters */
3246         if (strchr(ret, 'm')) {
3247             *mem_log_meat++ = 'm';
3248         }
3249         if (strchr(ret, 's')) {
3250             *mem_log_meat++ = 's';
3251         }
3252         if (strchr(ret, 't')) {
3253             *mem_log_meat++ = 't';
3254         }
3255         *mem_log_meat = '\0';
3256
3257         assert(mem_log_meat < PL_mem_log + sizeof(PL_mem_log));
3258     }
3259
3260     /* If we are being called from the logger, it only needs the significant
3261      * portion of PERL_MEM_LOG, and doesn't need a safe copy */
3262     if (PL_mem_log[0] & 0x2) {
3263         assert(strEQ(str, "PERL_MEM_LOG"));
3264         GETENV_UNLOCK;
3265         return PL_mem_log + 1;
3266     }
3267
3268     /* Here is a generic getenv().  This could be a getenv("PERL_MEM_LOG") that
3269      * is coming from other than the logging code, so it should be treated the
3270      * same as any other getenv(), returning the full value, not just the
3271      * significant part, and having its value saved.  Set the flag that
3272      * indicates any call to this routine will be a recursion from here */
3273     PL_mem_log[0] = 0x1;
3274
3275 #endif
3276
3277     /* Now get the value of the real desired variable, and save a copy */
3278     ret = getenv(str);
3279
3280     if (ret != NULL) {
3281         ret = SvPVX(sv_2mortal(newSVpv(ret, 0)));
3282     }
3283
3284     GETENV_UNLOCK;
3285
3286 #ifdef PERL_MEM_LOG
3287
3288     /* Clear the buffer */
3289     Zero(PL_mem_log, sizeof(PL_mem_log), char);
3290
3291 #endif
3292
3293     return ret;
3294 }
3295
3296 /*
3297  * ex: set ts=8 sts=4 sw=4 et:
3298  */