inline.h

   1 /*    inline.h
   2  *
   3  *    Copyright (C) 2012 by Larry Wall and others
   4  *
   5  *    You may distribute under the terms of either the GNU General Public
   6  *    License or the Artistic License, as specified in the README file.
   7  *
   8  *    This file contains tables and code adapted from
   9  *    https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this
  10  *    copyright notice:
  11
  12 Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
  13
  14 Permission is hereby granted, free of charge, to any person obtaining a copy of
  15 this software and associated documentation files (the "Software"), to deal in
  16 the Software without restriction, including without limitation the rights to
  17 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  18 of the Software, and to permit persons to whom the Software is furnished to do
  19 so, subject to the following conditions:
  20
  21 The above copyright notice and this permission notice shall be included in all
  22 copies or substantial portions of the Software.
  23
  24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  25 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  26 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  27 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  28 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  29 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  30 SOFTWARE.
  31
  32  *
  33  * This file is a home for static inline functions that cannot go in other
  34  * header files, because they depend on proto.h (included after most other
  35  * headers) or struct definitions.
  36  *
  37  * Each section names the header file that the functions "belong" to.
  38  */
  39
  40 /* ------------------------------- av.h ------------------------------- */
  41
  42 /*
  43 =for apidoc_section $AV
  44 =for apidoc av_count
  45 Returns the number of elements in the array C<av>.  This is the true length of
  46 the array, including any undefined elements.  It is always the same as
  47 S<C<av_top_index(av) + 1>>.
  48
  49 =cut
  50 */
  51 PERL_STATIC_INLINE Size_t
  52 Perl_av_count(pTHX_ AV *av)
  53 {
  54     PERL_ARGS_ASSERT_AV_COUNT;
  55     assert(SvTYPE(av) == SVt_PVAV);
  56
  57     return AvFILL(av) + 1;
  58 }
  59
  60 /* ------------------------------- av.c ------------------------------- */
  61
  62 /*
  63 =for apidoc av_store_simple
  64
  65 This is a cut-down version of av_store that assumes that the array is
  66 very straightforward - no magic, not readonly, and AvREAL - and that
  67 C<key> is not negative. This function MUST NOT be used in situations
  68 where any of those assumptions may not hold.
  69
  70 Stores an SV in an array.  The array index is specified as C<key>. It
  71 can be dereferenced to get the C<SV*> that was stored there (= C<val>)).
  72
  73 Note that the caller is responsible for suitably incrementing the reference
  74 count of C<val> before the call.
  75
  76 Approximate Perl equivalent: C<splice(@myarray, $key, 1, $val)>.
  77
  78 =cut
  79 */
  80
  81 PERL_STATIC_INLINE SV**
  82 Perl_av_store_simple(pTHX_ AV *av, SSize_t key, SV *val)
  83 {
  84     SV** ary;
  85
  86     PERL_ARGS_ASSERT_AV_STORE_SIMPLE;
  87     assert(SvTYPE(av) == SVt_PVAV);
  88     assert(!SvMAGICAL(av));
  89     assert(!SvREADONLY(av));
  90     assert(AvREAL(av));
  91     assert(key > -1);
  92
  93     ary = AvARRAY(av);
  94
  95     if (AvFILLp(av) < key) {
  96         if (key > AvMAX(av)) {
  97             av_extend(av,key);
  98             ary = AvARRAY(av);
  99         }
 100         AvFILLp(av) = key;
 101     } else
 102         SvREFCNT_dec(ary[key]);
 103
 104     ary[key] = val;
 105     return &ary[key];
 106 }
 107
 108 /*
 109 =for apidoc av_fetch_simple
 110
 111 This is a cut-down version of av_fetch that assumes that the array is
 112 very straightforward - no magic, not readonly, and AvREAL - and that
 113 C<key> is not negative. This function MUST NOT be used in situations
 114 where any of those assumptions may not hold.
 115
 116 Returns the SV at the specified index in the array.  The C<key> is the
 117 index.  If lval is true, you are guaranteed to get a real SV back (in case
 118 it wasn't real before), which you can then modify.  Check that the return
 119 value is non-null before dereferencing it to a C<SV*>.
 120
 121 The rough perl equivalent is C<$myarray[$key]>.
 122
 123 =cut
 124 */
 125
 126 PERL_STATIC_INLINE SV**
 127 Perl_av_fetch_simple(pTHX_ AV *av, SSize_t key, I32 lval)
 128 {
 129     PERL_ARGS_ASSERT_AV_FETCH_SIMPLE;
 130     assert(SvTYPE(av) == SVt_PVAV);
 131     assert(!SvMAGICAL(av));
 132     assert(!SvREADONLY(av));
 133     assert(AvREAL(av));
 134     assert(key > -1);
 135
 136     if ( (key > AvFILLp(av)) || !AvARRAY(av)[key]) {
 137         return lval ? av_store_simple(av,key,newSV_type(SVt_NULL)) : NULL;
 138     } else {
 139         return &AvARRAY(av)[key];
 140     }
 141 }
 142
 143 /*
 144 =for apidoc av_push_simple
 145
 146 This is a cut-down version of av_push that assumes that the array is very
 147 straightforward - no magic, not readonly, and AvREAL - and that C<key> is
 148 not less than -1. This function MUST NOT be used in situations where any
 149 of those assumptions may not hold.
 150
 151 Pushes an SV (transferring control of one reference count) onto the end of the
 152 array.  The array will grow automatically to accommodate the addition.
 153
 154 Perl equivalent: C<push @myarray, $val;>.
 155
 156 =cut
 157 */
 158
 159 PERL_STATIC_INLINE void
 160 Perl_av_push_simple(pTHX_ AV *av, SV *val)
 161 {
 162     PERL_ARGS_ASSERT_AV_PUSH_SIMPLE;
 163     assert(SvTYPE(av) == SVt_PVAV);
 164     assert(!SvMAGICAL(av));
 165     assert(!SvREADONLY(av));
 166     assert(AvREAL(av));
 167     assert(AvFILLp(av) > -2);
 168
 169     (void)av_store_simple(av,AvFILLp(av)+1,val);
 170 }
 171
 172 /* ------------------------------- cv.h ------------------------------- */
 173
 174 /*
 175 =for apidoc_section $CV
 176 =for apidoc CvGV
 177 Returns the GV associated with the CV C<sv>, reifying it if necessary.
 178
 179 =cut
 180 */
 181 PERL_STATIC_INLINE GV *
 182 Perl_CvGV(pTHX_ CV *sv)
 183 {
 184     PERL_ARGS_ASSERT_CVGV;
 185
 186     return CvNAMED(sv)
 187         ? Perl_cvgv_from_hek(aTHX_ sv)
 188         : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
 189 }
 190
 191 /*
 192 =for apidoc CvDEPTH
 193 Returns the recursion level of the CV C<sv>.  Hence >= 2 indicates we are in a
 194 recursive call.
 195
 196 =cut
 197 */
 198 PERL_STATIC_INLINE I32 *
 199 Perl_CvDEPTH(const CV * const sv)
 200 {
 201     PERL_ARGS_ASSERT_CVDEPTH;
 202     assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
 203
 204     return &((XPVCV*)SvANY(sv))->xcv_depth;
 205 }
 206
 207 /*
 208  CvPROTO returns the prototype as stored, which is not necessarily what
 209  the interpreter should be using. Specifically, the interpreter assumes
 210  that spaces have been stripped, which has been the case if the prototype
 211  was added by toke.c, but is generally not the case if it was added elsewhere.
 212  Since we can't enforce the spacelessness at assignment time, this routine
 213  provides a temporary copy at parse time with spaces removed.
 214  I<orig> is the start of the original buffer, I<len> is the length of the
 215  prototype and will be updated when this returns.
 216  */
 217
 218 #ifdef PERL_CORE
 219 PERL_STATIC_INLINE char *
 220 S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
 221 {
 222     SV * tmpsv;
 223     char * tmps;
 224     tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
 225     tmps = SvPVX(tmpsv);
 226     while ((*len)--) {
 227         if (!isSPACE(*orig))
 228             *tmps++ = *orig;
 229         orig++;
 230     }
 231     *tmps = '\0';
 232     *len = tmps - SvPVX(tmpsv);
 233                 return SvPVX(tmpsv);
 234 }
 235 #endif
 236
 237 /* ------------------------------- mg.h ------------------------------- */
 238
 239 #if defined(PERL_CORE) || defined(PERL_EXT)
 240 /* assumes get-magic and stringification have already occurred */
 241 PERL_STATIC_INLINE STRLEN
 242 S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
 243 {
 244     assert(mg->mg_type == PERL_MAGIC_regex_global);
 245     assert(mg->mg_len != -1);
 246     if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv))
 247         return (STRLEN)mg->mg_len;
 248     else {
 249         const STRLEN pos = (STRLEN)mg->mg_len;
 250         /* Without this check, we may read past the end of the buffer: */
 251         if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
 252         return sv_or_pv_pos_u2b(sv, s, pos, NULL);
 253     }
 254 }
 255 #endif
 256
 257 /* ------------------------------- pad.h ------------------------------ */
 258
 259 #if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
 260 PERL_STATIC_INLINE bool
 261 S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
 262 {
 263     PERL_ARGS_ASSERT_PADNAMEIN_SCOPE;
 264
 265     /* is seq within the range _LOW to _HIGH ?
 266      * This is complicated by the fact that PL_cop_seqmax
 267      * may have wrapped around at some point */
 268     if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
 269         return FALSE; /* not yet introduced */
 270
 271     if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
 272     /* in compiling scope */
 273         if (
 274             (seq >  COP_SEQ_RANGE_LOW(pn))
 275             ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
 276             : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
 277         )
 278             return TRUE;
 279     }
 280     else if (
 281         (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
 282         ?
 283             (  seq >  COP_SEQ_RANGE_LOW(pn)
 284             || seq <= COP_SEQ_RANGE_HIGH(pn))
 285
 286         :    (  seq >  COP_SEQ_RANGE_LOW(pn)
 287              && seq <= COP_SEQ_RANGE_HIGH(pn))
 288     )
 289         return TRUE;
 290     return FALSE;
 291 }
 292 #endif
 293
 294 /* ------------------------------- pp.h ------------------------------- */
 295
 296 PERL_STATIC_INLINE I32
 297 Perl_TOPMARK(pTHX)
 298 {
 299     DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
 300                                  "MARK top  %p %" IVdf "\n",
 301                                   PL_markstack_ptr,
 302                                   (IV)*PL_markstack_ptr)));
 303     return *PL_markstack_ptr;
 304 }
 305
 306 PERL_STATIC_INLINE I32
 307 Perl_POPMARK(pTHX)
 308 {
 309     DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
 310                                  "MARK pop  %p %" IVdf "\n",
 311                                   (PL_markstack_ptr-1),
 312                                   (IV)*(PL_markstack_ptr-1))));
 313     assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
 314     return *PL_markstack_ptr--;
 315 }
 316
 317 /* ----------------------------- regexp.h ----------------------------- */
 318
 319 /* PVLVs need to act as a superset of all scalar types - they are basically
 320  * PVMGs with a few extra fields.
 321  * REGEXPs are first class scalars, but have many fields that can't be copied
 322  * into a PVLV body.
 323  *
 324  * Hence we take a different approach - instead of a copy, PVLVs store a pointer
 325  * back to the original body. To avoid increasing the size of PVLVs just for the
 326  * rare case of REGEXP assignment, this pointer is stored in the memory usually
 327  * used for SvLEN(). Hence the check for SVt_PVLV below, and the ? : ternary to
 328  * read the pointer from the two possible locations. The macro SvLEN() wraps the
 329  * access to the union's member xpvlenu_len, but there is no equivalent macro
 330  * for wrapping the union's member xpvlenu_rx, hence the direct reference here.
 331  *
 332  * See commit df6b4bd56551f2d3 for more details. */
 333
 334 PERL_STATIC_INLINE struct regexp *
 335 Perl_ReANY(const REGEXP * const re)
 336 {
 337     XPV* const p = (XPV*)SvANY(re);
 338
 339     PERL_ARGS_ASSERT_REANY;
 340     assert(isREGEXP(re));
 341
 342     return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
 343                                    : (struct regexp *)p;
 344 }
 345
 346 /* ------------------------------- utf8.h ------------------------------- */
 347
 348 /*
 349 =for apidoc_section $unicode
 350 */
 351
 352 PERL_STATIC_INLINE void
 353 Perl_append_utf8_from_native_byte(const U8 byte, U8** dest)
 354 {
 355     /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
 356      * encoded string at '*dest', updating '*dest' to include it */
 357
 358     PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
 359
 360     if (NATIVE_BYTE_IS_INVARIANT(byte))
 361         *((*dest)++) = byte;
 362     else {
 363         *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
 364         *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
 365     }
 366 }
 367
 368 /*
 369 =for apidoc valid_utf8_to_uvchr
 370 Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is
 371 known that the next character in the input UTF-8 string C<s> is well-formed
 372 (I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>.  Surrogates, non-character code
 373 points, and non-Unicode code points are allowed.
 374
 375 =cut
 376
 377  */
 378
 379 PERL_STATIC_INLINE UV
 380 Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
 381 {
 382     const UV expectlen = UTF8SKIP(s);
 383     const U8* send = s + expectlen;
 384     UV uv = *s;
 385
 386     PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
 387
 388     if (retlen) {
 389         *retlen = expectlen;
 390     }
 391
 392     /* An invariant is trivially returned */
 393     if (expectlen == 1) {
 394         return uv;
 395     }
 396
 397     /* Remove the leading bits that indicate the number of bytes, leaving just
 398      * the bits that are part of the value */
 399     uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
 400
 401     /* Now, loop through the remaining bytes, accumulating each into the
 402      * working total as we go.  (I khw tried unrolling the loop for up to 4
 403      * bytes, but there was no performance improvement) */
 404     for (++s; s < send; s++) {
 405         uv = UTF8_ACCUMULATE(uv, *s);
 406     }
 407
 408     return UNI_TO_NATIVE(uv);
 409
 410 }
 411
 412 /*
 413 =for apidoc is_utf8_invariant_string
 414
 415 Returns TRUE if the first C<len> bytes of the string C<s> are the same
 416 regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
 417 EBCDIC machines); otherwise it returns FALSE.  That is, it returns TRUE if they
 418 are UTF-8 invariant.  On ASCII-ish machines, all the ASCII characters and only
 419 the ASCII characters fit this definition.  On EBCDIC machines, the ASCII-range
 420 characters are invariant, but so also are the C1 controls.
 421
 422 If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
 423 use this option, that C<s> can't have embedded C<NUL> characters and has to
 424 have a terminating C<NUL> byte).
 425
 426 See also
 427 C<L</is_utf8_string>>,
 428 C<L</is_utf8_string_flags>>,
 429 C<L</is_utf8_string_loc>>,
 430 C<L</is_utf8_string_loc_flags>>,
 431 C<L</is_utf8_string_loclen>>,
 432 C<L</is_utf8_string_loclen_flags>>,
 433 C<L</is_utf8_fixed_width_buf_flags>>,
 434 C<L</is_utf8_fixed_width_buf_loc_flags>>,
 435 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
 436 C<L</is_strict_utf8_string>>,
 437 C<L</is_strict_utf8_string_loc>>,
 438 C<L</is_strict_utf8_string_loclen>>,
 439 C<L</is_c9strict_utf8_string>>,
 440 C<L</is_c9strict_utf8_string_loc>>,
 441 and
 442 C<L</is_c9strict_utf8_string_loclen>>.
 443
 444 =cut
 445
 446 */
 447
 448 #define is_utf8_invariant_string(s, len)                                    \
 449                                 is_utf8_invariant_string_loc(s, len, NULL)
 450
 451 /*
 452 =for apidoc is_utf8_invariant_string_loc
 453
 454 Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
 455 the first UTF-8 variant character in the C<ep> pointer; if all characters are
 456 UTF-8 invariant, this function does not change the contents of C<*ep>.
 457
 458 =cut
 459
 460 */
 461
 462 PERL_STATIC_INLINE bool
 463 Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
 464 {
 465     const U8* send;
 466     const U8* x = s;
 467
 468     PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
 469
 470     if (len == 0) {
 471         len = strlen((const char *)s);
 472     }
 473
 474     send = s + len;
 475
 476 /* This looks like 0x010101... */
 477 #  define PERL_COUNT_MULTIPLIER   (~ (UINTMAX_C(0)) / 0xFF)
 478
 479 /* This looks like 0x808080... */
 480 #  define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
 481 #  define PERL_WORDSIZE            sizeof(PERL_UINTMAX_T)
 482 #  define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
 483
 484 /* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
 485  * or'ing together the lowest bits of 'x'.  Hopefully the final term gets
 486  * optimized out completely on a 32-bit system, and its mask gets optimized out
 487  * on a 64-bit system */
 488 #  define PERL_IS_SUBWORD_ADDR(x) (1 & (       PTR2nat(x)                     \
 489                                       |   (  PTR2nat(x) >> 1)                 \
 490                                       | ( ( (PTR2nat(x)                       \
 491                                            & PERL_WORD_BOUNDARY_MASK) >> 2))))
 492
 493 #ifndef EBCDIC
 494
 495     /* Do the word-at-a-time iff there is at least one usable full word.  That
 496      * means that after advancing to a word boundary, there still is at least a
 497      * full word left.  The number of bytes needed to advance is 'wordsize -
 498      * offset' unless offset is 0. */
 499     if ((STRLEN) (send - x) >= PERL_WORDSIZE
 500
 501                             /* This term is wordsize if subword; 0 if not */
 502                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
 503
 504                             /* 'offset' */
 505                           - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
 506     {
 507
 508         /* Process per-byte until reach word boundary.  XXX This loop could be
 509          * eliminated if we knew that this platform had fast unaligned reads */
 510         while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
 511             if (! UTF8_IS_INVARIANT(*x)) {
 512                 if (ep) {
 513                     *ep = x;
 514                 }
 515
 516                 return FALSE;
 517             }
 518             x++;
 519         }
 520
 521         /* Here, we know we have at least one full word to process.  Process
 522          * per-word as long as we have at least a full word left */
 523         do {
 524             if ((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK)  {
 525
 526                 /* Found a variant.  Just return if caller doesn't want its
 527                  * exact position */
 528                 if (! ep) {
 529                     return FALSE;
 530                 }
 531
 532 #  if   BYTEORDER == 0x1234 || BYTEORDER == 0x12345678    \
 533      || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
 534
 535                 *ep = x + variant_byte_number(* (PERL_UINTMAX_T *) x);
 536                 assert(*ep >= s && *ep < send);
 537
 538                 return FALSE;
 539
 540 #  else   /* If weird byte order, drop into next loop to do byte-at-a-time
 541            checks. */
 542
 543                 break;
 544 #  endif
 545             }
 546
 547             x += PERL_WORDSIZE;
 548
 549         } while (x + PERL_WORDSIZE <= send);
 550     }
 551
 552 #endif      /* End of ! EBCDIC */
 553
 554     /* Process per-byte */
 555     while (x < send) {
 556         if (! UTF8_IS_INVARIANT(*x)) {
 557             if (ep) {
 558                 *ep = x;
 559             }
 560
 561             return FALSE;
 562         }
 563
 564         x++;
 565     }
 566
 567     return TRUE;
 568 }
 569
 570 /* See if the platform has builtins for finding the most/least significant bit,
 571  * and which one is right for using on 32 and 64 bit operands */
 572 #if (__has_builtin(__builtin_clz) || PERL_GCC_VERSION_GE(3,4,0))
 573 #  if U32SIZE == INTSIZE
 574 #    define PERL_CLZ_32 __builtin_clz
 575 #  endif
 576 #  if defined(U64TYPE) && U64SIZE == INTSIZE
 577 #    define PERL_CLZ_64 __builtin_clz
 578 #  endif
 579 #endif
 580 #if (__has_builtin(__builtin_ctz) || PERL_GCC_VERSION_GE(3,4,0))
 581 #  if U32SIZE == INTSIZE
 582 #    define PERL_CTZ_32 __builtin_ctz
 583 #  endif
 584 #  if defined(U64TYPE) && U64SIZE == INTSIZE
 585 #    define PERL_CTZ_64 __builtin_ctz
 586 #  endif
 587 #endif
 588
 589 #if (__has_builtin(__builtin_clzl) || PERL_GCC_VERSION_GE(3,4,0))
 590 #  if U32SIZE == LONGSIZE && ! defined(PERL_CLZ_32)
 591 #    define PERL_CLZ_32 __builtin_clzl
 592 #  endif
 593 #  if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CLZ_64)
 594 #    define PERL_CLZ_64 __builtin_clzl
 595 #  endif
 596 #endif
 597 #if (__has_builtin(__builtin_ctzl) || PERL_GCC_VERSION_GE(3,4,0))
 598 #  if U32SIZE == LONGSIZE && ! defined(PERL_CTZ_32)
 599 #    define PERL_CTZ_32 __builtin_ctzl
 600 #  endif
 601 #  if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CTZ_64)
 602 #    define PERL_CTZ_64 __builtin_ctzl
 603 #  endif
 604 #endif
 605
 606 #if (__has_builtin(__builtin_clzll) || PERL_GCC_VERSION_GE(3,4,0))
 607 #  if U32SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_32)
 608 #    define PERL_CLZ_32 __builtin_clzll
 609 #  endif
 610 #  if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_64)
 611 #    define PERL_CLZ_64 __builtin_clzll
 612 #  endif
 613 #endif
 614 #if (__has_builtin(__builtin_ctzll) || PERL_GCC_VERSION_GE(3,4,0))
 615 #  if U32SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_32)
 616 #    define PERL_CTZ_32 __builtin_ctzll
 617 #  endif
 618 #  if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_64)
 619 #    define PERL_CTZ_64 __builtin_ctzll
 620 #  endif
 621 #endif
 622
 623 #if defined(_MSC_VER)
 624 #  include <intrin.h>
 625 #  pragma intrinsic(_BitScanForward)
 626 #  pragma intrinsic(_BitScanReverse)
 627 #  ifdef _WIN64
 628 #    pragma intrinsic(_BitScanForward64)
 629 #    pragma intrinsic(_BitScanReverse64)
 630 #  endif
 631 #endif
 632
 633 /* The reason there are not checks to see if ffs() and ffsl() are available for
 634  * determining the lsb, is because these don't improve on the deBruijn method
 635  * fallback, which is just a branchless integer multiply, array element
 636  * retrieval, and shift.  The others, even if the function call overhead is
 637  * optimized out, have to cope with the possibility of the input being all
 638  * zeroes, and almost certainly will have conditionals for this eventuality.
 639  * khw, at the time of this commit, looked at the source for both gcc and clang
 640  * to verify this.  (gcc used a method inferior to deBruijn.) */
 641
 642 /* Below are functions to find the first, last, or only set bit in a word.  On
 643  * platforms with 64-bit capability, there is a pair for each operation; the
 644  * first taking a 64 bit operand, and the second a 32 bit one.  The logic is
 645  * the same in each pair, so the second is stripped of most comments. */
 646
 647 #ifdef U64TYPE  /* HAS_QUAD not usable outside the core */
 648
 649 PERL_STATIC_INLINE unsigned
 650 Perl_lsbit_pos64(U64 word)
 651 {
 652     /* Find the position (0..63) of the least significant set bit in the input
 653      * word */
 654
 655     ASSUME(word != 0);
 656
 657     /* If we can determine that the platform has a usable fast method to get
 658      * this info, use that */
 659
 660 #  if defined(PERL_CTZ_64)
 661 #    define PERL_HAS_FAST_GET_LSB_POS64
 662
 663     return (unsigned) PERL_CTZ_64(word);
 664
 665 #  elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER)
 666 #    define PERL_HAS_FAST_GET_LSB_POS64
 667
 668     {
 669         unsigned long index;
 670         _BitScanForward64(&index, word);
 671         return (unsigned)index;
 672     }
 673
 674 #  else
 675
 676     /* Here, we didn't find a fast method for finding the lsb.  Fall back to
 677      * making the lsb the only set bit in the word, and use our function that
 678      * works on words with a single bit set.
 679      *
 680      * Isolate the lsb;
 681      * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
 682      *
 683      * The word will look like this, with a rightmost set bit in position 's':
 684      * ('x's are don't cares, and 'y's are their complements)
 685      *      s
 686      *  x..x100..00
 687      *  y..y011..11      Complement
 688      *  y..y100..00      Add 1
 689      *  0..0100..00      And with the original
 690      *
 691      *  (Yes, complementing and adding 1 is just taking the negative on 2's
 692      *  complement machines, but not on 1's complement ones, and some compilers
 693      *  complain about negating an unsigned.)
 694      */
 695     return single_1bit_pos64(word & (~word + 1));
 696
 697 #  endif
 698
 699 }
 700
 701 #  define lsbit_pos_uintmax_(word) lsbit_pos64(word)
 702 #else   /* ! QUAD */
 703 #  define lsbit_pos_uintmax_(word) lsbit_pos32(word)
 704 #endif
 705
 706 PERL_STATIC_INLINE unsigned     /* Like above for 32 bit word */
 707 Perl_lsbit_pos32(U32 word)
 708 {
 709     /* Find the position (0..31) of the least significant set bit in the input
 710      * word */
 711
 712     ASSUME(word != 0);
 713
 714 #if defined(PERL_CTZ_32)
 715 #  define PERL_HAS_FAST_GET_LSB_POS32
 716
 717     return (unsigned) PERL_CTZ_32(word);
 718
 719 #elif U32SIZE == 4 && defined(_MSC_VER)
 720 #  define PERL_HAS_FAST_GET_LSB_POS32
 721
 722     {
 723         unsigned long index;
 724         _BitScanForward(&index, word);
 725         return (unsigned)index;
 726     }
 727
 728 #else
 729
 730     return single_1bit_pos32(word & (~word + 1));
 731
 732 #endif
 733
 734 }
 735
 736
 737 /* Convert the leading zeros count to the bit position of the first set bit.
 738  * This just subtracts from the highest position, 31 or 63.  But some compilers
 739  * don't optimize this optimally, and so a bit of bit twiddling encourages them
 740  * to do the right thing.  It turns out that subtracting a smaller non-negative
 741  * number 'x' from 2**n-1 for any n is the same as taking the exclusive-or of
 742  * the two numbers.  To see why, first note that the sum of any number, x, and
 743  * its complement, x', is all ones.  So all ones minus x is x'.  Then note that
 744  * the xor of x and all ones is x'. */
 745 #define LZC_TO_MSBIT_POS_(size, lzc)  ((size##SIZE * CHARBITS - 1) ^ (lzc))
 746
 747 #ifdef U64TYPE  /* HAS_QUAD not usable outside the core */
 748
 749 PERL_STATIC_INLINE unsigned
 750 Perl_msbit_pos64(U64 word)
 751 {
 752     /* Find the position (0..63) of the most significant set bit in the input
 753      * word */
 754
 755     ASSUME(word != 0);
 756
 757     /* If we can determine that the platform has a usable fast method to get
 758      * this, use that */
 759
 760 #  if defined(PERL_CLZ_64)
 761 #    define PERL_HAS_FAST_GET_MSB_POS64
 762
 763     return (unsigned) LZC_TO_MSBIT_POS_(U64, PERL_CLZ_64(word));
 764
 765 #  elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER)
 766 #    define PERL_HAS_FAST_GET_MSB_POS64
 767
 768     {
 769         unsigned long index;
 770         _BitScanReverse64(&index, word);
 771         return (unsigned)index;
 772     }
 773
 774 #  else
 775
 776     /* Here, we didn't find a fast method for finding the msb.  Fall back to
 777      * making the msb the only set bit in the word, and use our function that
 778      * works on words with a single bit set.
 779      *
 780      * Isolate the msb; http://codeforces.com/blog/entry/10330
 781      *
 782      * Only the most significant set bit matters.  Or'ing word with its right
 783      * shift of 1 makes that bit and the next one to its right both 1.
 784      * Repeating that with the right shift of 2 makes for 4 1-bits in a row.
 785      * ...  We end with the msb and all to the right being 1. */
 786     word |= (word >>  1);
 787     word |= (word >>  2);
 788     word |= (word >>  4);
 789     word |= (word >>  8);
 790     word |= (word >> 16);
 791     word |= (word >> 32);
 792
 793     /* Then subtracting the right shift by 1 clears all but the left-most of
 794      * the 1 bits, which is our desired result */
 795     word -= (word >> 1);
 796
 797     /* Now we have a single bit set */
 798     return single_1bit_pos64(word);
 799
 800 #  endif
 801
 802 }
 803
 804 #  define msbit_pos_uintmax_(word) msbit_pos64(word)
 805 #else   /* ! QUAD */
 806 #  define msbit_pos_uintmax_(word) msbit_pos32(word)
 807 #endif
 808
 809 PERL_STATIC_INLINE unsigned
 810 Perl_msbit_pos32(U32 word)
 811 {
 812     /* Find the position (0..31) of the most significant set bit in the input
 813      * word */
 814
 815     ASSUME(word != 0);
 816
 817 #if defined(PERL_CLZ_32)
 818 #  define PERL_HAS_FAST_GET_MSB_POS32
 819
 820     return (unsigned) LZC_TO_MSBIT_POS_(U32, PERL_CLZ_32(word));
 821
 822 #elif U32SIZE == 4 && defined(_MSC_VER)
 823 #  define PERL_HAS_FAST_GET_MSB_POS32
 824
 825     {
 826         unsigned long index;
 827         _BitScanReverse(&index, word);
 828         return (unsigned)index;
 829     }
 830
 831 #else
 832
 833     word |= (word >>  1);
 834     word |= (word >>  2);
 835     word |= (word >>  4);
 836     word |= (word >>  8);
 837     word |= (word >> 16);
 838     word -= (word >> 1);
 839     return single_1bit_pos32(word);
 840
 841 #endif
 842
 843 }
 844
 845 #if UVSIZE == U64SIZE
 846 #  define msbit_pos(word)  msbit_pos64(word)
 847 #  define lsbit_pos(word)  lsbit_pos64(word)
 848 #elif UVSIZE == U32SIZE
 849 #  define msbit_pos(word)  msbit_pos32(word)
 850 #  define lsbit_pos(word)  lsbit_pos32(word)
 851 #endif
 852
 853 #ifdef U64TYPE  /* HAS_QUAD not usable outside the core */
 854
 855 PERL_STATIC_INLINE unsigned
 856 Perl_single_1bit_pos64(U64 word)
 857 {
 858     /* Given a 64-bit word known to contain all zero bits except one 1 bit,
 859      * find and return the 1's position: 0..63 */
 860
 861 #  ifdef PERL_CORE    /* macro not exported */
 862     ASSUME(isPOWER_OF_2(word));
 863 #  else
 864     ASSUME(word && (word & (word-1)) == 0);
 865 #  endif
 866
 867     /* The only set bit is both the most and least significant bit.  If we have
 868      * a fast way of finding either one, use that.
 869      *
 870      * It may appear at first glance that those functions call this one, but
 871      * they don't if the corresponding #define is set */
 872
 873 #  ifdef PERL_HAS_FAST_GET_MSB_POS64
 874
 875     return msbit_pos64(word);
 876
 877 #  elif defined(PERL_HAS_FAST_GET_LSB_POS64)
 878
 879     return lsbit_pos64(word);
 880
 881 #  else
 882
 883     /* The position of the only set bit in a word can be quickly calculated
 884      * using deBruijn sequences.  See for example
 885      * https://en.wikipedia.org/wiki/De_Bruijn_sequence */
 886     return PL_deBruijn_bitpos_tab64[(word * PERL_deBruijnMagic64_)
 887                                                     >> PERL_deBruijnShift64_];
 888 #  endif
 889
 890 }
 891
 892 #endif
 893
 894 PERL_STATIC_INLINE unsigned
 895 Perl_single_1bit_pos32(U32 word)
 896 {
 897     /* Given a 32-bit word known to contain all zero bits except one 1 bit,
 898      * find and return the 1's position: 0..31 */
 899
 900 #ifdef PERL_CORE    /* macro not exported */
 901     ASSUME(isPOWER_OF_2(word));
 902 #else
 903     ASSUME(word && (word & (word-1)) == 0);
 904 #endif
 905 #ifdef PERL_HAS_FAST_GET_MSB_POS32
 906
 907     return msbit_pos32(word);
 908
 909 #elif defined(PERL_HAS_FAST_GET_LSB_POS32)
 910
 911     return lsbit_pos32(word);
 912
 913 /* Unlikely, but possible for the platform to have a wider fast operation but
 914  * not a narrower one.  But easy enough to handle the case by widening the
 915  * parameter size.  (Going the other way, emulating 64 bit by two 32 bit ops
 916  * would be slower than the deBruijn method.) */
 917 #elif defined(PERL_HAS_FAST_GET_MSB_POS64)
 918
 919     return msbit_pos64(word);
 920
 921 #elif defined(PERL_HAS_FAST_GET_LSB_POS64)
 922
 923     return lsbit_pos64(word);
 924
 925 #else
 926
 927     return PL_deBruijn_bitpos_tab32[(word * PERL_deBruijnMagic32_)
 928                                                     >> PERL_deBruijnShift32_];
 929 #endif
 930
 931 }
 932
 933 #ifndef EBCDIC
 934
 935 PERL_STATIC_INLINE unsigned int
 936 Perl_variant_byte_number(PERL_UINTMAX_T word)
 937 {
 938     /* This returns the position in a word (0..7) of the first variant byte in
 939      * it.  This is a helper function.  Note that there are no branches */
 940
 941     /* Get just the msb bits of each byte */
 942     word &= PERL_VARIANTS_WORD_MASK;
 943
 944     /* This should only be called if we know there is a variant byte in the
 945      * word */
 946     assert(word);
 947
 948 #  if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
 949
 950     /* Bytes are stored like
 951      *  Byte8 ... Byte2 Byte1
 952      *  63..56...15...8 7...0
 953      * so getting the lsb of the whole modified word is getting the msb of the
 954      * first byte that has its msb set */
 955     word = lsbit_pos_uintmax_(word);
 956
 957     /* Here, word contains the position 7,15,23,...55,63 of that bit.  Convert
 958      * to 0..7 */
 959     return (unsigned int) ((word + 1) >> 3) - 1;
 960
 961 #  elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
 962
 963     /* Bytes are stored like
 964      *  Byte1 Byte2  ... Byte8
 965      * 63..56 55..47 ... 7...0
 966      * so getting the msb of the whole modified word is getting the msb of the
 967      * first byte that has its msb set */
 968     word = msbit_pos_uintmax_(word);
 969
 970     /* Here, word contains the position 63,55,...,23,15,7 of that bit.  Convert
 971      * to 0..7 */
 972     word = ((word + 1) >> 3) - 1;
 973
 974     /* And invert the result because of the reversed byte order on this
 975      * platform */
 976     word = CHARBITS - word - 1;
 977
 978     return (unsigned int) word;
 979
 980 #  else
 981 #    error Unexpected byte order
 982 #  endif
 983
 984 }
 985
 986 #endif
 987 #if defined(PERL_CORE) || defined(PERL_EXT)
 988
 989 /*
 990 =for apidoc variant_under_utf8_count
 991
 992 This function looks at the sequence of bytes between C<s> and C<e>, which are
 993 assumed to be encoded in ASCII/Latin1, and returns how many of them would
 994 change should the string be translated into UTF-8.  Due to the nature of UTF-8,
 995 each of these would occupy two bytes instead of the single one in the input
 996 string.  Thus, this function returns the precise number of bytes the string
 997 would expand by when translated to UTF-8.
 998
 999 Unlike most of the other functions that have C<utf8> in their name, the input
1000 to this function is NOT a UTF-8-encoded string.  The function name is slightly
1001 I<odd> to emphasize this.
1002
1003 This function is internal to Perl because khw thinks that any XS code that
1004 would want this is probably operating too close to the internals.  Presenting a
1005 valid use case could change that.
1006
1007 See also
1008 C<L<perlapi/is_utf8_invariant_string>>
1009 and
1010 C<L<perlapi/is_utf8_invariant_string_loc>>,
1011
1012 =cut
1013
1014 */
1015
1016 PERL_STATIC_INLINE Size_t
1017 S_variant_under_utf8_count(const U8* const s, const U8* const e)
1018 {
1019     const U8* x = s;
1020     Size_t count = 0;
1021
1022     PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
1023
1024 #  ifndef EBCDIC
1025
1026     /* Test if the string is long enough to use word-at-a-time.  (Logic is the
1027      * same as for is_utf8_invariant_string()) */
1028     if ((STRLEN) (e - x) >= PERL_WORDSIZE
1029                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
1030                           - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
1031     {
1032
1033         /* Process per-byte until reach word boundary.  XXX This loop could be
1034          * eliminated if we knew that this platform had fast unaligned reads */
1035         while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
1036             count += ! UTF8_IS_INVARIANT(*x++);
1037         }
1038
1039         /* Process per-word as long as we have at least a full word left */
1040         do {    /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
1041                    explanation of how this works */
1042             PERL_UINTMAX_T increment
1043                 = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
1044                       * PERL_COUNT_MULTIPLIER)
1045                     >> ((PERL_WORDSIZE - 1) * CHARBITS);
1046             count += (Size_t) increment;
1047             x += PERL_WORDSIZE;
1048         } while (x + PERL_WORDSIZE <= e);
1049     }
1050
1051 #  endif
1052
1053     /* Process per-byte */
1054     while (x < e) {
1055         if (! UTF8_IS_INVARIANT(*x)) {
1056             count++;
1057         }
1058
1059         x++;
1060     }
1061
1062     return count;
1063 }
1064
1065 #endif
1066
1067 #ifndef PERL_IN_REGEXEC_C   /* Keep  these around for that file */
1068 #  undef PERL_WORDSIZE
1069 #  undef PERL_COUNT_MULTIPLIER
1070 #  undef PERL_WORD_BOUNDARY_MASK
1071 #  undef PERL_VARIANTS_WORD_MASK
1072 #endif
1073
1074 /*
1075 =for apidoc is_utf8_string
1076
1077 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1078 Perl-extended-UTF-8 string; returns FALSE otherwise.  If C<len> is 0, it will
1079 be calculated using C<strlen(s)> (which means if you use this option, that C<s>
1080 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
1081 byte).  Note that all characters being ASCII constitute 'a valid UTF-8 string'.
1082
1083 This function considers Perl's extended UTF-8 to be valid.  That means that
1084 code points above Unicode, surrogates, and non-character code points are
1085 considered valid by this function.  Use C<L</is_strict_utf8_string>>,
1086 C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
1087 code points are considered valid.
1088
1089 See also
1090 C<L</is_utf8_invariant_string>>,
1091 C<L</is_utf8_invariant_string_loc>>,
1092 C<L</is_utf8_string_loc>>,
1093 C<L</is_utf8_string_loclen>>,
1094 C<L</is_utf8_fixed_width_buf_flags>>,
1095 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1096 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1097
1098 =cut
1099 */
1100
1101 #define is_utf8_string(s, len)  is_utf8_string_loclen(s, len, NULL, NULL)
1102
1103 #if defined(PERL_CORE) || defined (PERL_EXT)
1104
1105 /*
1106 =for apidoc is_utf8_non_invariant_string
1107
1108 Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
1109 C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
1110 UTF-8; otherwise returns FALSE.
1111
1112 A TRUE return means that at least one code point represented by the sequence
1113 either is a wide character not representable as a single byte, or the
1114 representation differs depending on whether the sequence is encoded in UTF-8 or
1115 not.
1116
1117 See also
1118 C<L<perlapi/is_utf8_invariant_string>>,
1119 C<L<perlapi/is_utf8_string>>
1120
1121 =cut
1122
1123 This is commonly used to determine if a SV's UTF-8 flag should be turned on.
1124 It generally needn't be if its string is entirely UTF-8 invariant, and it
1125 shouldn't be if it otherwise contains invalid UTF-8.
1126
1127 It is an internal function because khw thinks that XS code shouldn't be working
1128 at this low a level.  A valid use case could change that.
1129
1130 */
1131
1132 PERL_STATIC_INLINE bool
1133 Perl_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
1134 {
1135     const U8 * first_variant;
1136
1137     PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
1138
1139     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1140         return FALSE;
1141     }
1142
1143     return is_utf8_string(first_variant, len - (first_variant - s));
1144 }
1145
1146 #endif
1147
1148 /*
1149 =for apidoc is_strict_utf8_string
1150
1151 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1152 UTF-8-encoded string that is fully interchangeable by any application using
1153 Unicode rules; otherwise it returns FALSE.  If C<len> is 0, it will be
1154 calculated using C<strlen(s)> (which means if you use this option, that C<s>
1155 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
1156 byte).  Note that all characters being ASCII constitute 'a valid UTF-8 string'.
1157
1158 This function returns FALSE for strings containing any
1159 code points above the Unicode max of 0x10FFFF, surrogate code points, or
1160 non-character code points.
1161
1162 See also
1163 C<L</is_utf8_invariant_string>>,
1164 C<L</is_utf8_invariant_string_loc>>,
1165 C<L</is_utf8_string>>,
1166 C<L</is_utf8_string_flags>>,
1167 C<L</is_utf8_string_loc>>,
1168 C<L</is_utf8_string_loc_flags>>,
1169 C<L</is_utf8_string_loclen>>,
1170 C<L</is_utf8_string_loclen_flags>>,
1171 C<L</is_utf8_fixed_width_buf_flags>>,
1172 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1173 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1174 C<L</is_strict_utf8_string_loc>>,
1175 C<L</is_strict_utf8_string_loclen>>,
1176 C<L</is_c9strict_utf8_string>>,
1177 C<L</is_c9strict_utf8_string_loc>>,
1178 and
1179 C<L</is_c9strict_utf8_string_loclen>>.
1180
1181 =cut
1182 */
1183
1184 #define is_strict_utf8_string(s, len)  is_strict_utf8_string_loclen(s, len, NULL, NULL)
1185
1186 /*
1187 =for apidoc is_c9strict_utf8_string
1188
1189 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1190 UTF-8-encoded string that conforms to
1191 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
1192 otherwise it returns FALSE.  If C<len> is 0, it will be calculated using
1193 C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
1194 C<NUL> characters and has to have a terminating C<NUL> byte).  Note that all
1195 characters being ASCII constitute 'a valid UTF-8 string'.
1196
1197 This function returns FALSE for strings containing any code points above the
1198 Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
1199 code points per
1200 L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1201
1202 See also
1203 C<L</is_utf8_invariant_string>>,
1204 C<L</is_utf8_invariant_string_loc>>,
1205 C<L</is_utf8_string>>,
1206 C<L</is_utf8_string_flags>>,
1207 C<L</is_utf8_string_loc>>,
1208 C<L</is_utf8_string_loc_flags>>,
1209 C<L</is_utf8_string_loclen>>,
1210 C<L</is_utf8_string_loclen_flags>>,
1211 C<L</is_utf8_fixed_width_buf_flags>>,
1212 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1213 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1214 C<L</is_strict_utf8_string>>,
1215 C<L</is_strict_utf8_string_loc>>,
1216 C<L</is_strict_utf8_string_loclen>>,
1217 C<L</is_c9strict_utf8_string_loc>>,
1218 and
1219 C<L</is_c9strict_utf8_string_loclen>>.
1220
1221 =cut
1222 */
1223
1224 #define is_c9strict_utf8_string(s, len)  is_c9strict_utf8_string_loclen(s, len, NULL, 0)
1225
1226 /*
1227 =for apidoc is_utf8_string_flags
1228
1229 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1230 UTF-8 string, subject to the restrictions imposed by C<flags>;
1231 returns FALSE otherwise.  If C<len> is 0, it will be calculated
1232 using C<strlen(s)> (which means if you use this option, that C<s> can't have
1233 embedded C<NUL> characters and has to have a terminating C<NUL> byte).  Note
1234 that all characters being ASCII constitute 'a valid UTF-8 string'.
1235
1236 If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
1237 C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
1238 as C<L</is_strict_utf8_string>>; and if C<flags> is
1239 C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
1240 C<L</is_c9strict_utf8_string>>.  Otherwise C<flags> may be any
1241 combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
1242 C<L</utf8n_to_uvchr>>, with the same meanings.
1243
1244 See also
1245 C<L</is_utf8_invariant_string>>,
1246 C<L</is_utf8_invariant_string_loc>>,
1247 C<L</is_utf8_string>>,
1248 C<L</is_utf8_string_loc>>,
1249 C<L</is_utf8_string_loc_flags>>,
1250 C<L</is_utf8_string_loclen>>,
1251 C<L</is_utf8_string_loclen_flags>>,
1252 C<L</is_utf8_fixed_width_buf_flags>>,
1253 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1254 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1255 C<L</is_strict_utf8_string>>,
1256 C<L</is_strict_utf8_string_loc>>,
1257 C<L</is_strict_utf8_string_loclen>>,
1258 C<L</is_c9strict_utf8_string>>,
1259 C<L</is_c9strict_utf8_string_loc>>,
1260 and
1261 C<L</is_c9strict_utf8_string_loclen>>.
1262
1263 =cut
1264 */
1265
1266 PERL_STATIC_INLINE bool
1267 Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
1268 {
1269     const U8 * first_variant;
1270
1271     PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
1272     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1273                           |UTF8_DISALLOW_PERL_EXTENDED)));
1274
1275     if (len == 0) {
1276         len = strlen((const char *)s);
1277     }
1278
1279     if (flags == 0) {
1280         return is_utf8_string(s, len);
1281     }
1282
1283     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1284                                         == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1285     {
1286         return is_strict_utf8_string(s, len);
1287     }
1288
1289     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1290                                        == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1291     {
1292         return is_c9strict_utf8_string(s, len);
1293     }
1294
1295     if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
1296         const U8* const send = s + len;
1297         const U8* x = first_variant;
1298
1299         while (x < send) {
1300             STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1301             if (UNLIKELY(! cur_len)) {
1302                 return FALSE;
1303             }
1304             x += cur_len;
1305         }
1306     }
1307
1308     return TRUE;
1309 }
1310
1311 /*
1312
1313 =for apidoc is_utf8_string_loc
1314
1315 Like C<L</is_utf8_string>> but stores the location of the failure (in the
1316 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1317 "utf8ness success") in the C<ep> pointer.
1318
1319 See also C<L</is_utf8_string_loclen>>.
1320
1321 =cut
1322 */
1323
1324 #define is_utf8_string_loc(s, len, ep)  is_utf8_string_loclen(s, len, ep, 0)
1325
1326 /*
1327
1328 =for apidoc is_utf8_string_loclen
1329
1330 Like C<L</is_utf8_string>> but stores the location of the failure (in the
1331 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1332 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1333 encoded characters in the C<el> pointer.
1334
1335 See also C<L</is_utf8_string_loc>>.
1336
1337 =cut
1338 */
1339
1340 PERL_STATIC_INLINE bool
1341 Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1342 {
1343     const U8 * first_variant;
1344
1345     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
1346
1347     if (len == 0) {
1348         len = strlen((const char *) s);
1349     }
1350
1351     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1352         if (el)
1353             *el = len;
1354
1355         if (ep) {
1356             *ep = s + len;
1357         }
1358
1359         return TRUE;
1360     }
1361
1362     {
1363         const U8* const send = s + len;
1364         const U8* x = first_variant;
1365         STRLEN outlen = first_variant - s;
1366
1367         while (x < send) {
1368             const STRLEN cur_len = isUTF8_CHAR(x, send);
1369             if (UNLIKELY(! cur_len)) {
1370                 break;
1371             }
1372             x += cur_len;
1373             outlen++;
1374         }
1375
1376         if (el)
1377             *el = outlen;
1378
1379         if (ep) {
1380             *ep = x;
1381         }
1382
1383         return (x == send);
1384     }
1385 }
1386
1387 /* The perl core arranges to never call the DFA below without there being at
1388  * least one byte available to look at.  This allows the DFA to use a do {}
1389  * while loop which means that calling it with a UTF-8 invariant has a single
1390  * conditional, same as the calling code checking for invariance ahead of time.
1391  * And having the calling code remove that conditional speeds up by that
1392  * conditional, the case where it wasn't invariant.  So there's no reason to
1393  * check before caling this.
1394  *
1395  * But we don't know this for non-core calls, so have to retain the check for
1396  * them. */
1397 #ifdef PERL_CORE
1398 #  define PERL_NON_CORE_CHECK_EMPTY(s,e)  assert((e) > (s))
1399 #else
1400 #  define PERL_NON_CORE_CHECK_EMPTY(s,e)  if ((e) <= (s)) return FALSE
1401 #endif
1402
1403 /*
1404  * DFA for checking input is valid UTF-8 syntax.
1405  *
1406  * This uses adaptations of the table and algorithm given in
1407  * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1408  * documentation of the original version.  A copyright notice for the original
1409  * version is given at the beginning of this file.  The Perl adapations are
1410  * documented at the definition of PL_extended_utf8_dfa_tab[].
1411  *
1412  * This dfa is fast.  There are three exit conditions:
1413  *  1) a well-formed code point, acceptable to the table
1414  *  2) the beginning bytes of an incomplete character, whose completion might
1415  *     or might not be acceptable
1416  *  3) unacceptable to the table.  Some of the adaptations have certain,
1417  *     hopefully less likely to occur, legal inputs be unacceptable to the
1418  *     table, so these must be sorted out afterwards.
1419  *
1420  * This macro is a complete implementation of the code executing the DFA.  It
1421  * is passed the input sequence bounds and the table to use, and what to do
1422  * for each of the exit conditions.  There are three canned actions, likely to
1423  * be the ones you want:
1424  *      DFA_RETURN_SUCCESS_
1425  *      DFA_RETURN_FAILURE_
1426  *      DFA_GOTO_TEASE_APART_FF_
1427  *
1428  * You pass a parameter giving the action to take for each of the three
1429  * possible exit conditions:
1430  *
1431  * 'accept_action'  This is executed when the DFA accepts the input.
1432  *                  DFA_RETURN_SUCCESS_ is the most likely candidate.
1433  * 'reject_action'  This is executed when the DFA rejects the input.
1434  *                  DFA_RETURN_FAILURE_ is a candidate, or 'goto label' where
1435  *                  you have written code to distinguish the rejecting state
1436  *                  results.  Because it happens in several places, and
1437  *                  involves #ifdefs, the special action
1438  *                  DFA_GOTO_TEASE_APART_FF_ is what you want with
1439  *                  PL_extended_utf8_dfa_tab.  On platforms without
1440  *                  EXTRA_LONG_UTF8, there is no need to tease anything apart,
1441  *                  so this evaluates to DFA_RETURN_FAILURE_; otherwise you
1442  *                  need to have a label 'tease_apart_FF' that it will transfer
1443  *                  to.
1444  * 'incomplete_char_action'  This is executed when the DFA ran off the end
1445  *                  before accepting or rejecting the input.
1446  *                  DFA_RETURN_FAILURE_ is the likely action, but you could
1447  *                  have a 'goto', or NOOP.  In the latter case the DFA drops
1448  *                  off the end, and you place your code to handle this case
1449  *                  immediately after it.
1450  */
1451
1452 #define DFA_RETURN_SUCCESS_      return s - s0
1453 #define DFA_RETURN_FAILURE_      return 0
1454 #ifdef HAS_EXTRA_LONG_UTF8
1455 #  define DFA_TEASE_APART_FF_  goto tease_apart_FF
1456 #else
1457 #  define DFA_TEASE_APART_FF_  DFA_RETURN_FAILURE_
1458 #endif
1459
1460 #define PERL_IS_UTF8_CHAR_DFA(s0, e, dfa_tab,                               \
1461                               accept_action,                                \
1462                               reject_action,                                \
1463                               incomplete_char_action)                       \
1464     STMT_START {                                                            \
1465         const U8 * s = s0;                                                  \
1466         UV state = 0;                                                       \
1467                                                                             \
1468         PERL_NON_CORE_CHECK_EMPTY(s,e);                                     \
1469                                                                             \
1470         do {                                                                \
1471             state = dfa_tab[256 + state + dfa_tab[*s]];                     \
1472             s++;                                                            \
1473                                                                             \
1474             if (state == 0) {   /* Accepting state */                       \
1475                 accept_action;                                              \
1476             }                                                               \
1477                                                                             \
1478             if (UNLIKELY(state == 1)) { /* Rejecting state */               \
1479                 reject_action;                                              \
1480             }                                                               \
1481         } while (s < e);                                                    \
1482                                                                             \
1483         /* Here, dropped out of loop before end-of-char */                  \
1484         incomplete_char_action;                                             \
1485     } STMT_END
1486
1487
1488 /*
1489
1490 =for apidoc isUTF8_CHAR
1491
1492 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1493 looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
1494 that represents some code point; otherwise it evaluates to 0.  If non-zero, the
1495 value gives how many bytes starting at C<s> comprise the code point's
1496 representation.  Any bytes remaining before C<e>, but beyond the ones needed to
1497 form the first code point in C<s>, are not examined.
1498
1499 The code point can be any that will fit in an IV on this machine, using Perl's
1500 extension to official UTF-8 to represent those higher than the Unicode maximum
1501 of 0x10FFFF.  That means that this macro is used to efficiently decide if the
1502 next few bytes in C<s> is legal UTF-8 for a single character.
1503
1504 Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
1505 defined by Unicode to be fully interchangeable across applications;
1506 C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1507 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1508 code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1509
1510 Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
1511 C<L</is_utf8_string_loclen>> to check entire strings.
1512
1513 Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
1514 machines) is a valid UTF-8 character.
1515
1516 =cut
1517
1518 This uses an adaptation of the table and algorithm given in
1519 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1520 documentation of the original version.  A copyright notice for the original
1521 version is given at the beginning of this file.  The Perl adapation is
1522 documented at the definition of PL_extended_utf8_dfa_tab[].
1523 */
1524
1525 PERL_STATIC_INLINE Size_t
1526 Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
1527 {
1528     PERL_ARGS_ASSERT_ISUTF8_CHAR;
1529
1530     PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
1531                           DFA_RETURN_SUCCESS_,
1532                           DFA_TEASE_APART_FF_,
1533                           DFA_RETURN_FAILURE_);
1534
1535     /* Here, we didn't return success, but dropped out of the loop.  In the
1536      * case of PL_extended_utf8_dfa_tab, this means the input is either
1537      * malformed, or the start byte was FF on a platform that the dfa doesn't
1538      * handle FF's.  Call a helper function. */
1539
1540 #ifdef HAS_EXTRA_LONG_UTF8
1541
1542   tease_apart_FF:
1543
1544     /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
1545      * either malformed, or was for the largest possible start byte, which we
1546      * now check, not inline */
1547     if (*s0 != I8_TO_NATIVE_UTF8(0xFF)) {
1548         return 0;
1549     }
1550
1551     return is_utf8_FF_helper_(s0, e,
1552                               FALSE /* require full, not partial char */
1553                              );
1554 #endif
1555
1556 }
1557
1558 /*
1559
1560 =for apidoc isSTRICT_UTF8_CHAR
1561
1562 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1563 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1564 Unicode code point completely acceptable for open interchange between all
1565 applications; otherwise it evaluates to 0.  If non-zero, the value gives how
1566 many bytes starting at C<s> comprise the code point's representation.  Any
1567 bytes remaining before C<e>, but beyond the ones needed to form the first code
1568 point in C<s>, are not examined.
1569
1570 The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
1571 be a surrogate nor a non-character code point.  Thus this excludes any code
1572 point from Perl's extended UTF-8.
1573
1574 This is used to efficiently decide if the next few bytes in C<s> is
1575 legal Unicode-acceptable UTF-8 for a single character.
1576
1577 Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1578 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1579 code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
1580 and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1581
1582 Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
1583 C<L</is_strict_utf8_string_loclen>> to check entire strings.
1584
1585 =cut
1586
1587 This uses an adaptation of the tables and algorithm given in
1588 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1589 documentation of the original version.  A copyright notice for the original
1590 version is given at the beginning of this file.  The Perl adapation is
1591 documented at the definition of strict_extended_utf8_dfa_tab[].
1592
1593 */
1594
1595 PERL_STATIC_INLINE Size_t
1596 Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
1597 {
1598     PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
1599
1600     PERL_IS_UTF8_CHAR_DFA(s0, e, PL_strict_utf8_dfa_tab,
1601                           DFA_RETURN_SUCCESS_,
1602                           goto check_hanguls,
1603                           DFA_RETURN_FAILURE_);
1604   check_hanguls:
1605
1606     /* Here, we didn't return success, but dropped out of the loop.  In the
1607      * case of PL_strict_utf8_dfa_tab, this means the input is either
1608      * malformed, or was for certain Hanguls; handle them specially */
1609
1610     /* The dfa above drops out for incomplete or illegal inputs, and certain
1611      * legal Hanguls; check and return accordingly */
1612     return is_HANGUL_ED_utf8_safe(s0, e);
1613 }
1614
1615 /*
1616
1617 =for apidoc isC9_STRICT_UTF8_CHAR
1618
1619 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1620 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1621 Unicode non-surrogate code point; otherwise it evaluates to 0.  If non-zero,
1622 the value gives how many bytes starting at C<s> comprise the code point's
1623 representation.  Any bytes remaining before C<e>, but beyond the ones needed to
1624 form the first code point in C<s>, are not examined.
1625
1626 The largest acceptable code point is the Unicode maximum 0x10FFFF.  This
1627 differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
1628 code points.  This corresponds to
1629 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1630 which said that non-character code points are merely discouraged rather than
1631 completely forbidden in open interchange.  See
1632 L<perlunicode/Noncharacter code points>.
1633
1634 Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
1635 C<L</isUTF8_CHAR_flags>> for a more customized definition.
1636
1637 Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
1638 C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
1639
1640 =cut
1641
1642 This uses an adaptation of the tables and algorithm given in
1643 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1644 documentation of the original version.  A copyright notice for the original
1645 version is given at the beginning of this file.  The Perl adapation is
1646 documented at the definition of PL_c9_utf8_dfa_tab[].
1647
1648 */
1649
1650 PERL_STATIC_INLINE Size_t
1651 Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
1652 {
1653     PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
1654
1655     PERL_IS_UTF8_CHAR_DFA(s0, e, PL_c9_utf8_dfa_tab,
1656                           DFA_RETURN_SUCCESS_,
1657                           DFA_RETURN_FAILURE_,
1658                           DFA_RETURN_FAILURE_);
1659 }
1660
1661 /*
1662
1663 =for apidoc is_strict_utf8_string_loc
1664
1665 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1666 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1667 "utf8ness success") in the C<ep> pointer.
1668
1669 See also C<L</is_strict_utf8_string_loclen>>.
1670
1671 =cut
1672 */
1673
1674 #define is_strict_utf8_string_loc(s, len, ep)                               \
1675                                 is_strict_utf8_string_loclen(s, len, ep, 0)
1676
1677 /*
1678
1679 =for apidoc is_strict_utf8_string_loclen
1680
1681 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1682 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1683 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1684 encoded characters in the C<el> pointer.
1685
1686 See also C<L</is_strict_utf8_string_loc>>.
1687
1688 =cut
1689 */
1690
1691 PERL_STATIC_INLINE bool
1692 Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1693 {
1694     const U8 * first_variant;
1695
1696     PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
1697
1698     if (len == 0) {
1699         len = strlen((const char *) s);
1700     }
1701
1702     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1703         if (el)
1704             *el = len;
1705
1706         if (ep) {
1707             *ep = s + len;
1708         }
1709
1710         return TRUE;
1711     }
1712
1713     {
1714         const U8* const send = s + len;
1715         const U8* x = first_variant;
1716         STRLEN outlen = first_variant - s;
1717
1718         while (x < send) {
1719             const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
1720             if (UNLIKELY(! cur_len)) {
1721                 break;
1722             }
1723             x += cur_len;
1724             outlen++;
1725         }
1726
1727         if (el)
1728             *el = outlen;
1729
1730         if (ep) {
1731             *ep = x;
1732         }
1733
1734         return (x == send);
1735     }
1736 }
1737
1738 /*
1739
1740 =for apidoc is_c9strict_utf8_string_loc
1741
1742 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1743 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1744 "utf8ness success") in the C<ep> pointer.
1745
1746 See also C<L</is_c9strict_utf8_string_loclen>>.
1747
1748 =cut
1749 */
1750
1751 #define is_c9strict_utf8_string_loc(s, len, ep)                             \
1752                             is_c9strict_utf8_string_loclen(s, len, ep, 0)
1753
1754 /*
1755
1756 =for apidoc is_c9strict_utf8_string_loclen
1757
1758 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1759 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1760 "utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
1761 characters in the C<el> pointer.
1762
1763 See also C<L</is_c9strict_utf8_string_loc>>.
1764
1765 =cut
1766 */
1767
1768 PERL_STATIC_INLINE bool
1769 Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1770 {
1771     const U8 * first_variant;
1772
1773     PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
1774
1775     if (len == 0) {
1776         len = strlen((const char *) s);
1777     }
1778
1779     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1780         if (el)
1781             *el = len;
1782
1783         if (ep) {
1784             *ep = s + len;
1785         }
1786
1787         return TRUE;
1788     }
1789
1790     {
1791         const U8* const send = s + len;
1792         const U8* x = first_variant;
1793         STRLEN outlen = first_variant - s;
1794
1795         while (x < send) {
1796             const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
1797             if (UNLIKELY(! cur_len)) {
1798                 break;
1799             }
1800             x += cur_len;
1801             outlen++;
1802         }
1803
1804         if (el)
1805             *el = outlen;
1806
1807         if (ep) {
1808             *ep = x;
1809         }
1810
1811         return (x == send);
1812     }
1813 }
1814
1815 /*
1816
1817 =for apidoc is_utf8_string_loc_flags
1818
1819 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1820 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1821 "utf8ness success") in the C<ep> pointer.
1822
1823 See also C<L</is_utf8_string_loclen_flags>>.
1824
1825 =cut
1826 */
1827
1828 #define is_utf8_string_loc_flags(s, len, ep, flags)                         \
1829                         is_utf8_string_loclen_flags(s, len, ep, 0, flags)
1830
1831
1832 /* The above 3 actual functions could have been moved into the more general one
1833  * just below, and made #defines that call it with the right 'flags'.  They are
1834  * currently kept separate to increase their chances of getting inlined */
1835
1836 /*
1837
1838 =for apidoc is_utf8_string_loclen_flags
1839
1840 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1841 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1842 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1843 encoded characters in the C<el> pointer.
1844
1845 See also C<L</is_utf8_string_loc_flags>>.
1846
1847 =cut
1848 */
1849
1850 PERL_STATIC_INLINE bool
1851 Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
1852 {
1853     const U8 * first_variant;
1854
1855     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
1856     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1857                           |UTF8_DISALLOW_PERL_EXTENDED)));
1858
1859     if (len == 0) {
1860         len = strlen((const char *) s);
1861     }
1862
1863     if (flags == 0) {
1864         return is_utf8_string_loclen(s, len, ep, el);
1865     }
1866
1867     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1868                                         == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1869     {
1870         return is_strict_utf8_string_loclen(s, len, ep, el);
1871     }
1872
1873     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1874                                     == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1875     {
1876         return is_c9strict_utf8_string_loclen(s, len, ep, el);
1877     }
1878
1879     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1880         if (el)
1881             *el = len;
1882
1883         if (ep) {
1884             *ep = s + len;
1885         }
1886
1887         return TRUE;
1888     }
1889
1890     {
1891         const U8* send = s + len;
1892         const U8* x = first_variant;
1893         STRLEN outlen = first_variant - s;
1894
1895         while (x < send) {
1896             const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1897             if (UNLIKELY(! cur_len)) {
1898                 break;
1899             }
1900             x += cur_len;
1901             outlen++;
1902         }
1903
1904         if (el)
1905             *el = outlen;
1906
1907         if (ep) {
1908             *ep = x;
1909         }
1910
1911         return (x == send);
1912     }
1913 }
1914
1915 /*
1916 =for apidoc utf8_distance
1917
1918 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
1919 and C<b>.
1920
1921 WARNING: use only if you *know* that the pointers point inside the
1922 same UTF-8 buffer.
1923
1924 =cut
1925 */
1926
1927 PERL_STATIC_INLINE IV
1928 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
1929 {
1930     PERL_ARGS_ASSERT_UTF8_DISTANCE;
1931
1932     return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
1933 }
1934
1935 /*
1936 =for apidoc utf8_hop
1937
1938 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
1939 forward or backward.
1940
1941 WARNING: do not use the following unless you *know* C<off> is within
1942 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
1943 on the first byte of character or just after the last byte of a character.
1944
1945 =cut
1946 */
1947
1948 PERL_STATIC_INLINE U8 *
1949 Perl_utf8_hop(const U8 *s, SSize_t off)
1950 {
1951     PERL_ARGS_ASSERT_UTF8_HOP;
1952
1953     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1954      * the bitops (especially ~) can create illegal UTF-8.
1955      * In other words: in Perl UTF-8 is not just for Unicode. */
1956
1957     if (off >= 0) {
1958         while (off--)
1959             s += UTF8SKIP(s);
1960     }
1961     else {
1962         while (off++) {
1963             s--;
1964             while (UTF8_IS_CONTINUATION(*s))
1965                 s--;
1966         }
1967     }
1968     GCC_DIAG_IGNORE(-Wcast-qual)
1969     return (U8 *)s;
1970     GCC_DIAG_RESTORE
1971 }
1972
1973 /*
1974 =for apidoc utf8_hop_forward
1975
1976 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1977 forward.
1978
1979 C<off> must be non-negative.
1980
1981 C<s> must be before or equal to C<end>.
1982
1983 When moving forward it will not move beyond C<end>.
1984
1985 Will not exceed this limit even if the string is not valid "UTF-8".
1986
1987 =cut
1988 */
1989
1990 PERL_STATIC_INLINE U8 *
1991 Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
1992 {
1993     PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
1994
1995     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1996      * the bitops (especially ~) can create illegal UTF-8.
1997      * In other words: in Perl UTF-8 is not just for Unicode. */
1998
1999     assert(s <= end);
2000     assert(off >= 0);
2001
2002     while (off--) {
2003         STRLEN skip = UTF8SKIP(s);
2004         if ((STRLEN)(end - s) <= skip) {
2005             GCC_DIAG_IGNORE(-Wcast-qual)
2006             return (U8 *)end;
2007             GCC_DIAG_RESTORE
2008         }
2009         s += skip;
2010     }
2011
2012     GCC_DIAG_IGNORE(-Wcast-qual)
2013     return (U8 *)s;
2014     GCC_DIAG_RESTORE
2015 }
2016
2017 /*
2018 =for apidoc utf8_hop_back
2019
2020 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2021 backward.
2022
2023 C<off> must be non-positive.
2024
2025 C<s> must be after or equal to C<start>.
2026
2027 When moving backward it will not move before C<start>.
2028
2029 Will not exceed this limit even if the string is not valid "UTF-8".
2030
2031 =cut
2032 */
2033
2034 PERL_STATIC_INLINE U8 *
2035 Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
2036 {
2037     PERL_ARGS_ASSERT_UTF8_HOP_BACK;
2038
2039     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2040      * the bitops (especially ~) can create illegal UTF-8.
2041      * In other words: in Perl UTF-8 is not just for Unicode. */
2042
2043     assert(start <= s);
2044     assert(off <= 0);
2045
2046     while (off++ && s > start) {
2047         do {
2048             s--;
2049         } while (UTF8_IS_CONTINUATION(*s) && s > start);
2050     }
2051
2052     GCC_DIAG_IGNORE(-Wcast-qual)
2053     return (U8 *)s;
2054     GCC_DIAG_RESTORE
2055 }
2056
2057 /*
2058 =for apidoc utf8_hop_safe
2059
2060 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2061 either forward or backward.
2062
2063 When moving backward it will not move before C<start>.
2064
2065 When moving forward it will not move beyond C<end>.
2066
2067 Will not exceed those limits even if the string is not valid "UTF-8".
2068
2069 =cut
2070 */
2071
2072 PERL_STATIC_INLINE U8 *
2073 Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
2074 {
2075     PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
2076
2077     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2078      * the bitops (especially ~) can create illegal UTF-8.
2079      * In other words: in Perl UTF-8 is not just for Unicode. */
2080
2081     assert(start <= s && s <= end);
2082
2083     if (off >= 0) {
2084         return utf8_hop_forward(s, off, end);
2085     }
2086     else {
2087         return utf8_hop_back(s, off, start);
2088     }
2089 }
2090
2091 /*
2092
2093 =for apidoc isUTF8_CHAR_flags
2094
2095 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
2096 looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
2097 that represents some code point, subject to the restrictions given by C<flags>;
2098 otherwise it evaluates to 0.  If non-zero, the value gives how many bytes
2099 starting at C<s> comprise the code point's representation.  Any bytes remaining
2100 before C<e>, but beyond the ones needed to form the first code point in C<s>,
2101 are not examined.
2102
2103 If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>;
2104 if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
2105 as C<L</isSTRICT_UTF8_CHAR>>;
2106 and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives
2107 the same results as C<L</isC9_STRICT_UTF8_CHAR>>.
2108 Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags
2109 understood by C<L</utf8n_to_uvchr>>, with the same meanings.
2110
2111 The three alternative macros are for the most commonly needed validations; they
2112 are likely to run somewhat faster than this more general one, as they can be
2113 inlined into your code.
2114
2115 Use L</is_utf8_string_flags>, L</is_utf8_string_loc_flags>, and
2116 L</is_utf8_string_loclen_flags> to check entire strings.
2117
2118 =cut
2119 */
2120
2121 PERL_STATIC_INLINE STRLEN
2122 Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags)
2123 {
2124     PERL_ARGS_ASSERT_ISUTF8_CHAR_FLAGS;
2125     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
2126                           |UTF8_DISALLOW_PERL_EXTENDED)));
2127
2128     PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
2129                           goto check_success,
2130                           DFA_TEASE_APART_FF_,
2131                           DFA_RETURN_FAILURE_);
2132
2133   check_success:
2134
2135     return is_utf8_char_helper_(s0, e, flags);
2136
2137 #ifdef HAS_EXTRA_LONG_UTF8
2138
2139   tease_apart_FF:
2140
2141     /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
2142      * either malformed, or was for the largest possible start byte, which
2143      * indicates perl extended UTF-8, well above the Unicode maximum */
2144     if (   *s0 != I8_TO_NATIVE_UTF8(0xFF)
2145         || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
2146     {
2147         return 0;
2148     }
2149
2150     /* Otherwise examine the sequence not inline */
2151     return is_utf8_FF_helper_(s0, e,
2152                               FALSE /* require full, not partial char */
2153                              );
2154 #endif
2155
2156 }
2157
2158 /*
2159
2160 =for apidoc is_utf8_valid_partial_char
2161
2162 Returns 0 if the sequence of bytes starting at C<s> and looking no further than
2163 S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
2164 points.  Otherwise, it returns 1 if there exists at least one non-empty
2165 sequence of bytes that when appended to sequence C<s>, starting at position
2166 C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
2167 otherwise returns 0.
2168
2169 In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
2170 point.
2171
2172 This is useful when a fixed-length buffer is being tested for being well-formed
2173 UTF-8, but the final few bytes in it don't comprise a full character; that is,
2174 it is split somewhere in the middle of the final code point's UTF-8
2175 representation.  (Presumably when the buffer is refreshed with the next chunk
2176 of data, the new first bytes will complete the partial code point.)   This
2177 function is used to verify that the final bytes in the current buffer are in
2178 fact the legal beginning of some code point, so that if they aren't, the
2179 failure can be signalled without having to wait for the next read.
2180
2181 =cut
2182 */
2183 #define is_utf8_valid_partial_char(s, e)                                    \
2184                                 is_utf8_valid_partial_char_flags(s, e, 0)
2185
2186 /*
2187
2188 =for apidoc is_utf8_valid_partial_char_flags
2189
2190 Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
2191 or not the input is a valid UTF-8 encoded partial character, but it takes an
2192 extra parameter, C<flags>, which can further restrict which code points are
2193 considered valid.
2194
2195 If C<flags> is 0, this behaves identically to
2196 C<L</is_utf8_valid_partial_char>>.  Otherwise C<flags> can be any combination
2197 of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>.  If
2198 there is any sequence of bytes that can complete the input partial character in
2199 such a way that a non-prohibited character is formed, the function returns
2200 TRUE; otherwise FALSE.  Non character code points cannot be determined based on
2201 partial character input.  But many  of the other possible excluded types can be
2202 determined from just the first one or two bytes.
2203
2204 =cut
2205  */
2206
2207 PERL_STATIC_INLINE bool
2208 Perl_is_utf8_valid_partial_char_flags(const U8 * const s0, const U8 * const e, const U32 flags)
2209 {
2210     PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
2211     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
2212                           |UTF8_DISALLOW_PERL_EXTENDED)));
2213
2214     PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
2215                           DFA_RETURN_FAILURE_,
2216                           DFA_TEASE_APART_FF_,
2217                           NOOP);
2218
2219     /* The NOOP above causes the DFA to drop down here iff the input was a
2220      * partial character.  flags=0 => can return TRUE immediately; otherwise we
2221      * need to check (not inline) if the partial character is the beginning of
2222      * a disallowed one */
2223     if (flags == 0) {
2224         return TRUE;
2225     }
2226
2227     return cBOOL(is_utf8_char_helper_(s0, e, flags));
2228
2229 #ifdef HAS_EXTRA_LONG_UTF8
2230
2231   tease_apart_FF:
2232
2233     /* Getting here means the input is either malformed, or, in the case of
2234      * PL_extended_utf8_dfa_tab, was for the largest possible start byte.  The
2235      * latter case has to be extended UTF-8, so can fail immediately if that is
2236      * forbidden */
2237
2238     if (   *s0 != I8_TO_NATIVE_UTF8(0xFF)
2239         || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
2240     {
2241         return 0;
2242     }
2243
2244     return is_utf8_FF_helper_(s0, e,
2245                               TRUE /* Require to be a partial character */
2246                              );
2247 #endif
2248
2249 }
2250
2251 /*
2252
2253 =for apidoc is_utf8_fixed_width_buf_flags
2254
2255 Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
2256 is entirely valid UTF-8, subject to the restrictions given by C<flags>;
2257 otherwise it returns FALSE.
2258
2259 If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
2260 without restriction.  If the final few bytes of the buffer do not form a
2261 complete code point, this will return TRUE anyway, provided that
2262 C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
2263
2264 If C<flags> in non-zero, it can be any combination of the
2265 C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
2266 same meanings.
2267
2268 This function differs from C<L</is_utf8_string_flags>> only in that the latter
2269 returns FALSE if the final few bytes of the string don't form a complete code
2270 point.
2271
2272 =cut
2273  */
2274 #define is_utf8_fixed_width_buf_flags(s, len, flags)                        \
2275                 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
2276
2277 /*
2278
2279 =for apidoc is_utf8_fixed_width_buf_loc_flags
2280
2281 Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
2282 failure in the C<ep> pointer.  If the function returns TRUE, C<*ep> will point
2283 to the beginning of any partial character at the end of the buffer; if there is
2284 no partial character C<*ep> will contain C<s>+C<len>.
2285
2286 See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
2287
2288 =cut
2289 */
2290
2291 #define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags)               \
2292                 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
2293
2294 /*
2295
2296 =for apidoc is_utf8_fixed_width_buf_loclen_flags
2297
2298 Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
2299 complete, valid characters found in the C<el> pointer.
2300
2301 =cut
2302 */
2303
2304 PERL_STATIC_INLINE bool
2305 Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
2306                                        STRLEN len,
2307                                        const U8 **ep,
2308                                        STRLEN *el,
2309                                        const U32 flags)
2310 {
2311     const U8 * maybe_partial;
2312
2313     PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
2314
2315     if (! ep) {
2316         ep  = &maybe_partial;
2317     }
2318
2319     /* If it's entirely valid, return that; otherwise see if the only error is
2320      * that the final few bytes are for a partial character */
2321     return    is_utf8_string_loclen_flags(s, len, ep, el, flags)
2322            || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
2323 }
2324
2325 PERL_STATIC_INLINE UV
2326 Perl_utf8n_to_uvchr_msgs(const U8 *s,
2327                       STRLEN curlen,
2328                       STRLEN *retlen,
2329                       const U32 flags,
2330                       U32 * errors,
2331                       AV ** msgs)
2332 {
2333     /* This is the inlined portion of utf8n_to_uvchr_msgs.  It handles the
2334      * simple cases, and, if necessary calls a helper function to deal with the
2335      * more complex ones.  Almost all well-formed non-problematic code points
2336      * are considered simple, so that it's unlikely that the helper function
2337      * will need to be called.
2338      *
2339      * This is an adaptation of the tables and algorithm given in
2340      * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
2341      * comprehensive documentation of the original version.  A copyright notice
2342      * for the original version is given at the beginning of this file.  The
2343      * Perl adapation is documented at the definition of PL_strict_utf8_dfa_tab[].
2344      */
2345
2346     const U8 * const s0 = s;
2347     const U8 * send = s0 + curlen;
2348     UV type;
2349     UV uv;
2350
2351     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
2352
2353     /* This dfa is fast.  If it accepts the input, it was for a well-formed,
2354      * non-problematic code point, which can be returned immediately.
2355      * Otherwise we call a helper function to figure out the more complicated
2356      * cases. */
2357
2358     /* No calls from core pass in an empty string; non-core need a check */
2359 #ifdef PERL_CORE
2360     assert(curlen > 0);
2361 #else
2362     if (curlen == 0) return _utf8n_to_uvchr_msgs_helper(s0, 0, retlen,
2363                                                         flags, errors, msgs);
2364 #endif
2365
2366     type = PL_strict_utf8_dfa_tab[*s];
2367
2368     /* The table is structured so that 'type' is 0 iff the input byte is
2369      * represented identically regardless of the UTF-8ness of the string */
2370     if (type == 0) {   /* UTF-8 invariants are returned unchanged */
2371         uv = *s;
2372     }
2373     else {
2374         UV state = PL_strict_utf8_dfa_tab[256 + type];
2375         uv = (0xff >> type) & NATIVE_UTF8_TO_I8(*s);
2376
2377         while (++s < send) {
2378             type  = PL_strict_utf8_dfa_tab[*s];
2379             state = PL_strict_utf8_dfa_tab[256 + state + type];
2380
2381             uv = UTF8_ACCUMULATE(uv, *s);
2382
2383             if (state == 0) {
2384 #ifdef EBCDIC
2385                 uv = UNI_TO_NATIVE(uv);
2386 #endif
2387                 goto success;
2388             }
2389
2390             if (UNLIKELY(state == 1)) {
2391                 break;
2392             }
2393         }
2394
2395         /* Here is potentially problematic.  Use the full mechanism */
2396         return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags,
2397                                            errors, msgs);
2398     }
2399
2400   success:
2401     if (retlen) {
2402         *retlen = s - s0 + 1;
2403     }
2404     if (errors) {
2405         *errors = 0;
2406     }
2407     if (msgs) {
2408         *msgs = NULL;
2409     }
2410
2411     return uv;
2412 }
2413
2414 PERL_STATIC_INLINE UV
2415 Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
2416 {
2417     PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER;
2418
2419     assert(s < send);
2420
2421     if (! ckWARN_d(WARN_UTF8)) {
2422
2423         /* EMPTY is not really allowed, and asserts on debugging builds.  But
2424          * on non-debugging we have to deal with it, and this causes it to
2425          * return the REPLACEMENT CHARACTER, as the documentation indicates */
2426         return utf8n_to_uvchr(s, send - s, retlen,
2427                               (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY));
2428     }
2429     else {
2430         UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
2431         if (retlen && ret == 0 && (send <= s || *s != '\0')) {
2432             *retlen = (STRLEN) -1;
2433         }
2434
2435         return ret;
2436     }
2437 }
2438
2439 /* ------------------------------- perl.h ----------------------------- */
2440
2441 /*
2442 =for apidoc_section $utility
2443
2444 =for apidoc is_safe_syscall
2445
2446 Test that the given C<pv> (with length C<len>) doesn't contain any internal
2447 C<NUL> characters.
2448 If it does, set C<errno> to C<ENOENT>, optionally warn using the C<syscalls>
2449 category, and return FALSE.
2450
2451 Return TRUE if the name is safe.
2452
2453 C<what> and C<op_name> are used in any warning.
2454
2455 Used by the C<IS_SAFE_SYSCALL()> macro.
2456
2457 =cut
2458 */
2459
2460 PERL_STATIC_INLINE bool
2461 Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name)
2462 {
2463     /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
2464      * perl itself uses xce*() functions which accept 8-bit strings.
2465      */
2466
2467     PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
2468
2469     if (len > 1) {
2470         char *null_at;
2471         if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
2472                 SETERRNO(ENOENT, LIB_INVARG);
2473                 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
2474                                    "Invalid \\0 character in %s for %s: %s\\0%s",
2475                                    what, op_name, pv, null_at+1);
2476                 return FALSE;
2477         }
2478     }
2479
2480     return TRUE;
2481 }
2482
2483 /*
2484
2485 Return true if the supplied filename has a newline character
2486 immediately before the first (hopefully only) NUL.
2487
2488 My original look at this incorrectly used the len from SvPV(), but
2489 that's incorrect, since we allow for a NUL in pv[len-1].
2490
2491 So instead, strlen() and work from there.
2492
2493 This allow for the user reading a filename, forgetting to chomp it,
2494 then calling:
2495
2496   open my $foo, "$file\0";
2497
2498 */
2499
2500 #ifdef PERL_CORE
2501
2502 PERL_STATIC_INLINE bool
2503 S_should_warn_nl(const char *pv)
2504 {
2505     STRLEN len;
2506
2507     PERL_ARGS_ASSERT_SHOULD_WARN_NL;
2508
2509     len = strlen(pv);
2510
2511     return len > 0 && pv[len-1] == '\n';
2512 }
2513
2514 #endif
2515
2516 #if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C)
2517
2518 PERL_STATIC_INLINE bool
2519 S_lossless_NV_to_IV(const NV nv, IV *ivp)
2520 {
2521     /* This function determines if the input NV 'nv' may be converted without
2522      * loss of data to an IV.  If not, it returns FALSE taking no other action.
2523      * But if it is possible, it does the conversion, returning TRUE, and
2524      * storing the converted result in '*ivp' */
2525
2526     PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV;
2527
2528 #  if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan)
2529     /* Normally any comparison with a NaN returns false; if we can't rely
2530      * on that behaviour, check explicitly */
2531     if (UNLIKELY(Perl_isnan(nv))) {
2532         return FALSE;
2533     }
2534 #  endif
2535
2536     /* Written this way so that with an always-false NaN comparison we
2537      * return false */
2538     if (!(LIKELY(nv >= (NV) IV_MIN) && LIKELY(nv < IV_MAX_P1))) {
2539         return FALSE;
2540     }
2541
2542     if ((IV) nv != nv) {
2543         return FALSE;
2544     }
2545
2546     *ivp = (IV) nv;
2547     return TRUE;
2548 }
2549
2550 #endif
2551
2552 /* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
2553
2554 #if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C)
2555
2556 #define MAX_CHARSET_NAME_LENGTH 2
2557
2558 PERL_STATIC_INLINE const char *
2559 S_get_regex_charset_name(const U32 flags, STRLEN* const lenp)
2560 {
2561     PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME;
2562
2563     /* Returns a string that corresponds to the name of the regex character set
2564      * given by 'flags', and *lenp is set the length of that string, which
2565      * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
2566
2567     *lenp = 1;
2568     switch (get_regex_charset(flags)) {
2569         case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
2570         case REGEX_LOCALE_CHARSET:  return LOCALE_PAT_MODS;
2571         case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
2572         case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
2573         case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
2574             *lenp = 2;
2575             return ASCII_MORE_RESTRICT_PAT_MODS;
2576     }
2577     /* The NOT_REACHED; hides an assert() which has a rather complex
2578      * definition in perl.h. */
2579     NOT_REACHED; /* NOTREACHED */
2580     return "?";     /* Unknown */
2581 }
2582
2583 #endif
2584
2585 /*
2586
2587 Return false if any get magic is on the SV other than taint magic.
2588
2589 */
2590
2591 PERL_STATIC_INLINE bool
2592 Perl_sv_only_taint_gmagic(SV *sv)
2593 {
2594     MAGIC *mg = SvMAGIC(sv);
2595
2596     PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
2597
2598     while (mg) {
2599         if (mg->mg_type != PERL_MAGIC_taint
2600             && !(mg->mg_flags & MGf_GSKIP)
2601             && mg->mg_virtual->svt_get) {
2602             return FALSE;
2603         }
2604         mg = mg->mg_moremagic;
2605     }
2606
2607     return TRUE;
2608 }
2609
2610 /* ------------------ cop.h ------------------------------------------- */
2611
2612 /* implement GIMME_V() macro */
2613
2614 PERL_STATIC_INLINE U8
2615 Perl_gimme_V(pTHX)
2616 {
2617     I32 cxix;
2618     U8  gimme = (PL_op->op_flags & OPf_WANT);
2619
2620     if (gimme)
2621         return gimme;
2622     cxix = PL_curstackinfo->si_cxsubix;
2623     if (cxix < 0)
2624         return PL_curstackinfo->si_type == PERLSI_SORT ? G_SCALAR: G_VOID;
2625     assert(cxstack[cxix].blk_gimme & G_WANT);
2626     return (cxstack[cxix].blk_gimme & G_WANT);
2627 }
2628
2629
2630 /* Enter a block. Push a new base context and return its address. */
2631
2632 PERL_STATIC_INLINE PERL_CONTEXT *
2633 Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
2634 {
2635     PERL_CONTEXT * cx;
2636
2637     PERL_ARGS_ASSERT_CX_PUSHBLOCK;
2638
2639     CXINC;
2640     cx = CX_CUR();
2641     cx->cx_type        = type;
2642     cx->blk_gimme      = gimme;
2643     cx->blk_oldsaveix  = saveix;
2644     cx->blk_oldsp      = (I32)(sp - PL_stack_base);
2645     cx->blk_oldcop     = PL_curcop;
2646     cx->blk_oldmarksp  = (I32)(PL_markstack_ptr - PL_markstack);
2647     cx->blk_oldscopesp = PL_scopestack_ix;
2648     cx->blk_oldpm      = PL_curpm;
2649     cx->blk_old_tmpsfloor = PL_tmps_floor;
2650
2651     PL_tmps_floor        = PL_tmps_ix;
2652     CX_DEBUG(cx, "PUSH");
2653     return cx;
2654 }
2655
2656
2657 /* Exit a block (RETURN and LAST). */
2658
2659 PERL_STATIC_INLINE void
2660 Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx)
2661 {
2662     PERL_ARGS_ASSERT_CX_POPBLOCK;
2663
2664     CX_DEBUG(cx, "POP");
2665     /* these 3 are common to cx_popblock and cx_topblock */
2666     PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2667     PL_scopestack_ix = cx->blk_oldscopesp;
2668     PL_curpm         = cx->blk_oldpm;
2669
2670     /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
2671      * and leaves a CX entry lying around for repeated use, so
2672      * skip for multicall */                  \
2673     assert(   (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
2674             || PL_savestack_ix == cx->blk_oldsaveix);
2675     PL_curcop     = cx->blk_oldcop;
2676     PL_tmps_floor = cx->blk_old_tmpsfloor;
2677 }
2678
2679 /* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
2680  * Whereas cx_popblock() restores the state to the point just before
2681  * cx_pushblock() was called,  cx_topblock() restores it to the point just
2682  * *after* cx_pushblock() was called. */
2683
2684 PERL_STATIC_INLINE void
2685 Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx)
2686 {
2687     PERL_ARGS_ASSERT_CX_TOPBLOCK;
2688
2689     CX_DEBUG(cx, "TOP");
2690     /* these 3 are common to cx_popblock and cx_topblock */
2691     PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2692     PL_scopestack_ix = cx->blk_oldscopesp;
2693     PL_curpm         = cx->blk_oldpm;
2694
2695     PL_stack_sp      = PL_stack_base + cx->blk_oldsp;
2696 }
2697
2698
2699 PERL_STATIC_INLINE void
2700 Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
2701 {
2702     U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
2703
2704     PERL_ARGS_ASSERT_CX_PUSHSUB;
2705
2706     PERL_DTRACE_PROBE_ENTRY(cv);
2707     cx->blk_sub.old_cxsubix     = PL_curstackinfo->si_cxsubix;
2708     PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
2709     cx->blk_sub.cv = cv;
2710     cx->blk_sub.olddepth = CvDEPTH(cv);
2711     cx->blk_sub.prevcomppad = PL_comppad;
2712     cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
2713     cx->blk_sub.retop = retop;
2714     SvREFCNT_inc_simple_void_NN(cv);
2715     cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
2716 }
2717
2718
2719 /* subsets of cx_popsub() */
2720
2721 PERL_STATIC_INLINE void
2722 Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
2723 {
2724     CV *cv;
2725
2726     PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
2727     assert(CxTYPE(cx) == CXt_SUB);
2728
2729     PL_comppad = cx->blk_sub.prevcomppad;
2730     PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2731     cv = cx->blk_sub.cv;
2732     CvDEPTH(cv) = cx->blk_sub.olddepth;
2733     cx->blk_sub.cv = NULL;
2734     SvREFCNT_dec(cv);
2735     PL_curstackinfo->si_cxsubix = cx->blk_sub.old_cxsubix;
2736 }
2737
2738
2739 /* handle the @_ part of leaving a sub */
2740
2741 PERL_STATIC_INLINE void
2742 Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
2743 {
2744     AV *av;
2745
2746     PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
2747     assert(CxTYPE(cx) == CXt_SUB);
2748     assert(AvARRAY(MUTABLE_AV(
2749         PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
2750                 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
2751
2752     CX_POP_SAVEARRAY(cx);
2753     av = MUTABLE_AV(PAD_SVl(0));
2754     if (UNLIKELY(AvREAL(av)))
2755         /* abandon @_ if it got reified */
2756         clear_defarray(av, 0);
2757     else {
2758         CLEAR_ARGARRAY(av);
2759     }
2760 }
2761
2762
2763 PERL_STATIC_INLINE void
2764 Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx)
2765 {
2766     PERL_ARGS_ASSERT_CX_POPSUB;
2767     assert(CxTYPE(cx) == CXt_SUB);
2768
2769     PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
2770
2771     if (CxHASARGS(cx))
2772         cx_popsub_args(cx);
2773     cx_popsub_common(cx);
2774 }
2775
2776
2777 PERL_STATIC_INLINE void
2778 Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
2779 {
2780     PERL_ARGS_ASSERT_CX_PUSHFORMAT;
2781
2782     cx->blk_format.old_cxsubix = PL_curstackinfo->si_cxsubix;
2783     PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
2784     cx->blk_format.cv          = cv;
2785     cx->blk_format.retop       = retop;
2786     cx->blk_format.gv          = gv;
2787     cx->blk_format.dfoutgv     = PL_defoutgv;
2788     cx->blk_format.prevcomppad = PL_comppad;
2789     cx->blk_u16                = 0;
2790
2791     SvREFCNT_inc_simple_void_NN(cv);
2792     CvDEPTH(cv)++;
2793     SvREFCNT_inc_void(cx->blk_format.dfoutgv);
2794 }
2795
2796
2797 PERL_STATIC_INLINE void
2798 Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx)
2799 {
2800     CV *cv;
2801     GV *dfout;
2802
2803     PERL_ARGS_ASSERT_CX_POPFORMAT;
2804     assert(CxTYPE(cx) == CXt_FORMAT);
2805
2806     dfout = cx->blk_format.dfoutgv;
2807     setdefout(dfout);
2808     cx->blk_format.dfoutgv = NULL;
2809     SvREFCNT_dec_NN(dfout);
2810
2811     PL_comppad = cx->blk_format.prevcomppad;
2812     PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2813     cv = cx->blk_format.cv;
2814     cx->blk_format.cv = NULL;
2815     --CvDEPTH(cv);
2816     SvREFCNT_dec_NN(cv);
2817     PL_curstackinfo->si_cxsubix = cx->blk_format.old_cxsubix;
2818 }
2819
2820
2821 PERL_STATIC_INLINE void
2822 Perl_push_evalortry_common(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
2823 {
2824     cx->blk_eval.retop         = retop;
2825     cx->blk_eval.old_namesv    = namesv;
2826     cx->blk_eval.old_eval_root = PL_eval_root;
2827     cx->blk_eval.cur_text      = PL_parser ? PL_parser->linestr : NULL;
2828     cx->blk_eval.cv            = NULL; /* later set by doeval_compile() */
2829     cx->blk_eval.cur_top_env   = PL_top_env;
2830
2831     assert(!(PL_in_eval     & ~ 0x3F));
2832     assert(!(PL_op->op_type & ~0x1FF));
2833     cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
2834 }
2835
2836 PERL_STATIC_INLINE void
2837 Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
2838 {
2839     PERL_ARGS_ASSERT_CX_PUSHEVAL;
2840
2841     Perl_push_evalortry_common(aTHX_ cx, retop, namesv);
2842
2843     cx->blk_eval.old_cxsubix    = PL_curstackinfo->si_cxsubix;
2844     PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
2845 }
2846
2847 PERL_STATIC_INLINE void
2848 Perl_cx_pushtry(pTHX_ PERL_CONTEXT *cx, OP *retop)
2849 {
2850     PERL_ARGS_ASSERT_CX_PUSHTRY;
2851
2852     Perl_push_evalortry_common(aTHX_ cx, retop, NULL);
2853
2854     /* Don't actually change it, just store the current value so it's restored
2855      * by the common popeval */
2856     cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
2857 }
2858
2859
2860 PERL_STATIC_INLINE void
2861 Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx)
2862 {
2863     SV *sv;
2864
2865     PERL_ARGS_ASSERT_CX_POPEVAL;
2866     assert(CxTYPE(cx) == CXt_EVAL);
2867
2868     PL_in_eval = CxOLD_IN_EVAL(cx);
2869     assert(!(PL_in_eval & 0xc0));
2870     PL_eval_root = cx->blk_eval.old_eval_root;
2871     sv = cx->blk_eval.cur_text;
2872     if (sv && CxEVAL_TXT_REFCNTED(cx)) {
2873         cx->blk_eval.cur_text = NULL;
2874         SvREFCNT_dec_NN(sv);
2875     }
2876
2877     sv = cx->blk_eval.old_namesv;
2878     if (sv) {
2879         cx->blk_eval.old_namesv = NULL;
2880         SvREFCNT_dec_NN(sv);
2881     }
2882     PL_curstackinfo->si_cxsubix = cx->blk_eval.old_cxsubix;
2883 }
2884
2885
2886 /* push a plain loop, i.e.
2887  *     { block }
2888  *     while (cond) { block }
2889  *     for (init;cond;continue) { block }
2890  * This loop can be last/redo'ed etc.
2891  */
2892
2893 PERL_STATIC_INLINE void
2894 Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
2895 {
2896     PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
2897     cx->blk_loop.my_op = cLOOP;
2898 }
2899
2900
2901 /* push a true for loop, i.e.
2902  *     for var (list) { block }
2903  */
2904
2905 PERL_STATIC_INLINE void
2906 Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
2907 {
2908     PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
2909
2910     /* this one line is common with cx_pushloop_plain */
2911     cx->blk_loop.my_op = cLOOP;
2912
2913     cx->blk_loop.itervar_u.svp = (SV**)itervarp;
2914     cx->blk_loop.itersave      = itersave;
2915 #ifdef USE_ITHREADS
2916     cx->blk_loop.oldcomppad = PL_comppad;
2917 #endif
2918 }
2919
2920
2921 /* pop all loop types, including plain */
2922
2923 PERL_STATIC_INLINE void
2924 Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx)
2925 {
2926     PERL_ARGS_ASSERT_CX_POPLOOP;
2927
2928     assert(CxTYPE_is_LOOP(cx));
2929     if (  CxTYPE(cx) == CXt_LOOP_ARY
2930        || CxTYPE(cx) == CXt_LOOP_LAZYSV)
2931     {
2932         /* Free ary or cur. This assumes that state_u.ary.ary
2933          * aligns with state_u.lazysv.cur. See cx_dup() */
2934         SV *sv = cx->blk_loop.state_u.lazysv.cur;
2935         cx->blk_loop.state_u.lazysv.cur = NULL;
2936         SvREFCNT_dec_NN(sv);
2937         if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
2938             sv = cx->blk_loop.state_u.lazysv.end;
2939             cx->blk_loop.state_u.lazysv.end = NULL;
2940             SvREFCNT_dec_NN(sv);
2941         }
2942     }
2943     if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
2944         SV *cursv;
2945         SV **svp = (cx)->blk_loop.itervar_u.svp;
2946         if ((cx->cx_type & CXp_FOR_GV))
2947             svp = &GvSV((GV*)svp);
2948         cursv = *svp;
2949         *svp = cx->blk_loop.itersave;
2950         cx->blk_loop.itersave = NULL;
2951         SvREFCNT_dec(cursv);
2952     }
2953 }
2954
2955
2956 PERL_STATIC_INLINE void
2957 Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
2958 {
2959     PERL_ARGS_ASSERT_CX_PUSHWHEN;
2960
2961     cx->blk_givwhen.leave_op = cLOGOP->op_other;
2962 }
2963
2964
2965 PERL_STATIC_INLINE void
2966 Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
2967 {
2968     PERL_ARGS_ASSERT_CX_POPWHEN;
2969     assert(CxTYPE(cx) == CXt_WHEN);
2970
2971     PERL_UNUSED_ARG(cx);
2972     PERL_UNUSED_CONTEXT;
2973     /* currently NOOP */
2974 }
2975
2976
2977 PERL_STATIC_INLINE void
2978 Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
2979 {
2980     PERL_ARGS_ASSERT_CX_PUSHGIVEN;
2981
2982     cx->blk_givwhen.leave_op = cLOGOP->op_other;
2983     cx->blk_givwhen.defsv_save = orig_defsv;
2984 }
2985
2986
2987 PERL_STATIC_INLINE void
2988 Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
2989 {
2990     SV *sv;
2991
2992     PERL_ARGS_ASSERT_CX_POPGIVEN;
2993     assert(CxTYPE(cx) == CXt_GIVEN);
2994
2995     sv = GvSV(PL_defgv);
2996     GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
2997     cx->blk_givwhen.defsv_save = NULL;
2998     SvREFCNT_dec(sv);
2999 }
3000
3001 /* ------------------ util.h ------------------------------------------- */
3002
3003 /*
3004 =for apidoc_section $string
3005
3006 =for apidoc foldEQ
3007
3008 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
3009 same
3010 case-insensitively; false otherwise.  Uppercase and lowercase ASCII range bytes
3011 match themselves and their opposite case counterparts.  Non-cased and non-ASCII
3012 range bytes match only themselves.
3013
3014 =cut
3015 */
3016
3017 PERL_STATIC_INLINE I32
3018 Perl_foldEQ(const char *s1, const char *s2, I32 len)
3019 {
3020     const U8 *a = (const U8 *)s1;
3021     const U8 *b = (const U8 *)s2;
3022
3023     PERL_ARGS_ASSERT_FOLDEQ;
3024
3025     assert(len >= 0);
3026
3027     while (len--) {
3028         if (*a != *b && *a != PL_fold[*b])
3029             return 0;
3030         a++,b++;
3031     }
3032     return 1;
3033 }
3034
3035 PERL_STATIC_INLINE I32
3036 Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len)
3037 {
3038     /* Compare non-UTF-8 using Unicode (Latin1) semantics.  Works on all folds
3039      * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and
3040      * does not check for this.  Nor does it check that the strings each have
3041      * at least 'len' characters. */
3042
3043     const U8 *a = (const U8 *)s1;
3044     const U8 *b = (const U8 *)s2;
3045
3046     PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
3047
3048     assert(len >= 0);
3049
3050     while (len--) {
3051         if (*a != *b && *a != PL_fold_latin1[*b]) {
3052             return 0;
3053         }
3054         a++, b++;
3055     }
3056     return 1;
3057 }
3058
3059 /*
3060 =for apidoc_section $locale
3061 =for apidoc foldEQ_locale
3062
3063 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
3064 same case-insensitively in the current locale; false otherwise.
3065
3066 =cut
3067 */
3068
3069 PERL_STATIC_INLINE I32
3070 Perl_foldEQ_locale(const char *s1, const char *s2, I32 len)
3071 {
3072     const U8 *a = (const U8 *)s1;
3073     const U8 *b = (const U8 *)s2;
3074
3075     PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
3076
3077     assert(len >= 0);
3078
3079     while (len--) {
3080         if (*a != *b && *a != PL_fold_locale[*b])
3081             return 0;
3082         a++,b++;
3083     }
3084     return 1;
3085 }
3086
3087 /*
3088 =for apidoc_section $string
3089 =for apidoc my_strnlen
3090
3091 The C library C<strnlen> if available, or a Perl implementation of it.
3092
3093 C<my_strnlen()> computes the length of the string, up to C<maxlen>
3094 characters.  It will never attempt to address more than C<maxlen>
3095 characters, making it suitable for use with strings that are not
3096 guaranteed to be NUL-terminated.
3097
3098 =cut
3099
3100 Description stolen from http://man.openbsd.org/strnlen.3,
3101 implementation stolen from PostgreSQL.
3102 */
3103 #ifndef HAS_STRNLEN
3104
3105 PERL_STATIC_INLINE Size_t
3106 Perl_my_strnlen(const char *str, Size_t maxlen)
3107 {
3108     const char *end = (char *) memchr(str, '\0', maxlen);
3109
3110     PERL_ARGS_ASSERT_MY_STRNLEN;
3111
3112     if (end == NULL) return maxlen;
3113     return end - str;
3114 }
3115
3116 #endif
3117
3118 #if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
3119
3120 PERL_STATIC_INLINE void *
3121 S_my_memrchr(const char * s, const char c, const STRLEN len)
3122 {
3123     /* memrchr(), since many platforms lack it */
3124
3125     const char * t = s + len - 1;
3126
3127     PERL_ARGS_ASSERT_MY_MEMRCHR;
3128
3129     while (t >= s) {
3130         if (*t == c) {
3131             return (void *) t;
3132         }
3133         t--;
3134     }
3135
3136     return NULL;
3137 }
3138
3139 #endif
3140
3141 PERL_STATIC_INLINE char *
3142 Perl_mortal_getenv(const char * str)
3143 {
3144     /* This implements a (mostly) thread-safe, sequential-call-safe getenv().
3145      *
3146      * It's (mostly) thread-safe because it uses a mutex to prevent other
3147      * threads (that look at this mutex) from destroying the result before this
3148      * routine has a chance to copy the result to a place that won't be
3149      * destroyed before the caller gets a chance to handle it.  That place is a
3150      * mortal SV.  khw chose this over SAVEFREEPV because he is under the
3151      * impression that the SV will hang around longer under more circumstances
3152      *
3153      * The reason it isn't completely thread-safe is that other code could
3154      * simply not pay attention to the mutex.  All of the Perl core uses the
3155      * mutex, but it is possible for code from, say XS, to not use this mutex,
3156      * defeating the safety.
3157      *
3158      * getenv() returns, in some implementations, a pointer to a spot in the
3159      * **environ array, which could be invalidated at any time by this or
3160      * another thread changing the environment.  Other implementations copy the
3161      * **environ value to a static buffer, returning a pointer to that.  That
3162      * buffer might or might not be invalidated by a getenv() call in another
3163      * thread.  If it does get zapped, we need an exclusive lock.  Otherwise,
3164      * many getenv() calls can safely be running simultaneously, so a
3165      * many-reader (but no simultaneous writers) lock is ok.  There is a
3166      * Configure probe to see if another thread destroys the buffer, and the
3167      * mutex is defined accordingly.
3168      *
3169      * But in all cases, using the mutex prevents these problems, as long as
3170      * all code uses the same mutex.
3171      *
3172      * A complication is that this can be called during phases where the
3173      * mortalization process isn't available.  These are in interpreter
3174      * destruction or early in construction.  khw believes that at these times
3175      * there shouldn't be anything else going on, so plain getenv is safe AS
3176      * LONG AS the caller acts on the return before calling it again. */
3177
3178     char * ret;
3179     dTHX;
3180
3181     PERL_ARGS_ASSERT_MORTAL_GETENV;
3182
3183     /* Can't mortalize without stacks.  khw believes that no other threads
3184      * should be running, so no need to lock things, and this may be during a
3185      * phase when locking isn't even available */
3186     if (UNLIKELY(PL_scopestack_ix == 0)) {
3187         return getenv(str);
3188     }
3189
3190 #ifdef PERL_MEM_LOG
3191
3192     /* A major complication arises under PERL_MEM_LOG.  When that is active,
3193      * every memory allocation may result in logging, depending on the value of
3194      * ENV{PERL_MEM_LOG} at the moment.  That means, as we create the SV for
3195      * saving ENV{foo}'s value (but before saving it), the logging code will
3196      * call us recursively to find out what ENV{PERL_MEM_LOG} is.  Without some
3197      * care that could lead to: 1) infinite recursion; or 2) deadlock (trying to
3198      * lock a boolean mutex recursively); 3) destroying the getenv() static
3199      * buffer; or 4) destroying the temporary created by this for the copy
3200      * causes a log entry to be made which could cause a new temporary to be
3201      * created, which will need to be destroyed at some point, leading to an
3202      * infinite loop.
3203      *
3204      * The solution adopted here (after some gnashing of teeth) is to detect
3205      * the recursive calls and calls from the logger, and treat them specially.
3206      * Let's say we want to do getenv("foo").  We first find
3207      * getenv(PERL_MEM_LOG) and save it to a fixed-length per-interpreter
3208      * variable, so no temporary is required.  Then we do getenv(foo}, and in
3209      * the process of creating a temporary to save it, this function will be
3210      * called recursively to do a getenv(PERL_MEM_LOG).  On the recursed call,
3211      * we detect that it is such a call and return our saved value instead of
3212      * locking and doing a new getenv().  This solves all of problems 1), 2),
3213      * and 3).  Because all the getenv()s are done while the mutex is locked,
3214      * the state cannot have changed.  To solve 4), we don't create a temporary
3215      * when this is called from the logging code.  That code disposes of the
3216      * return value while the mutex is still locked.
3217      *
3218      * The value of getenv(PERL_MEM_LOG) can be anything, but only initial
3219      * digits and 3 particular letters are significant; the rest are ignored by
3220      * the memory logging code.  Thus the per-interpreter variable only needs
3221      * to be large enough to save the significant information, the size of
3222      * which is known at compile time.  The first byte is extra, reserved for
3223      * flags for our use.  To protect against overflowing, only the reserved
3224      * byte, as many digits as don't overflow, and the three letters are
3225      * stored.
3226      *
3227      * The reserved byte has two bits:
3228      *      0x1 if set indicates that if we get here, it is a recursive call of
3229      *          getenv()
3230      *      0x2 if set indicates that the call is from the logging code.
3231      *
3232      * If the flag indicates this is a recursive call, just return the stored
3233      * value of PL_mem_log;  An empty value gets turned into NULL. */
3234     if (strEQ(str, "PERL_MEM_LOG") && PL_mem_log[0] & 0x1) {
3235         if (PL_mem_log[1] == '\0') {
3236             return NULL;
3237         } else {
3238             return PL_mem_log + 1;
3239         }
3240     }
3241
3242 #endif
3243
3244     GETENV_LOCK;
3245
3246 #ifdef PERL_MEM_LOG
3247
3248     /* Here we are in a critical section.  As explained above, we do our own
3249      * getenv(PERL_MEM_LOG), saving the result safely. */
3250     ret = getenv("PERL_MEM_LOG");
3251     if (ret == NULL) {  /* No logging active */
3252
3253         /* Return that immediately if called from the logging code */
3254         if (PL_mem_log[0] & 0x2) {
3255             GETENV_UNLOCK;
3256             return NULL;
3257         }
3258
3259         PL_mem_log[1] = '\0';
3260     }
3261     else {
3262         char *mem_log_meat = PL_mem_log + 1;    /* first byte reserved */
3263
3264         /* There is nothing to prevent the value of PERL_MEM_LOG from being an
3265          * extremely long string.  But we want only a few characters from it.
3266          * PL_mem_log has been made large enough to hold just the ones we need.
3267          * First the file descriptor. */
3268         if (isDIGIT(*ret)) {
3269             const char * s = ret;
3270             if (UNLIKELY(*s == '0')) {
3271
3272                 /* Reduce multiple leading zeros to a single one.  This is to
3273                  * allow the caller to change what to do with leading zeros. */
3274                 *mem_log_meat++ = '0';
3275                 s++;
3276                 while (*s == '0') {
3277                     s++;
3278                 }
3279             }
3280
3281             /* If the input overflows, copy just enough for the result to also
3282              * overflow, plus 1 to make sure */
3283             while (isDIGIT(*s) && s < ret + TYPE_DIGITS(UV) + 1) {
3284                 *mem_log_meat++ = *s++;
3285             }
3286         }
3287
3288         /* Then each of the three significant characters */
3289         if (strchr(ret, 'm')) {
3290             *mem_log_meat++ = 'm';
3291         }
3292         if (strchr(ret, 's')) {
3293             *mem_log_meat++ = 's';
3294         }
3295         if (strchr(ret, 't')) {
3296             *mem_log_meat++ = 't';
3297         }
3298         *mem_log_meat = '\0';
3299
3300         assert(mem_log_meat < PL_mem_log + sizeof(PL_mem_log));
3301     }
3302
3303     /* If we are being called from the logger, it only needs the significant
3304      * portion of PERL_MEM_LOG, and doesn't need a safe copy */
3305     if (PL_mem_log[0] & 0x2) {
3306         assert(strEQ(str, "PERL_MEM_LOG"));
3307         GETENV_UNLOCK;
3308         return PL_mem_log + 1;
3309     }
3310
3311     /* Here is a generic getenv().  This could be a getenv("PERL_MEM_LOG") that
3312      * is coming from other than the logging code, so it should be treated the
3313      * same as any other getenv(), returning the full value, not just the
3314      * significant part, and having its value saved.  Set the flag that
3315      * indicates any call to this routine will be a recursion from here */
3316     PL_mem_log[0] = 0x1;
3317
3318 #endif
3319
3320     /* Now get the value of the real desired variable, and save a copy */
3321     ret = getenv(str);
3322
3323     if (ret != NULL) {
3324         ret = SvPVX( newSVpvn_flags(ret, strlen(ret) ,SVs_TEMP) );
3325     }
3326
3327     GETENV_UNLOCK;
3328
3329 #ifdef PERL_MEM_LOG
3330
3331     /* Clear the buffer */
3332     Zero(PL_mem_log, sizeof(PL_mem_log), char);
3333
3334 #endif
3335
3336     return ret;
3337 }
3338
3339 PERL_STATIC_INLINE bool
3340 Perl_sv_isbool(pTHX_ const SV *sv)
3341 {
3342     /* change to the following in 5.37, logically the same but
3343      * more efficient and more future proof */
3344 #if 0
3345     return (SvBoolFlagsOK(sv) && BOOL_INTERNALS_sv_isbool(sv));
3346 #else
3347     return SvIOK(sv) && SvPOK(sv) && SvIsCOW_static(sv) &&
3348         (SvPVX_const(sv) == PL_Yes || SvPVX_const(sv) == PL_No);
3349 #endif
3350
3351 }
3352
3353 #ifdef USE_ITHREADS
3354
3355 PERL_STATIC_INLINE AV *
3356 Perl_cop_file_avn(pTHX_ const COP *cop) {
3357
3358     PERL_ARGS_ASSERT_COP_FILE_AVN;
3359
3360     const char *file = CopFILE(cop);
3361     if (file) {
3362         GV *gv = gv_fetchfile_flags(file, strlen(file), GVF_NOADD);
3363         if (gv) {
3364             return GvAVn(gv);
3365         }
3366         else
3367             return NULL;
3368      }
3369      else
3370          return NULL;
3371 }
3372
3373 #endif
3374
3375 /*
3376  * ex: set ts=8 sts=4 sw=4 et:
3377  */