inline.h

   1 /*    inline.h
   2  *
   3  *    Copyright (C) 2012 by Larry Wall and others
   4  *
   5  *    You may distribute under the terms of either the GNU General Public
   6  *    License or the Artistic License, as specified in the README file.
   7  *
   8  *    This file contains tables and code adapted from
   9  *    https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this
  10  *    copyright notice:
  11
  12 Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
  13
  14 Permission is hereby granted, free of charge, to any person obtaining a copy of
  15 this software and associated documentation files (the "Software"), to deal in
  16 the Software without restriction, including without limitation the rights to
  17 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
  18 of the Software, and to permit persons to whom the Software is furnished to do
  19 so, subject to the following conditions:
  20
  21 The above copyright notice and this permission notice shall be included in all
  22 copies or substantial portions of the Software.
  23
  24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  25 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  26 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  27 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  28 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  29 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  30 SOFTWARE.
  31
  32  *
  33  * This file is a home for static inline functions that cannot go in other
  34  * header files, because they depend on proto.h (included after most other
  35  * headers) or struct definitions.
  36  *
  37  * Each section names the header file that the functions "belong" to.
  38  */
  39
  40 /* ------------------------------- av.h ------------------------------- */
  41
  42 PERL_STATIC_INLINE SSize_t
  43 Perl_av_top_index(pTHX_ AV *av)
  44 {
  45     PERL_ARGS_ASSERT_AV_TOP_INDEX;
  46     assert(SvTYPE(av) == SVt_PVAV);
  47
  48     return AvFILL(av);
  49 }
  50
  51 /* ------------------------------- cv.h ------------------------------- */
  52
  53 PERL_STATIC_INLINE GV *
  54 Perl_CvGV(pTHX_ CV *sv)
  55 {
  56     PERL_ARGS_ASSERT_CVGV;
  57
  58     return CvNAMED(sv)
  59         ? Perl_cvgv_from_hek(aTHX_ sv)
  60         : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
  61 }
  62
  63 PERL_STATIC_INLINE I32 *
  64 Perl_CvDEPTH(const CV * const sv)
  65 {
  66     PERL_ARGS_ASSERT_CVDEPTH;
  67     assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
  68
  69     return &((XPVCV*)SvANY(sv))->xcv_depth;
  70 }
  71
  72 /*
  73  CvPROTO returns the prototype as stored, which is not necessarily what
  74  the interpreter should be using. Specifically, the interpreter assumes
  75  that spaces have been stripped, which has been the case if the prototype
  76  was added by toke.c, but is generally not the case if it was added elsewhere.
  77  Since we can't enforce the spacelessness at assignment time, this routine
  78  provides a temporary copy at parse time with spaces removed.
  79  I<orig> is the start of the original buffer, I<len> is the length of the
  80  prototype and will be updated when this returns.
  81  */
  82
  83 #ifdef PERL_CORE
  84 PERL_STATIC_INLINE char *
  85 S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
  86 {
  87     SV * tmpsv;
  88     char * tmps;
  89     tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
  90     tmps = SvPVX(tmpsv);
  91     while ((*len)--) {
  92         if (!isSPACE(*orig))
  93             *tmps++ = *orig;
  94         orig++;
  95     }
  96     *tmps = '\0';
  97     *len = tmps - SvPVX(tmpsv);
  98                 return SvPVX(tmpsv);
  99 }
 100 #endif
 101
 102 /* ------------------------------- mg.h ------------------------------- */
 103
 104 #if defined(PERL_CORE) || defined(PERL_EXT)
 105 /* assumes get-magic and stringification have already occurred */
 106 PERL_STATIC_INLINE STRLEN
 107 S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
 108 {
 109     assert(mg->mg_type == PERL_MAGIC_regex_global);
 110     assert(mg->mg_len != -1);
 111     if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv))
 112         return (STRLEN)mg->mg_len;
 113     else {
 114         const STRLEN pos = (STRLEN)mg->mg_len;
 115         /* Without this check, we may read past the end of the buffer: */
 116         if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
 117         return sv_or_pv_pos_u2b(sv, s, pos, NULL);
 118     }
 119 }
 120 #endif
 121
 122 /* ------------------------------- pad.h ------------------------------ */
 123
 124 #if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
 125 PERL_STATIC_INLINE bool
 126 S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
 127 {
 128     PERL_ARGS_ASSERT_PADNAMEIN_SCOPE;
 129
 130     /* is seq within the range _LOW to _HIGH ?
 131      * This is complicated by the fact that PL_cop_seqmax
 132      * may have wrapped around at some point */
 133     if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
 134         return FALSE; /* not yet introduced */
 135
 136     if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
 137     /* in compiling scope */
 138         if (
 139             (seq >  COP_SEQ_RANGE_LOW(pn))
 140             ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
 141             : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
 142         )
 143             return TRUE;
 144     }
 145     else if (
 146         (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
 147         ?
 148             (  seq >  COP_SEQ_RANGE_LOW(pn)
 149             || seq <= COP_SEQ_RANGE_HIGH(pn))
 150
 151         :    (  seq >  COP_SEQ_RANGE_LOW(pn)
 152              && seq <= COP_SEQ_RANGE_HIGH(pn))
 153     )
 154         return TRUE;
 155     return FALSE;
 156 }
 157 #endif
 158
 159 /* ------------------------------- pp.h ------------------------------- */
 160
 161 PERL_STATIC_INLINE I32
 162 Perl_TOPMARK(pTHX)
 163 {
 164     DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
 165                                  "MARK top  %p %" IVdf "\n",
 166                                   PL_markstack_ptr,
 167                                   (IV)*PL_markstack_ptr)));
 168     return *PL_markstack_ptr;
 169 }
 170
 171 PERL_STATIC_INLINE I32
 172 Perl_POPMARK(pTHX)
 173 {
 174     DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
 175                                  "MARK pop  %p %" IVdf "\n",
 176                                   (PL_markstack_ptr-1),
 177                                   (IV)*(PL_markstack_ptr-1))));
 178     assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
 179     return *PL_markstack_ptr--;
 180 }
 181
 182 /* ----------------------------- regexp.h ----------------------------- */
 183
 184 PERL_STATIC_INLINE struct regexp *
 185 Perl_ReANY(const REGEXP * const re)
 186 {
 187     XPV* const p = (XPV*)SvANY(re);
 188
 189     PERL_ARGS_ASSERT_REANY;
 190     assert(isREGEXP(re));
 191
 192     return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
 193                                    : (struct regexp *)p;
 194 }
 195
 196 /* ------------------------------- sv.h ------------------------------- */
 197
 198 PERL_STATIC_INLINE bool
 199 Perl_SvTRUE(pTHX_ SV *sv) {
 200     if (!LIKELY(sv))
 201         return FALSE;
 202     SvGETMAGIC(sv);
 203     return SvTRUE_nomg_NN(sv);
 204 }
 205
 206 PERL_STATIC_INLINE SV *
 207 Perl_SvREFCNT_inc(SV *sv)
 208 {
 209     if (LIKELY(sv != NULL))
 210         SvREFCNT(sv)++;
 211     return sv;
 212 }
 213 PERL_STATIC_INLINE SV *
 214 Perl_SvREFCNT_inc_NN(SV *sv)
 215 {
 216     PERL_ARGS_ASSERT_SVREFCNT_INC_NN;
 217
 218     SvREFCNT(sv)++;
 219     return sv;
 220 }
 221 PERL_STATIC_INLINE void
 222 Perl_SvREFCNT_inc_void(SV *sv)
 223 {
 224     if (LIKELY(sv != NULL))
 225         SvREFCNT(sv)++;
 226 }
 227 PERL_STATIC_INLINE void
 228 Perl_SvREFCNT_dec(pTHX_ SV *sv)
 229 {
 230     if (LIKELY(sv != NULL)) {
 231         U32 rc = SvREFCNT(sv);
 232         if (LIKELY(rc > 1))
 233             SvREFCNT(sv) = rc - 1;
 234         else
 235             Perl_sv_free2(aTHX_ sv, rc);
 236     }
 237 }
 238
 239 PERL_STATIC_INLINE void
 240 Perl_SvREFCNT_dec_NN(pTHX_ SV *sv)
 241 {
 242     U32 rc = SvREFCNT(sv);
 243
 244     PERL_ARGS_ASSERT_SVREFCNT_DEC_NN;
 245
 246     if (LIKELY(rc > 1))
 247         SvREFCNT(sv) = rc - 1;
 248     else
 249         Perl_sv_free2(aTHX_ sv, rc);
 250 }
 251
 252 PERL_STATIC_INLINE void
 253 Perl_SvAMAGIC_on(SV *sv)
 254 {
 255     PERL_ARGS_ASSERT_SVAMAGIC_ON;
 256     assert(SvROK(sv));
 257
 258     if (SvOBJECT(SvRV(sv))) HvAMAGIC_on(SvSTASH(SvRV(sv)));
 259 }
 260 PERL_STATIC_INLINE void
 261 Perl_SvAMAGIC_off(SV *sv)
 262 {
 263     PERL_ARGS_ASSERT_SVAMAGIC_OFF;
 264
 265     if (SvROK(sv) && SvOBJECT(SvRV(sv)))
 266         HvAMAGIC_off(SvSTASH(SvRV(sv)));
 267 }
 268
 269 PERL_STATIC_INLINE U32
 270 Perl_SvPADSTALE_on(SV *sv)
 271 {
 272     assert(!(SvFLAGS(sv) & SVs_PADTMP));
 273     return SvFLAGS(sv) |= SVs_PADSTALE;
 274 }
 275 PERL_STATIC_INLINE U32
 276 Perl_SvPADSTALE_off(SV *sv)
 277 {
 278     assert(!(SvFLAGS(sv) & SVs_PADTMP));
 279     return SvFLAGS(sv) &= ~SVs_PADSTALE;
 280 }
 281 #if defined(PERL_CORE) || defined (PERL_EXT)
 282 PERL_STATIC_INLINE STRLEN
 283 S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp)
 284 {
 285     PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B;
 286     if (SvGAMAGIC(sv)) {
 287         U8 *hopped = utf8_hop((U8 *)pv, pos);
 288         if (lenp) *lenp = (STRLEN)(utf8_hop(hopped, *lenp) - hopped);
 289         return (STRLEN)(hopped - (U8 *)pv);
 290     }
 291     return sv_pos_u2b_flags(sv,pos,lenp,SV_CONST_RETURN);
 292 }
 293 #endif
 294
 295 /* ------------------------------- utf8.h ------------------------------- */
 296
 297 /*
 298 =head1 Unicode Support
 299 */
 300
 301 PERL_STATIC_INLINE void
 302 Perl_append_utf8_from_native_byte(const U8 byte, U8** dest)
 303 {
 304     /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
 305      * encoded string at '*dest', updating '*dest' to include it */
 306
 307     PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
 308
 309     if (NATIVE_BYTE_IS_INVARIANT(byte))
 310         *((*dest)++) = byte;
 311     else {
 312         *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
 313         *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
 314     }
 315 }
 316
 317 /*
 318 =for apidoc valid_utf8_to_uvchr
 319 Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is
 320 known that the next character in the input UTF-8 string C<s> is well-formed
 321 (I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>.  Surrogates, non-character code
 322 points, and non-Unicode code points are allowed.
 323
 324 =cut
 325
 326  */
 327
 328 PERL_STATIC_INLINE UV
 329 Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
 330 {
 331     const UV expectlen = UTF8SKIP(s);
 332     const U8* send = s + expectlen;
 333     UV uv = *s;
 334
 335     PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
 336
 337     if (retlen) {
 338         *retlen = expectlen;
 339     }
 340
 341     /* An invariant is trivially returned */
 342     if (expectlen == 1) {
 343         return uv;
 344     }
 345
 346     /* Remove the leading bits that indicate the number of bytes, leaving just
 347      * the bits that are part of the value */
 348     uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
 349
 350     /* Now, loop through the remaining bytes, accumulating each into the
 351      * working total as we go.  (I khw tried unrolling the loop for up to 4
 352      * bytes, but there was no performance improvement) */
 353     for (++s; s < send; s++) {
 354         uv = UTF8_ACCUMULATE(uv, *s);
 355     }
 356
 357     return UNI_TO_NATIVE(uv);
 358
 359 }
 360
 361 /*
 362 =for apidoc is_utf8_invariant_string
 363
 364 Returns TRUE if the first C<len> bytes of the string C<s> are the same
 365 regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
 366 EBCDIC machines); otherwise it returns FALSE.  That is, it returns TRUE if they
 367 are UTF-8 invariant.  On ASCII-ish machines, all the ASCII characters and only
 368 the ASCII characters fit this definition.  On EBCDIC machines, the ASCII-range
 369 characters are invariant, but so also are the C1 controls.
 370
 371 If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
 372 use this option, that C<s> can't have embedded C<NUL> characters and has to
 373 have a terminating C<NUL> byte).
 374
 375 See also
 376 C<L</is_utf8_string>>,
 377 C<L</is_utf8_string_flags>>,
 378 C<L</is_utf8_string_loc>>,
 379 C<L</is_utf8_string_loc_flags>>,
 380 C<L</is_utf8_string_loclen>>,
 381 C<L</is_utf8_string_loclen_flags>>,
 382 C<L</is_utf8_fixed_width_buf_flags>>,
 383 C<L</is_utf8_fixed_width_buf_loc_flags>>,
 384 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
 385 C<L</is_strict_utf8_string>>,
 386 C<L</is_strict_utf8_string_loc>>,
 387 C<L</is_strict_utf8_string_loclen>>,
 388 C<L</is_c9strict_utf8_string>>,
 389 C<L</is_c9strict_utf8_string_loc>>,
 390 and
 391 C<L</is_c9strict_utf8_string_loclen>>.
 392
 393 =cut
 394
 395 */
 396
 397 #define is_utf8_invariant_string(s, len)                                    \
 398                                 is_utf8_invariant_string_loc(s, len, NULL)
 399
 400 /*
 401 =for apidoc is_utf8_invariant_string_loc
 402
 403 Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
 404 the first UTF-8 variant character in the C<ep> pointer; if all characters are
 405 UTF-8 invariant, this function does not change the contents of C<*ep>.
 406
 407 =cut
 408
 409 */
 410
 411 PERL_STATIC_INLINE bool
 412 Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
 413 {
 414     const U8* send;
 415     const U8* x = s;
 416
 417     PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
 418
 419     if (len == 0) {
 420         len = strlen((const char *)s);
 421     }
 422
 423     send = s + len;
 424
 425 /* This looks like 0x010101... */
 426 #  define PERL_COUNT_MULTIPLIER   (~ (UINTMAX_C(0)) / 0xFF)
 427
 428 /* This looks like 0x808080... */
 429 #  define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
 430 #  define PERL_WORDSIZE            sizeof(PERL_UINTMAX_T)
 431 #  define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
 432
 433 /* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
 434  * or'ing together the lowest bits of 'x'.  Hopefully the final term gets
 435  * optimized out completely on a 32-bit system, and its mask gets optimized out
 436  * on a 64-bit system */
 437 #  define PERL_IS_SUBWORD_ADDR(x) (1 & (       PTR2nat(x)                     \
 438                                       |   (  PTR2nat(x) >> 1)                 \
 439                                       | ( ( (PTR2nat(x)                       \
 440                                            & PERL_WORD_BOUNDARY_MASK) >> 2))))
 441
 442 #ifndef EBCDIC
 443
 444     /* Do the word-at-a-time iff there is at least one usable full word.  That
 445      * means that after advancing to a word boundary, there still is at least a
 446      * full word left.  The number of bytes needed to advance is 'wordsize -
 447      * offset' unless offset is 0. */
 448     if ((STRLEN) (send - x) >= PERL_WORDSIZE
 449
 450                             /* This term is wordsize if subword; 0 if not */
 451                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
 452
 453                             /* 'offset' */
 454                           - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
 455     {
 456
 457         /* Process per-byte until reach word boundary.  XXX This loop could be
 458          * eliminated if we knew that this platform had fast unaligned reads */
 459         while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
 460             if (! UTF8_IS_INVARIANT(*x)) {
 461                 if (ep) {
 462                     *ep = x;
 463                 }
 464
 465                 return FALSE;
 466             }
 467             x++;
 468         }
 469
 470         /* Here, we know we have at least one full word to process.  Process
 471          * per-word as long as we have at least a full word left */
 472         do {
 473             if ((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK)  {
 474
 475                 /* Found a variant.  Just return if caller doesn't want its
 476                  * exact position */
 477                 if (! ep) {
 478                     return FALSE;
 479                 }
 480
 481 #  if   BYTEORDER == 0x1234 || BYTEORDER == 0x12345678    \
 482      || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
 483
 484                 *ep = x + variant_byte_number(* (PERL_UINTMAX_T *) x);
 485                 assert(*ep >= s && *ep < send);
 486
 487                 return FALSE;
 488
 489 #  else   /* If weird byte order, drop into next loop to do byte-at-a-time
 490            checks. */
 491
 492                 break;
 493 #  endif
 494             }
 495
 496             x += PERL_WORDSIZE;
 497
 498         } while (x + PERL_WORDSIZE <= send);
 499     }
 500
 501 #endif      /* End of ! EBCDIC */
 502
 503     /* Process per-byte */
 504     while (x < send) {
 505         if (! UTF8_IS_INVARIANT(*x)) {
 506             if (ep) {
 507                 *ep = x;
 508             }
 509
 510             return FALSE;
 511         }
 512
 513         x++;
 514     }
 515
 516     return TRUE;
 517 }
 518
 519 #ifndef EBCDIC
 520
 521 PERL_STATIC_INLINE unsigned int
 522 Perl_variant_byte_number(PERL_UINTMAX_T word)
 523 {
 524
 525     /* This returns the position in a word (0..7) of the first variant byte in
 526      * it.  This is a helper function.  Note that there are no branches */
 527
 528     assert(word);
 529
 530     /* Get just the msb bits of each byte */
 531     word &= PERL_VARIANTS_WORD_MASK;
 532
 533 #  if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
 534
 535     /* Bytes are stored like
 536      *  Byte8 ... Byte2 Byte1
 537      *  63..56...15...8 7...0
 538      *
 539      *  Isolate the lsb;
 540      * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
 541      *
 542      * The word will look like this, with a rightmost set bit in position 's':
 543      * ('x's are don't cares)
 544      *      s
 545      *  x..x100..0
 546      *  x..xx10..0      Right shift (rightmost 0 is shifted off)
 547      *  x..xx01..1      Subtract 1, turns all the trailing zeros into 1's and
 548      *                  the 1 just to their left into a 0; the remainder is
 549      *                  untouched
 550      *  0..0011..1      The xor with the original, x..xx10..0, clears that
 551      *                  remainder, sets the bottom to all 1
 552      *  0..0100..0      Add 1 to clear the word except for the bit in 's'
 553      *
 554      * Another method is to do 'word &= -word'; but it generates a compiler
 555      * message on some platforms about taking the negative of an unsigned */
 556
 557     word >>= 1;
 558     word = 1 + (word ^ (word - 1));
 559
 560 #  elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
 561
 562     /* Bytes are stored like
 563      *  Byte1 Byte2  ... Byte8
 564      * 63..56 55..47 ... 7...0
 565      *
 566      * Isolate the msb; http://codeforces.com/blog/entry/10330
 567      *
 568      * Only the most significant set bit matters.  Or'ing word with its right
 569      * shift of 1 makes that bit and the next one to its right both 1.  Then
 570      * right shifting by 2 makes for 4 1-bits in a row. ...  We end with the
 571      * msb and all to the right being 1. */
 572     word |= word >>  1;
 573     word |= word >>  2;
 574     word |= word >>  4;
 575     word |= word >>  8;
 576     word |= word >> 16;
 577     word |= word >> 32;  /* This should get optimized out on 32-bit systems. */
 578
 579     /* Then subtracting the right shift by 1 clears all but the left-most of
 580      * the 1 bits, which is our desired result */
 581     word -= (word >> 1);
 582
 583 #  else
 584 #    error Unexpected byte order
 585 #  endif
 586
 587     /* Here 'word' has a single bit set: the  msb of the first byte in which it
 588      * is set.  Calculate that position in the word.  We can use this
 589      * specialized solution: https://stackoverflow.com/a/32339674/1626653,
 590      * assumes an 8-bit byte.  (On a 32-bit machine, the larger numbers should
 591      * just get shifted off at compile time) */
 592     word = (word >> 7) * ((UINTMAX_C( 7) << 56) | (UINTMAX_C(15) << 48)
 593                         | (UINTMAX_C(23) << 40) | (UINTMAX_C(31) << 32)
 594                         |           (39 <<  24) |           (47 <<  16)
 595                         |           (55 <<   8) |           (63 <<   0));
 596     word >>= PERL_WORDSIZE * 7; /* >> by either 56 or 24 */
 597
 598     /* Here, word contains the position 7..63 of that bit.  Convert to 0..7 */
 599     word = ((word + 1) >> 3) - 1;
 600
 601 #  if BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
 602
 603     /* And invert the result */
 604     word = CHARBITS - word - 1;
 605
 606 #  endif
 607
 608     return (unsigned int) word;
 609 }
 610
 611 #endif
 612 #if defined(PERL_CORE) || defined(PERL_EXT)
 613
 614 /*
 615 =for apidoc variant_under_utf8_count
 616
 617 This function looks at the sequence of bytes between C<s> and C<e>, which are
 618 assumed to be encoded in ASCII/Latin1, and returns how many of them would
 619 change should the string be translated into UTF-8.  Due to the nature of UTF-8,
 620 each of these would occupy two bytes instead of the single one in the input
 621 string.  Thus, this function returns the precise number of bytes the string
 622 would expand by when translated to UTF-8.
 623
 624 Unlike most of the other functions that have C<utf8> in their name, the input
 625 to this function is NOT a UTF-8-encoded string.  The function name is slightly
 626 I<odd> to emphasize this.
 627
 628 This function is internal to Perl because khw thinks that any XS code that
 629 would want this is probably operating too close to the internals.  Presenting a
 630 valid use case could change that.
 631
 632 See also
 633 C<L<perlapi/is_utf8_invariant_string>>
 634 and
 635 C<L<perlapi/is_utf8_invariant_string_loc>>,
 636
 637 =cut
 638
 639 */
 640
 641 PERL_STATIC_INLINE Size_t
 642 S_variant_under_utf8_count(const U8* const s, const U8* const e)
 643 {
 644     const U8* x = s;
 645     Size_t count = 0;
 646
 647     PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
 648
 649 #  ifndef EBCDIC
 650
 651     /* Test if the string is long enough to use word-at-a-time.  (Logic is the
 652      * same as for is_utf8_invariant_string()) */
 653     if ((STRLEN) (e - x) >= PERL_WORDSIZE
 654                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
 655                           - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
 656     {
 657
 658         /* Process per-byte until reach word boundary.  XXX This loop could be
 659          * eliminated if we knew that this platform had fast unaligned reads */
 660         while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
 661             count += ! UTF8_IS_INVARIANT(*x++);
 662         }
 663
 664         /* Process per-word as long as we have at least a full word left */
 665         do {    /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
 666                    explanation of how this works */
 667             PERL_UINTMAX_T increment
 668                 = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
 669                       * PERL_COUNT_MULTIPLIER)
 670                     >> ((PERL_WORDSIZE - 1) * CHARBITS);
 671             count += (Size_t) increment;
 672             x += PERL_WORDSIZE;
 673         } while (x + PERL_WORDSIZE <= e);
 674     }
 675
 676 #  endif
 677
 678     /* Process per-byte */
 679     while (x < e) {
 680         if (! UTF8_IS_INVARIANT(*x)) {
 681             count++;
 682         }
 683
 684         x++;
 685     }
 686
 687     return count;
 688 }
 689
 690 #endif
 691
 692 #ifndef PERL_IN_REGEXEC_C   /* Keep  these around for that file */
 693 #  undef PERL_WORDSIZE
 694 #  undef PERL_COUNT_MULTIPLIER
 695 #  undef PERL_WORD_BOUNDARY_MASK
 696 #  undef PERL_VARIANTS_WORD_MASK
 697 #endif
 698
 699 /*
 700 =for apidoc is_utf8_string
 701
 702 Returns TRUE if the first C<len> bytes of string C<s> form a valid
 703 Perl-extended-UTF-8 string; returns FALSE otherwise.  If C<len> is 0, it will
 704 be calculated using C<strlen(s)> (which means if you use this option, that C<s>
 705 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
 706 byte).  Note that all characters being ASCII constitute 'a valid UTF-8 string'.
 707
 708 This function considers Perl's extended UTF-8 to be valid.  That means that
 709 code points above Unicode, surrogates, and non-character code points are
 710 considered valid by this function.  Use C<L</is_strict_utf8_string>>,
 711 C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
 712 code points are considered valid.
 713
 714 See also
 715 C<L</is_utf8_invariant_string>>,
 716 C<L</is_utf8_invariant_string_loc>>,
 717 C<L</is_utf8_string_loc>>,
 718 C<L</is_utf8_string_loclen>>,
 719 C<L</is_utf8_fixed_width_buf_flags>>,
 720 C<L</is_utf8_fixed_width_buf_loc_flags>>,
 721 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
 722
 723 =cut
 724 */
 725
 726 #define is_utf8_string(s, len)  is_utf8_string_loclen(s, len, NULL, NULL)
 727
 728 #if defined(PERL_CORE) || defined (PERL_EXT)
 729
 730 /*
 731 =for apidoc is_utf8_non_invariant_string
 732
 733 Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
 734 C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
 735 UTF-8; otherwise returns FALSE.
 736
 737 A TRUE return means that at least one code point represented by the sequence
 738 either is a wide character not representable as a single byte, or the
 739 representation differs depending on whether the sequence is encoded in UTF-8 or
 740 not.
 741
 742 See also
 743 C<L<perlapi/is_utf8_invariant_string>>,
 744 C<L<perlapi/is_utf8_string>>
 745
 746 =cut
 747
 748 This is commonly used to determine if a SV's UTF-8 flag should be turned on.
 749 It generally needn't be if its string is entirely UTF-8 invariant, and it
 750 shouldn't be if it otherwise contains invalid UTF-8.
 751
 752 It is an internal function because khw thinks that XS code shouldn't be working
 753 at this low a level.  A valid use case could change that.
 754
 755 */
 756
 757 PERL_STATIC_INLINE bool
 758 Perl_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
 759 {
 760     const U8 * first_variant;
 761
 762     PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
 763
 764     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
 765         return FALSE;
 766     }
 767
 768     return is_utf8_string(first_variant, len - (first_variant - s));
 769 }
 770
 771 #endif
 772
 773 /*
 774 =for apidoc is_strict_utf8_string
 775
 776 Returns TRUE if the first C<len> bytes of string C<s> form a valid
 777 UTF-8-encoded string that is fully interchangeable by any application using
 778 Unicode rules; otherwise it returns FALSE.  If C<len> is 0, it will be
 779 calculated using C<strlen(s)> (which means if you use this option, that C<s>
 780 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
 781 byte).  Note that all characters being ASCII constitute 'a valid UTF-8 string'.
 782
 783 This function returns FALSE for strings containing any
 784 code points above the Unicode max of 0x10FFFF, surrogate code points, or
 785 non-character code points.
 786
 787 See also
 788 C<L</is_utf8_invariant_string>>,
 789 C<L</is_utf8_invariant_string_loc>>,
 790 C<L</is_utf8_string>>,
 791 C<L</is_utf8_string_flags>>,
 792 C<L</is_utf8_string_loc>>,
 793 C<L</is_utf8_string_loc_flags>>,
 794 C<L</is_utf8_string_loclen>>,
 795 C<L</is_utf8_string_loclen_flags>>,
 796 C<L</is_utf8_fixed_width_buf_flags>>,
 797 C<L</is_utf8_fixed_width_buf_loc_flags>>,
 798 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
 799 C<L</is_strict_utf8_string_loc>>,
 800 C<L</is_strict_utf8_string_loclen>>,
 801 C<L</is_c9strict_utf8_string>>,
 802 C<L</is_c9strict_utf8_string_loc>>,
 803 and
 804 C<L</is_c9strict_utf8_string_loclen>>.
 805
 806 =cut
 807 */
 808
 809 #define is_strict_utf8_string(s, len)  is_strict_utf8_string_loclen(s, len, NULL, NULL)
 810
 811 /*
 812 =for apidoc is_c9strict_utf8_string
 813
 814 Returns TRUE if the first C<len> bytes of string C<s> form a valid
 815 UTF-8-encoded string that conforms to
 816 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
 817 otherwise it returns FALSE.  If C<len> is 0, it will be calculated using
 818 C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
 819 C<NUL> characters and has to have a terminating C<NUL> byte).  Note that all
 820 characters being ASCII constitute 'a valid UTF-8 string'.
 821
 822 This function returns FALSE for strings containing any code points above the
 823 Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
 824 code points per
 825 L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
 826
 827 See also
 828 C<L</is_utf8_invariant_string>>,
 829 C<L</is_utf8_invariant_string_loc>>,
 830 C<L</is_utf8_string>>,
 831 C<L</is_utf8_string_flags>>,
 832 C<L</is_utf8_string_loc>>,
 833 C<L</is_utf8_string_loc_flags>>,
 834 C<L</is_utf8_string_loclen>>,
 835 C<L</is_utf8_string_loclen_flags>>,
 836 C<L</is_utf8_fixed_width_buf_flags>>,
 837 C<L</is_utf8_fixed_width_buf_loc_flags>>,
 838 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
 839 C<L</is_strict_utf8_string>>,
 840 C<L</is_strict_utf8_string_loc>>,
 841 C<L</is_strict_utf8_string_loclen>>,
 842 C<L</is_c9strict_utf8_string_loc>>,
 843 and
 844 C<L</is_c9strict_utf8_string_loclen>>.
 845
 846 =cut
 847 */
 848
 849 #define is_c9strict_utf8_string(s, len)  is_c9strict_utf8_string_loclen(s, len, NULL, 0)
 850
 851 /*
 852 =for apidoc is_utf8_string_flags
 853
 854 Returns TRUE if the first C<len> bytes of string C<s> form a valid
 855 UTF-8 string, subject to the restrictions imposed by C<flags>;
 856 returns FALSE otherwise.  If C<len> is 0, it will be calculated
 857 using C<strlen(s)> (which means if you use this option, that C<s> can't have
 858 embedded C<NUL> characters and has to have a terminating C<NUL> byte).  Note
 859 that all characters being ASCII constitute 'a valid UTF-8 string'.
 860
 861 If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
 862 C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
 863 as C<L</is_strict_utf8_string>>; and if C<flags> is
 864 C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
 865 C<L</is_c9strict_utf8_string>>.  Otherwise C<flags> may be any
 866 combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
 867 C<L</utf8n_to_uvchr>>, with the same meanings.
 868
 869 See also
 870 C<L</is_utf8_invariant_string>>,
 871 C<L</is_utf8_invariant_string_loc>>,
 872 C<L</is_utf8_string>>,
 873 C<L</is_utf8_string_loc>>,
 874 C<L</is_utf8_string_loc_flags>>,
 875 C<L</is_utf8_string_loclen>>,
 876 C<L</is_utf8_string_loclen_flags>>,
 877 C<L</is_utf8_fixed_width_buf_flags>>,
 878 C<L</is_utf8_fixed_width_buf_loc_flags>>,
 879 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
 880 C<L</is_strict_utf8_string>>,
 881 C<L</is_strict_utf8_string_loc>>,
 882 C<L</is_strict_utf8_string_loclen>>,
 883 C<L</is_c9strict_utf8_string>>,
 884 C<L</is_c9strict_utf8_string_loc>>,
 885 and
 886 C<L</is_c9strict_utf8_string_loclen>>.
 887
 888 =cut
 889 */
 890
 891 PERL_STATIC_INLINE bool
 892 Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
 893 {
 894     const U8 * first_variant;
 895
 896     PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
 897     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
 898                           |UTF8_DISALLOW_PERL_EXTENDED)));
 899
 900     if (len == 0) {
 901         len = strlen((const char *)s);
 902     }
 903
 904     if (flags == 0) {
 905         return is_utf8_string(s, len);
 906     }
 907
 908     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
 909                                         == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
 910     {
 911         return is_strict_utf8_string(s, len);
 912     }
 913
 914     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
 915                                        == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
 916     {
 917         return is_c9strict_utf8_string(s, len);
 918     }
 919
 920     if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
 921         const U8* const send = s + len;
 922         const U8* x = first_variant;
 923
 924         while (x < send) {
 925             STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
 926             if (UNLIKELY(! cur_len)) {
 927                 return FALSE;
 928             }
 929             x += cur_len;
 930         }
 931     }
 932
 933     return TRUE;
 934 }
 935
 936 /*
 937
 938 =for apidoc is_utf8_string_loc
 939
 940 Like C<L</is_utf8_string>> but stores the location of the failure (in the
 941 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
 942 "utf8ness success") in the C<ep> pointer.
 943
 944 See also C<L</is_utf8_string_loclen>>.
 945
 946 =cut
 947 */
 948
 949 #define is_utf8_string_loc(s, len, ep)  is_utf8_string_loclen(s, len, ep, 0)
 950
 951 /*
 952
 953 =for apidoc is_utf8_string_loclen
 954
 955 Like C<L</is_utf8_string>> but stores the location of the failure (in the
 956 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
 957 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
 958 encoded characters in the C<el> pointer.
 959
 960 See also C<L</is_utf8_string_loc>>.
 961
 962 =cut
 963 */
 964
 965 PERL_STATIC_INLINE bool
 966 Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
 967 {
 968     const U8 * first_variant;
 969
 970     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
 971
 972     if (len == 0) {
 973         len = strlen((const char *) s);
 974     }
 975
 976     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
 977         if (el)
 978             *el = len;
 979
 980         if (ep) {
 981             *ep = s + len;
 982         }
 983
 984         return TRUE;
 985     }
 986
 987     {
 988         const U8* const send = s + len;
 989         const U8* x = first_variant;
 990         STRLEN outlen = first_variant - s;
 991
 992         while (x < send) {
 993             const STRLEN cur_len = isUTF8_CHAR(x, send);
 994             if (UNLIKELY(! cur_len)) {
 995                 break;
 996             }
 997             x += cur_len;
 998             outlen++;
 999         }
1000
1001         if (el)
1002             *el = outlen;
1003
1004         if (ep) {
1005             *ep = x;
1006         }
1007
1008         return (x == send);
1009     }
1010 }
1011
1012 /*
1013
1014 =for apidoc isUTF8_CHAR
1015
1016 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1017 looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
1018 that represents some code point; otherwise it evaluates to 0.  If non-zero, the
1019 value gives how many bytes starting at C<s> comprise the code point's
1020 representation.  Any bytes remaining before C<e>, but beyond the ones needed to
1021 form the first code point in C<s>, are not examined.
1022
1023 The code point can be any that will fit in an IV on this machine, using Perl's
1024 extension to official UTF-8 to represent those higher than the Unicode maximum
1025 of 0x10FFFF.  That means that this macro is used to efficiently decide if the
1026 next few bytes in C<s> is legal UTF-8 for a single character.
1027
1028 Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
1029 defined by Unicode to be fully interchangeable across applications;
1030 C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1031 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1032 code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1033
1034 Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
1035 C<L</is_utf8_string_loclen>> to check entire strings.
1036
1037 Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
1038 machines) is a valid UTF-8 character.
1039
1040 =cut
1041
1042 This uses an adaptation of the table and algorithm given in
1043 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1044 documentation of the original version.  A copyright notice for the original
1045 version is given at the beginning of this file.  The Perl adapation is
1046 documented at the definition of PL_extended_utf8_dfa_tab[].
1047
1048 */
1049
1050 PERL_STATIC_INLINE Size_t
1051 Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
1052 {
1053     const U8 * s = s0;
1054     UV state = 0;
1055
1056     PERL_ARGS_ASSERT_ISUTF8_CHAR;
1057
1058     /* This dfa is fast.  If it accepts the input, it was for a well-formed,
1059      * code point, which can be returned immediately.  Otherwise, it is either
1060      * malformed, or for the start byte FF which the dfa doesn't handle (except
1061      * on 32-bit ASCII platforms where it trivially is an error).  Call a
1062      * helper function for the other platforms. */
1063
1064     while (s < e && LIKELY(state != 1)) {
1065         state = PL_extended_utf8_dfa_tab[256
1066                                          + state
1067                                          + PL_extended_utf8_dfa_tab[*s]];
1068         if (state != 0) {
1069             s++;
1070             continue;
1071         }
1072
1073         return s - s0 + 1;
1074     }
1075
1076 #if defined(UV_IS_QUAD) || defined(EBCDIC)
1077
1078     if (NATIVE_UTF8_TO_I8(*s0) == 0xFF && e - s0 >= UTF8_MAXBYTES) {
1079        return is_utf8_char_helper(s0, e, 0);
1080     }
1081
1082 #endif
1083
1084     return 0;
1085 }
1086
1087 /*
1088
1089 =for apidoc isSTRICT_UTF8_CHAR
1090
1091 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1092 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1093 Unicode code point completely acceptable for open interchange between all
1094 applications; otherwise it evaluates to 0.  If non-zero, the value gives how
1095 many bytes starting at C<s> comprise the code point's representation.  Any
1096 bytes remaining before C<e>, but beyond the ones needed to form the first code
1097 point in C<s>, are not examined.
1098
1099 The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
1100 be a surrogate nor a non-character code point.  Thus this excludes any code
1101 point from Perl's extended UTF-8.
1102
1103 This is used to efficiently decide if the next few bytes in C<s> is
1104 legal Unicode-acceptable UTF-8 for a single character.
1105
1106 Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1107 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1108 code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
1109 and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1110
1111 Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
1112 C<L</is_strict_utf8_string_loclen>> to check entire strings.
1113
1114 =cut
1115
1116 This uses an adaptation of the tables and algorithm given in
1117 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1118 documentation of the original version.  A copyright notice for the original
1119 version is given at the beginning of this file.  The Perl adapation is
1120 documented at the definition of strict_extended_utf8_dfa_tab[].
1121
1122 */
1123
1124 PERL_STATIC_INLINE Size_t
1125 Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
1126 {
1127     const U8 * s = s0;
1128     UV state = 0;
1129
1130     PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
1131
1132     while (s < e && LIKELY(state != 1)) {
1133         state = PL_strict_utf8_dfa_tab[256 + state + PL_strict_utf8_dfa_tab[*s]];
1134
1135         if (state != 0) {
1136             s++;
1137             continue;
1138         }
1139
1140         return s - s0 + 1;
1141     }
1142
1143 #ifndef EBCDIC
1144
1145     /* The dfa above drops out for certain Hanguls; handle them specially */
1146     if (is_HANGUL_ED_utf8_safe(s0, e)) {
1147         return 3;
1148     }
1149
1150 #endif
1151
1152     return 0;
1153 }
1154
1155 /*
1156
1157 =for apidoc isC9_STRICT_UTF8_CHAR
1158
1159 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1160 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1161 Unicode non-surrogate code point; otherwise it evaluates to 0.  If non-zero,
1162 the value gives how many bytes starting at C<s> comprise the code point's
1163 representation.  Any bytes remaining before C<e>, but beyond the ones needed to
1164 form the first code point in C<s>, are not examined.
1165
1166 The largest acceptable code point is the Unicode maximum 0x10FFFF.  This
1167 differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
1168 code points.  This corresponds to
1169 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1170 which said that non-character code points are merely discouraged rather than
1171 completely forbidden in open interchange.  See
1172 L<perlunicode/Noncharacter code points>.
1173
1174 Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
1175 C<L</isUTF8_CHAR_flags>> for a more customized definition.
1176
1177 Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
1178 C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
1179
1180 =cut
1181
1182 This uses an adaptation of the tables and algorithm given in
1183 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1184 documentation of the original version.  A copyright notice for the original
1185 version is given at the beginning of this file.  The Perl adapation is
1186 documented at the definition of PL_c9_utf8_dfa_tab[].
1187
1188 */
1189
1190 PERL_STATIC_INLINE Size_t
1191 Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
1192 {
1193     const U8 * s = s0;
1194     UV state = 0;
1195
1196     PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
1197
1198     while (s < e && LIKELY(state != 1)) {
1199         state = PL_c9_utf8_dfa_tab[256 + state + PL_c9_utf8_dfa_tab[*s]];
1200
1201         if (state != 0) {
1202             s++;
1203             continue;
1204         }
1205
1206         return s - s0 + 1;
1207     }
1208
1209     return 0;
1210 }
1211
1212 /*
1213
1214 =for apidoc is_strict_utf8_string_loc
1215
1216 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1217 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1218 "utf8ness success") in the C<ep> pointer.
1219
1220 See also C<L</is_strict_utf8_string_loclen>>.
1221
1222 =cut
1223 */
1224
1225 #define is_strict_utf8_string_loc(s, len, ep)                               \
1226                                 is_strict_utf8_string_loclen(s, len, ep, 0)
1227
1228 /*
1229
1230 =for apidoc is_strict_utf8_string_loclen
1231
1232 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1233 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1234 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1235 encoded characters in the C<el> pointer.
1236
1237 See also C<L</is_strict_utf8_string_loc>>.
1238
1239 =cut
1240 */
1241
1242 PERL_STATIC_INLINE bool
1243 Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1244 {
1245     const U8 * first_variant;
1246
1247     PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
1248
1249     if (len == 0) {
1250         len = strlen((const char *) s);
1251     }
1252
1253     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1254         if (el)
1255             *el = len;
1256
1257         if (ep) {
1258             *ep = s + len;
1259         }
1260
1261         return TRUE;
1262     }
1263
1264     {
1265         const U8* const send = s + len;
1266         const U8* x = first_variant;
1267         STRLEN outlen = first_variant - s;
1268
1269         while (x < send) {
1270             const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
1271             if (UNLIKELY(! cur_len)) {
1272                 break;
1273             }
1274             x += cur_len;
1275             outlen++;
1276         }
1277
1278         if (el)
1279             *el = outlen;
1280
1281         if (ep) {
1282             *ep = x;
1283         }
1284
1285         return (x == send);
1286     }
1287 }
1288
1289 /*
1290
1291 =for apidoc is_c9strict_utf8_string_loc
1292
1293 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1294 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1295 "utf8ness success") in the C<ep> pointer.
1296
1297 See also C<L</is_c9strict_utf8_string_loclen>>.
1298
1299 =cut
1300 */
1301
1302 #define is_c9strict_utf8_string_loc(s, len, ep)                             \
1303                             is_c9strict_utf8_string_loclen(s, len, ep, 0)
1304
1305 /*
1306
1307 =for apidoc is_c9strict_utf8_string_loclen
1308
1309 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1310 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1311 "utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
1312 characters in the C<el> pointer.
1313
1314 See also C<L</is_c9strict_utf8_string_loc>>.
1315
1316 =cut
1317 */
1318
1319 PERL_STATIC_INLINE bool
1320 Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1321 {
1322     const U8 * first_variant;
1323
1324     PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
1325
1326     if (len == 0) {
1327         len = strlen((const char *) s);
1328     }
1329
1330     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1331         if (el)
1332             *el = len;
1333
1334         if (ep) {
1335             *ep = s + len;
1336         }
1337
1338         return TRUE;
1339     }
1340
1341     {
1342         const U8* const send = s + len;
1343         const U8* x = first_variant;
1344         STRLEN outlen = first_variant - s;
1345
1346         while (x < send) {
1347             const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
1348             if (UNLIKELY(! cur_len)) {
1349                 break;
1350             }
1351             x += cur_len;
1352             outlen++;
1353         }
1354
1355         if (el)
1356             *el = outlen;
1357
1358         if (ep) {
1359             *ep = x;
1360         }
1361
1362         return (x == send);
1363     }
1364 }
1365
1366 /*
1367
1368 =for apidoc is_utf8_string_loc_flags
1369
1370 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1371 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1372 "utf8ness success") in the C<ep> pointer.
1373
1374 See also C<L</is_utf8_string_loclen_flags>>.
1375
1376 =cut
1377 */
1378
1379 #define is_utf8_string_loc_flags(s, len, ep, flags)                         \
1380                         is_utf8_string_loclen_flags(s, len, ep, 0, flags)
1381
1382
1383 /* The above 3 actual functions could have been moved into the more general one
1384  * just below, and made #defines that call it with the right 'flags'.  They are
1385  * currently kept separate to increase their chances of getting inlined */
1386
1387 /*
1388
1389 =for apidoc is_utf8_string_loclen_flags
1390
1391 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1392 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1393 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1394 encoded characters in the C<el> pointer.
1395
1396 See also C<L</is_utf8_string_loc_flags>>.
1397
1398 =cut
1399 */
1400
1401 PERL_STATIC_INLINE bool
1402 Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
1403 {
1404     const U8 * first_variant;
1405
1406     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
1407     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1408                           |UTF8_DISALLOW_PERL_EXTENDED)));
1409
1410     if (len == 0) {
1411         len = strlen((const char *) s);
1412     }
1413
1414     if (flags == 0) {
1415         return is_utf8_string_loclen(s, len, ep, el);
1416     }
1417
1418     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1419                                         == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1420     {
1421         return is_strict_utf8_string_loclen(s, len, ep, el);
1422     }
1423
1424     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1425                                     == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1426     {
1427         return is_c9strict_utf8_string_loclen(s, len, ep, el);
1428     }
1429
1430     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1431         if (el)
1432             *el = len;
1433
1434         if (ep) {
1435             *ep = s + len;
1436         }
1437
1438         return TRUE;
1439     }
1440
1441     {
1442         const U8* send = s + len;
1443         const U8* x = first_variant;
1444         STRLEN outlen = first_variant - s;
1445
1446         while (x < send) {
1447             const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1448             if (UNLIKELY(! cur_len)) {
1449                 break;
1450             }
1451             x += cur_len;
1452             outlen++;
1453         }
1454
1455         if (el)
1456             *el = outlen;
1457
1458         if (ep) {
1459             *ep = x;
1460         }
1461
1462         return (x == send);
1463     }
1464 }
1465
1466 /*
1467 =for apidoc utf8_distance
1468
1469 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
1470 and C<b>.
1471
1472 WARNING: use only if you *know* that the pointers point inside the
1473 same UTF-8 buffer.
1474
1475 =cut
1476 */
1477
1478 PERL_STATIC_INLINE IV
1479 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
1480 {
1481     PERL_ARGS_ASSERT_UTF8_DISTANCE;
1482
1483     return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
1484 }
1485
1486 /*
1487 =for apidoc utf8_hop
1488
1489 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
1490 forward or backward.
1491
1492 WARNING: do not use the following unless you *know* C<off> is within
1493 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
1494 on the first byte of character or just after the last byte of a character.
1495
1496 =cut
1497 */
1498
1499 PERL_STATIC_INLINE U8 *
1500 Perl_utf8_hop(const U8 *s, SSize_t off)
1501 {
1502     PERL_ARGS_ASSERT_UTF8_HOP;
1503
1504     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1505      * the bitops (especially ~) can create illegal UTF-8.
1506      * In other words: in Perl UTF-8 is not just for Unicode. */
1507
1508     if (off >= 0) {
1509         while (off--)
1510             s += UTF8SKIP(s);
1511     }
1512     else {
1513         while (off++) {
1514             s--;
1515             while (UTF8_IS_CONTINUATION(*s))
1516                 s--;
1517         }
1518     }
1519     GCC_DIAG_IGNORE(-Wcast-qual)
1520     return (U8 *)s;
1521     GCC_DIAG_RESTORE
1522 }
1523
1524 /*
1525 =for apidoc utf8_hop_forward
1526
1527 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1528 forward.
1529
1530 C<off> must be non-negative.
1531
1532 C<s> must be before or equal to C<end>.
1533
1534 When moving forward it will not move beyond C<end>.
1535
1536 Will not exceed this limit even if the string is not valid "UTF-8".
1537
1538 =cut
1539 */
1540
1541 PERL_STATIC_INLINE U8 *
1542 Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
1543 {
1544     PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
1545
1546     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1547      * the bitops (especially ~) can create illegal UTF-8.
1548      * In other words: in Perl UTF-8 is not just for Unicode. */
1549
1550     assert(s <= end);
1551     assert(off >= 0);
1552
1553     while (off--) {
1554         STRLEN skip = UTF8SKIP(s);
1555         if ((STRLEN)(end - s) <= skip) {
1556             GCC_DIAG_IGNORE(-Wcast-qual)
1557             return (U8 *)end;
1558             GCC_DIAG_RESTORE
1559         }
1560         s += skip;
1561     }
1562
1563     GCC_DIAG_IGNORE(-Wcast-qual)
1564     return (U8 *)s;
1565     GCC_DIAG_RESTORE
1566 }
1567
1568 /*
1569 =for apidoc utf8_hop_back
1570
1571 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1572 backward.
1573
1574 C<off> must be non-positive.
1575
1576 C<s> must be after or equal to C<start>.
1577
1578 When moving backward it will not move before C<start>.
1579
1580 Will not exceed this limit even if the string is not valid "UTF-8".
1581
1582 =cut
1583 */
1584
1585 PERL_STATIC_INLINE U8 *
1586 Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
1587 {
1588     PERL_ARGS_ASSERT_UTF8_HOP_BACK;
1589
1590     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1591      * the bitops (especially ~) can create illegal UTF-8.
1592      * In other words: in Perl UTF-8 is not just for Unicode. */
1593
1594     assert(start <= s);
1595     assert(off <= 0);
1596
1597     while (off++ && s > start) {
1598         do {
1599             s--;
1600         } while (UTF8_IS_CONTINUATION(*s) && s > start);
1601     }
1602
1603     GCC_DIAG_IGNORE(-Wcast-qual)
1604     return (U8 *)s;
1605     GCC_DIAG_RESTORE
1606 }
1607
1608 /*
1609 =for apidoc utf8_hop_safe
1610
1611 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1612 either forward or backward.
1613
1614 When moving backward it will not move before C<start>.
1615
1616 When moving forward it will not move beyond C<end>.
1617
1618 Will not exceed those limits even if the string is not valid "UTF-8".
1619
1620 =cut
1621 */
1622
1623 PERL_STATIC_INLINE U8 *
1624 Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
1625 {
1626     PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
1627
1628     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1629      * the bitops (especially ~) can create illegal UTF-8.
1630      * In other words: in Perl UTF-8 is not just for Unicode. */
1631
1632     assert(start <= s && s <= end);
1633
1634     if (off >= 0) {
1635         return utf8_hop_forward(s, off, end);
1636     }
1637     else {
1638         return utf8_hop_back(s, off, start);
1639     }
1640 }
1641
1642 /*
1643
1644 =for apidoc is_utf8_valid_partial_char
1645
1646 Returns 0 if the sequence of bytes starting at C<s> and looking no further than
1647 S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
1648 points.  Otherwise, it returns 1 if there exists at least one non-empty
1649 sequence of bytes that when appended to sequence C<s>, starting at position
1650 C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
1651 otherwise returns 0.
1652
1653 In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
1654 point.
1655
1656 This is useful when a fixed-length buffer is being tested for being well-formed
1657 UTF-8, but the final few bytes in it don't comprise a full character; that is,
1658 it is split somewhere in the middle of the final code point's UTF-8
1659 representation.  (Presumably when the buffer is refreshed with the next chunk
1660 of data, the new first bytes will complete the partial code point.)   This
1661 function is used to verify that the final bytes in the current buffer are in
1662 fact the legal beginning of some code point, so that if they aren't, the
1663 failure can be signalled without having to wait for the next read.
1664
1665 =cut
1666 */
1667 #define is_utf8_valid_partial_char(s, e)                                    \
1668                                 is_utf8_valid_partial_char_flags(s, e, 0)
1669
1670 /*
1671
1672 =for apidoc is_utf8_valid_partial_char_flags
1673
1674 Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
1675 or not the input is a valid UTF-8 encoded partial character, but it takes an
1676 extra parameter, C<flags>, which can further restrict which code points are
1677 considered valid.
1678
1679 If C<flags> is 0, this behaves identically to
1680 C<L</is_utf8_valid_partial_char>>.  Otherwise C<flags> can be any combination
1681 of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>.  If
1682 there is any sequence of bytes that can complete the input partial character in
1683 such a way that a non-prohibited character is formed, the function returns
1684 TRUE; otherwise FALSE.  Non character code points cannot be determined based on
1685 partial character input.  But many  of the other possible excluded types can be
1686 determined from just the first one or two bytes.
1687
1688 =cut
1689  */
1690
1691 PERL_STATIC_INLINE bool
1692 Perl_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
1693 {
1694     PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
1695
1696     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1697                           |UTF8_DISALLOW_PERL_EXTENDED)));
1698
1699     if (s >= e || s + UTF8SKIP(s) <= e) {
1700         return FALSE;
1701     }
1702
1703     return cBOOL(is_utf8_char_helper(s, e, flags));
1704 }
1705
1706 /*
1707
1708 =for apidoc is_utf8_fixed_width_buf_flags
1709
1710 Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
1711 is entirely valid UTF-8, subject to the restrictions given by C<flags>;
1712 otherwise it returns FALSE.
1713
1714 If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
1715 without restriction.  If the final few bytes of the buffer do not form a
1716 complete code point, this will return TRUE anyway, provided that
1717 C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
1718
1719 If C<flags> in non-zero, it can be any combination of the
1720 C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
1721 same meanings.
1722
1723 This function differs from C<L</is_utf8_string_flags>> only in that the latter
1724 returns FALSE if the final few bytes of the string don't form a complete code
1725 point.
1726
1727 =cut
1728  */
1729 #define is_utf8_fixed_width_buf_flags(s, len, flags)                        \
1730                 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
1731
1732 /*
1733
1734 =for apidoc is_utf8_fixed_width_buf_loc_flags
1735
1736 Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
1737 failure in the C<ep> pointer.  If the function returns TRUE, C<*ep> will point
1738 to the beginning of any partial character at the end of the buffer; if there is
1739 no partial character C<*ep> will contain C<s>+C<len>.
1740
1741 See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
1742
1743 =cut
1744 */
1745
1746 #define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags)               \
1747                 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
1748
1749 /*
1750
1751 =for apidoc is_utf8_fixed_width_buf_loclen_flags
1752
1753 Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
1754 complete, valid characters found in the C<el> pointer.
1755
1756 =cut
1757 */
1758
1759 PERL_STATIC_INLINE bool
1760 Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
1761                                        STRLEN len,
1762                                        const U8 **ep,
1763                                        STRLEN *el,
1764                                        const U32 flags)
1765 {
1766     const U8 * maybe_partial;
1767
1768     PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
1769
1770     if (! ep) {
1771         ep  = &maybe_partial;
1772     }
1773
1774     /* If it's entirely valid, return that; otherwise see if the only error is
1775      * that the final few bytes are for a partial character */
1776     return    is_utf8_string_loclen_flags(s, len, ep, el, flags)
1777            || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
1778 }
1779
1780 PERL_STATIC_INLINE UV
1781 Perl_utf8n_to_uvchr_msgs(const U8 *s,
1782                       STRLEN curlen,
1783                       STRLEN *retlen,
1784                       const U32 flags,
1785                       U32 * errors,
1786                       AV ** msgs)
1787 {
1788     /* This is the inlined portion of utf8n_to_uvchr_msgs.  It handles the
1789      * simple cases, and, if necessary calls a helper function to deal with the
1790      * more complex ones.  Almost all well-formed non-problematic code points
1791      * are considered simple, so that it's unlikely that the helper function
1792      * will need to be called.
1793      *
1794      * This is an adaptation of the tables and algorithm given in
1795      * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
1796      * comprehensive documentation of the original version.  A copyright notice
1797      * for the original version is given at the beginning of this file.  The
1798      * Perl adapation is documented at the definition of PL_strict_utf8_dfa_tab[].
1799      */
1800
1801     const U8 * const s0 = s;
1802     const U8 * send = s0 + curlen;
1803     UV uv = 0;      /* The 0 silences some stupid compilers */
1804     UV state = 0;
1805
1806     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
1807
1808     /* This dfa is fast.  If it accepts the input, it was for a well-formed,
1809      * non-problematic code point, which can be returned immediately.
1810      * Otherwise we call a helper function to figure out the more complicated
1811      * cases. */
1812
1813     while (s < send && LIKELY(state != 1)) {
1814         UV type = PL_strict_utf8_dfa_tab[*s];
1815
1816         uv = (state == 0)
1817              ?  ((0xff >> type) & NATIVE_UTF8_TO_I8(*s))
1818              : UTF8_ACCUMULATE(uv, *s);
1819         state = PL_strict_utf8_dfa_tab[256 + state + type];
1820
1821         if (state != 0) {
1822             s++;
1823             continue;
1824         }
1825
1826         if (retlen) {
1827             *retlen = s - s0 + 1;
1828         }
1829         if (errors) {
1830             *errors = 0;
1831         }
1832         if (msgs) {
1833             *msgs = NULL;
1834         }
1835
1836         return UNI_TO_NATIVE(uv);
1837     }
1838
1839     /* Here is potentially problematic.  Use the full mechanism */
1840     return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags, errors, msgs);
1841 }
1842
1843 PERL_STATIC_INLINE UV
1844 Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
1845 {
1846     PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER;
1847
1848     assert(s < send);
1849
1850     if (! ckWARN_d(WARN_UTF8)) {
1851
1852         /* EMPTY is not really allowed, and asserts on debugging builds.  But
1853          * on non-debugging we have to deal with it, and this causes it to
1854          * return the REPLACEMENT CHARACTER, as the documentation indicates */
1855         return utf8n_to_uvchr(s, send - s, retlen,
1856                               (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY));
1857     }
1858     else {
1859         UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
1860         if (retlen && ret == 0 && *s != '\0') {
1861             *retlen = (STRLEN) -1;
1862         }
1863
1864         return ret;
1865     }
1866 }
1867
1868 /* ------------------------------- perl.h ----------------------------- */
1869
1870 /*
1871 =head1 Miscellaneous Functions
1872
1873 =for apidoc is_safe_syscall
1874
1875 Test that the given C<pv> (with length C<len>) doesn't contain any internal
1876 C<NUL> characters.
1877 If it does, set C<errno> to C<ENOENT>, optionally warn using the C<syscalls>
1878 category, and return FALSE.
1879
1880 Return TRUE if the name is safe.
1881
1882 C<what> and C<op_name> are used in any warning.
1883
1884 Used by the C<IS_SAFE_SYSCALL()> macro.
1885
1886 =cut
1887 */
1888
1889 PERL_STATIC_INLINE bool
1890 Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name)
1891 {
1892     /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
1893      * perl itself uses xce*() functions which accept 8-bit strings.
1894      */
1895
1896     PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
1897
1898     if (len > 1) {
1899         char *null_at;
1900         if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
1901                 SETERRNO(ENOENT, LIB_INVARG);
1902                 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
1903                                    "Invalid \\0 character in %s for %s: %s\\0%s",
1904                                    what, op_name, pv, null_at+1);
1905                 return FALSE;
1906         }
1907     }
1908
1909     return TRUE;
1910 }
1911
1912 /*
1913
1914 Return true if the supplied filename has a newline character
1915 immediately before the first (hopefully only) NUL.
1916
1917 My original look at this incorrectly used the len from SvPV(), but
1918 that's incorrect, since we allow for a NUL in pv[len-1].
1919
1920 So instead, strlen() and work from there.
1921
1922 This allow for the user reading a filename, forgetting to chomp it,
1923 then calling:
1924
1925   open my $foo, "$file\0";
1926
1927 */
1928
1929 #ifdef PERL_CORE
1930
1931 PERL_STATIC_INLINE bool
1932 S_should_warn_nl(const char *pv)
1933 {
1934     STRLEN len;
1935
1936     PERL_ARGS_ASSERT_SHOULD_WARN_NL;
1937
1938     len = strlen(pv);
1939
1940     return len > 0 && pv[len-1] == '\n';
1941 }
1942
1943 #endif
1944
1945 #if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C)
1946
1947 PERL_STATIC_INLINE bool
1948 S_lossless_NV_to_IV(const NV nv, IV *ivp)
1949 {
1950     /* This function determines if the input NV 'nv' may be converted without
1951      * loss of data to an IV.  If not, it returns FALSE taking no other action.
1952      * But if it is possible, it does the conversion, returning TRUE, and
1953      * storing the converted result in '*ivp' */
1954
1955     PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV;
1956
1957 #  if  defined(Perl_isnan)
1958
1959     if (UNLIKELY(Perl_isnan(nv))) {
1960         return FALSE;
1961     }
1962
1963 #  endif
1964
1965     if (UNLIKELY(nv < IV_MIN) || UNLIKELY(nv > IV_MAX)) {
1966         return FALSE;
1967     }
1968
1969     if ((IV) nv != nv) {
1970         return FALSE;
1971     }
1972
1973     *ivp = (IV) nv;
1974     return TRUE;
1975 }
1976
1977 #endif
1978
1979 /* ------------------ regcomp.c, toke.c ------------ */
1980
1981 #if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C)
1982
1983 /*
1984  - regcurly - a little FSA that accepts {\d+,?\d*}
1985     Pulled from reg.c.
1986  */
1987 PERL_STATIC_INLINE bool
1988 S_regcurly(const char *s)
1989 {
1990     PERL_ARGS_ASSERT_REGCURLY;
1991
1992     if (*s++ != '{')
1993         return FALSE;
1994     if (!isDIGIT(*s))
1995         return FALSE;
1996     while (isDIGIT(*s))
1997         s++;
1998     if (*s == ',') {
1999         s++;
2000         while (isDIGIT(*s))
2001             s++;
2002     }
2003
2004     return *s == '}';
2005 }
2006
2007 #endif
2008
2009 /* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
2010
2011 #if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C)
2012
2013 #define MAX_CHARSET_NAME_LENGTH 2
2014
2015 PERL_STATIC_INLINE const char *
2016 S_get_regex_charset_name(const U32 flags, STRLEN* const lenp)
2017 {
2018     PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME;
2019
2020     /* Returns a string that corresponds to the name of the regex character set
2021      * given by 'flags', and *lenp is set the length of that string, which
2022      * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
2023
2024     *lenp = 1;
2025     switch (get_regex_charset(flags)) {
2026         case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
2027         case REGEX_LOCALE_CHARSET:  return LOCALE_PAT_MODS;
2028         case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
2029         case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
2030         case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
2031             *lenp = 2;
2032             return ASCII_MORE_RESTRICT_PAT_MODS;
2033     }
2034     /* The NOT_REACHED; hides an assert() which has a rather complex
2035      * definition in perl.h. */
2036     NOT_REACHED; /* NOTREACHED */
2037     return "?";     /* Unknown */
2038 }
2039
2040 #endif
2041
2042 /*
2043
2044 Return false if any get magic is on the SV other than taint magic.
2045
2046 */
2047
2048 PERL_STATIC_INLINE bool
2049 Perl_sv_only_taint_gmagic(SV *sv)
2050 {
2051     MAGIC *mg = SvMAGIC(sv);
2052
2053     PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
2054
2055     while (mg) {
2056         if (mg->mg_type != PERL_MAGIC_taint
2057             && !(mg->mg_flags & MGf_GSKIP)
2058             && mg->mg_virtual->svt_get) {
2059             return FALSE;
2060         }
2061         mg = mg->mg_moremagic;
2062     }
2063
2064     return TRUE;
2065 }
2066
2067 /* ------------------ cop.h ------------------------------------------- */
2068
2069 /* implement GIMME_V() macro */
2070
2071 PERL_STATIC_INLINE U8
2072 Perl_gimme_V(pTHX)
2073 {
2074     I32 cxix;
2075     U8  gimme = (PL_op->op_flags & OPf_WANT);
2076
2077     if (gimme)
2078         return gimme;
2079     cxix = PL_curstackinfo->si_cxsubix;
2080     if (cxix < 0)
2081         return G_VOID;
2082     assert(cxstack[cxix].blk_gimme & G_WANT);
2083     return (cxstack[cxix].blk_gimme & G_WANT);
2084 }
2085
2086
2087 /* Enter a block. Push a new base context and return its address. */
2088
2089 PERL_STATIC_INLINE PERL_CONTEXT *
2090 Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
2091 {
2092     PERL_CONTEXT * cx;
2093
2094     PERL_ARGS_ASSERT_CX_PUSHBLOCK;
2095
2096     CXINC;
2097     cx = CX_CUR();
2098     cx->cx_type        = type;
2099     cx->blk_gimme      = gimme;
2100     cx->blk_oldsaveix  = saveix;
2101     cx->blk_oldsp      = (I32)(sp - PL_stack_base);
2102     cx->blk_oldcop     = PL_curcop;
2103     cx->blk_oldmarksp  = (I32)(PL_markstack_ptr - PL_markstack);
2104     cx->blk_oldscopesp = PL_scopestack_ix;
2105     cx->blk_oldpm      = PL_curpm;
2106     cx->blk_old_tmpsfloor = PL_tmps_floor;
2107
2108     PL_tmps_floor        = PL_tmps_ix;
2109     CX_DEBUG(cx, "PUSH");
2110     return cx;
2111 }
2112
2113
2114 /* Exit a block (RETURN and LAST). */
2115
2116 PERL_STATIC_INLINE void
2117 Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx)
2118 {
2119     PERL_ARGS_ASSERT_CX_POPBLOCK;
2120
2121     CX_DEBUG(cx, "POP");
2122     /* these 3 are common to cx_popblock and cx_topblock */
2123     PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2124     PL_scopestack_ix = cx->blk_oldscopesp;
2125     PL_curpm         = cx->blk_oldpm;
2126
2127     /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
2128      * and leaves a CX entry lying around for repeated use, so
2129      * skip for multicall */                  \
2130     assert(   (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
2131             || PL_savestack_ix == cx->blk_oldsaveix);
2132     PL_curcop     = cx->blk_oldcop;
2133     PL_tmps_floor = cx->blk_old_tmpsfloor;
2134 }
2135
2136 /* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
2137  * Whereas cx_popblock() restores the state to the point just before
2138  * cx_pushblock() was called,  cx_topblock() restores it to the point just
2139  * *after* cx_pushblock() was called. */
2140
2141 PERL_STATIC_INLINE void
2142 Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx)
2143 {
2144     PERL_ARGS_ASSERT_CX_TOPBLOCK;
2145
2146     CX_DEBUG(cx, "TOP");
2147     /* these 3 are common to cx_popblock and cx_topblock */
2148     PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2149     PL_scopestack_ix = cx->blk_oldscopesp;
2150     PL_curpm         = cx->blk_oldpm;
2151
2152     PL_stack_sp      = PL_stack_base + cx->blk_oldsp;
2153 }
2154
2155
2156 PERL_STATIC_INLINE void
2157 Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
2158 {
2159     U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
2160
2161     PERL_ARGS_ASSERT_CX_PUSHSUB;
2162
2163     PERL_DTRACE_PROBE_ENTRY(cv);
2164     cx->blk_sub.old_cxsubix     = PL_curstackinfo->si_cxsubix;
2165     PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
2166     cx->blk_sub.cv = cv;
2167     cx->blk_sub.olddepth = CvDEPTH(cv);
2168     cx->blk_sub.prevcomppad = PL_comppad;
2169     cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
2170     cx->blk_sub.retop = retop;
2171     SvREFCNT_inc_simple_void_NN(cv);
2172     cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
2173 }
2174
2175
2176 /* subsets of cx_popsub() */
2177
2178 PERL_STATIC_INLINE void
2179 Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
2180 {
2181     CV *cv;
2182
2183     PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
2184     assert(CxTYPE(cx) == CXt_SUB);
2185
2186     PL_comppad = cx->blk_sub.prevcomppad;
2187     PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2188     cv = cx->blk_sub.cv;
2189     CvDEPTH(cv) = cx->blk_sub.olddepth;
2190     cx->blk_sub.cv = NULL;
2191     SvREFCNT_dec(cv);
2192     PL_curstackinfo->si_cxsubix = cx->blk_sub.old_cxsubix;
2193 }
2194
2195
2196 /* handle the @_ part of leaving a sub */
2197
2198 PERL_STATIC_INLINE void
2199 Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
2200 {
2201     AV *av;
2202
2203     PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
2204     assert(CxTYPE(cx) == CXt_SUB);
2205     assert(AvARRAY(MUTABLE_AV(
2206         PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
2207                 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
2208
2209     CX_POP_SAVEARRAY(cx);
2210     av = MUTABLE_AV(PAD_SVl(0));
2211     if (UNLIKELY(AvREAL(av)))
2212         /* abandon @_ if it got reified */
2213         clear_defarray(av, 0);
2214     else {
2215         CLEAR_ARGARRAY(av);
2216     }
2217 }
2218
2219
2220 PERL_STATIC_INLINE void
2221 Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx)
2222 {
2223     PERL_ARGS_ASSERT_CX_POPSUB;
2224     assert(CxTYPE(cx) == CXt_SUB);
2225
2226     PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
2227
2228     if (CxHASARGS(cx))
2229         cx_popsub_args(cx);
2230     cx_popsub_common(cx);
2231 }
2232
2233
2234 PERL_STATIC_INLINE void
2235 Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
2236 {
2237     PERL_ARGS_ASSERT_CX_PUSHFORMAT;
2238
2239     cx->blk_format.old_cxsubix = PL_curstackinfo->si_cxsubix;
2240     PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
2241     cx->blk_format.cv          = cv;
2242     cx->blk_format.retop       = retop;
2243     cx->blk_format.gv          = gv;
2244     cx->blk_format.dfoutgv     = PL_defoutgv;
2245     cx->blk_format.prevcomppad = PL_comppad;
2246     cx->blk_u16                = 0;
2247
2248     SvREFCNT_inc_simple_void_NN(cv);
2249     CvDEPTH(cv)++;
2250     SvREFCNT_inc_void(cx->blk_format.dfoutgv);
2251 }
2252
2253
2254 PERL_STATIC_INLINE void
2255 Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx)
2256 {
2257     CV *cv;
2258     GV *dfout;
2259
2260     PERL_ARGS_ASSERT_CX_POPFORMAT;
2261     assert(CxTYPE(cx) == CXt_FORMAT);
2262
2263     dfout = cx->blk_format.dfoutgv;
2264     setdefout(dfout);
2265     cx->blk_format.dfoutgv = NULL;
2266     SvREFCNT_dec_NN(dfout);
2267
2268     PL_comppad = cx->blk_format.prevcomppad;
2269     PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2270     cv = cx->blk_format.cv;
2271     cx->blk_format.cv = NULL;
2272     --CvDEPTH(cv);
2273     SvREFCNT_dec_NN(cv);
2274     PL_curstackinfo->si_cxsubix = cx->blk_format.old_cxsubix;
2275 }
2276
2277
2278 PERL_STATIC_INLINE void
2279 Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
2280 {
2281     PERL_ARGS_ASSERT_CX_PUSHEVAL;
2282
2283     cx->blk_eval.old_cxsubix   = PL_curstackinfo->si_cxsubix;
2284     PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
2285     cx->blk_eval.retop         = retop;
2286     cx->blk_eval.old_namesv    = namesv;
2287     cx->blk_eval.old_eval_root = PL_eval_root;
2288     cx->blk_eval.cur_text      = PL_parser ? PL_parser->linestr : NULL;
2289     cx->blk_eval.cv            = NULL; /* later set by doeval_compile() */
2290     cx->blk_eval.cur_top_env   = PL_top_env;
2291
2292     assert(!(PL_in_eval     & ~ 0x3F));
2293     assert(!(PL_op->op_type & ~0x1FF));
2294     cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
2295 }
2296
2297
2298 PERL_STATIC_INLINE void
2299 Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx)
2300 {
2301     SV *sv;
2302
2303     PERL_ARGS_ASSERT_CX_POPEVAL;
2304     assert(CxTYPE(cx) == CXt_EVAL);
2305
2306     PL_in_eval = CxOLD_IN_EVAL(cx);
2307     assert(!(PL_in_eval & 0xc0));
2308     PL_eval_root = cx->blk_eval.old_eval_root;
2309     sv = cx->blk_eval.cur_text;
2310     if (sv && CxEVAL_TXT_REFCNTED(cx)) {
2311         cx->blk_eval.cur_text = NULL;
2312         SvREFCNT_dec_NN(sv);
2313     }
2314
2315     sv = cx->blk_eval.old_namesv;
2316     if (sv) {
2317         cx->blk_eval.old_namesv = NULL;
2318         SvREFCNT_dec_NN(sv);
2319     }
2320     PL_curstackinfo->si_cxsubix = cx->blk_eval.old_cxsubix;
2321 }
2322
2323
2324 /* push a plain loop, i.e.
2325  *     { block }
2326  *     while (cond) { block }
2327  *     for (init;cond;continue) { block }
2328  * This loop can be last/redo'ed etc.
2329  */
2330
2331 PERL_STATIC_INLINE void
2332 Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
2333 {
2334     PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
2335     cx->blk_loop.my_op = cLOOP;
2336 }
2337
2338
2339 /* push a true for loop, i.e.
2340  *     for var (list) { block }
2341  */
2342
2343 PERL_STATIC_INLINE void
2344 Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
2345 {
2346     PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
2347
2348     /* this one line is common with cx_pushloop_plain */
2349     cx->blk_loop.my_op = cLOOP;
2350
2351     cx->blk_loop.itervar_u.svp = (SV**)itervarp;
2352     cx->blk_loop.itersave      = itersave;
2353 #ifdef USE_ITHREADS
2354     cx->blk_loop.oldcomppad = PL_comppad;
2355 #endif
2356 }
2357
2358
2359 /* pop all loop types, including plain */
2360
2361 PERL_STATIC_INLINE void
2362 Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx)
2363 {
2364     PERL_ARGS_ASSERT_CX_POPLOOP;
2365
2366     assert(CxTYPE_is_LOOP(cx));
2367     if (  CxTYPE(cx) == CXt_LOOP_ARY
2368        || CxTYPE(cx) == CXt_LOOP_LAZYSV)
2369     {
2370         /* Free ary or cur. This assumes that state_u.ary.ary
2371          * aligns with state_u.lazysv.cur. See cx_dup() */
2372         SV *sv = cx->blk_loop.state_u.lazysv.cur;
2373         cx->blk_loop.state_u.lazysv.cur = NULL;
2374         SvREFCNT_dec_NN(sv);
2375         if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
2376             sv = cx->blk_loop.state_u.lazysv.end;
2377             cx->blk_loop.state_u.lazysv.end = NULL;
2378             SvREFCNT_dec_NN(sv);
2379         }
2380     }
2381     if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
2382         SV *cursv;
2383         SV **svp = (cx)->blk_loop.itervar_u.svp;
2384         if ((cx->cx_type & CXp_FOR_GV))
2385             svp = &GvSV((GV*)svp);
2386         cursv = *svp;
2387         *svp = cx->blk_loop.itersave;
2388         cx->blk_loop.itersave = NULL;
2389         SvREFCNT_dec(cursv);
2390     }
2391 }
2392
2393
2394 PERL_STATIC_INLINE void
2395 Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
2396 {
2397     PERL_ARGS_ASSERT_CX_PUSHWHEN;
2398
2399     cx->blk_givwhen.leave_op = cLOGOP->op_other;
2400 }
2401
2402
2403 PERL_STATIC_INLINE void
2404 Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
2405 {
2406     PERL_ARGS_ASSERT_CX_POPWHEN;
2407     assert(CxTYPE(cx) == CXt_WHEN);
2408
2409     PERL_UNUSED_ARG(cx);
2410     PERL_UNUSED_CONTEXT;
2411     /* currently NOOP */
2412 }
2413
2414
2415 PERL_STATIC_INLINE void
2416 Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
2417 {
2418     PERL_ARGS_ASSERT_CX_PUSHGIVEN;
2419
2420     cx->blk_givwhen.leave_op = cLOGOP->op_other;
2421     cx->blk_givwhen.defsv_save = orig_defsv;
2422 }
2423
2424
2425 PERL_STATIC_INLINE void
2426 Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
2427 {
2428     SV *sv;
2429
2430     PERL_ARGS_ASSERT_CX_POPGIVEN;
2431     assert(CxTYPE(cx) == CXt_GIVEN);
2432
2433     sv = GvSV(PL_defgv);
2434     GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
2435     cx->blk_givwhen.defsv_save = NULL;
2436     SvREFCNT_dec(sv);
2437 }
2438
2439 /* ------------------ util.h ------------------------------------------- */
2440
2441 /*
2442 =head1 Miscellaneous Functions
2443
2444 =for apidoc foldEQ
2445
2446 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
2447 same
2448 case-insensitively; false otherwise.  Uppercase and lowercase ASCII range bytes
2449 match themselves and their opposite case counterparts.  Non-cased and non-ASCII
2450 range bytes match only themselves.
2451
2452 =cut
2453 */
2454
2455 PERL_STATIC_INLINE I32
2456 Perl_foldEQ(const char *s1, const char *s2, I32 len)
2457 {
2458     const U8 *a = (const U8 *)s1;
2459     const U8 *b = (const U8 *)s2;
2460
2461     PERL_ARGS_ASSERT_FOLDEQ;
2462
2463     assert(len >= 0);
2464
2465     while (len--) {
2466         if (*a != *b && *a != PL_fold[*b])
2467             return 0;
2468         a++,b++;
2469     }
2470     return 1;
2471 }
2472
2473 PERL_STATIC_INLINE I32
2474 Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len)
2475 {
2476     /* Compare non-UTF-8 using Unicode (Latin1) semantics.  Works on all folds
2477      * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and
2478      * does not check for this.  Nor does it check that the strings each have
2479      * at least 'len' characters. */
2480
2481     const U8 *a = (const U8 *)s1;
2482     const U8 *b = (const U8 *)s2;
2483
2484     PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
2485
2486     assert(len >= 0);
2487
2488     while (len--) {
2489         if (*a != *b && *a != PL_fold_latin1[*b]) {
2490             return 0;
2491         }
2492         a++, b++;
2493     }
2494     return 1;
2495 }
2496
2497 /*
2498 =for apidoc foldEQ_locale
2499
2500 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
2501 same case-insensitively in the current locale; false otherwise.
2502
2503 =cut
2504 */
2505
2506 PERL_STATIC_INLINE I32
2507 Perl_foldEQ_locale(const char *s1, const char *s2, I32 len)
2508 {
2509     const U8 *a = (const U8 *)s1;
2510     const U8 *b = (const U8 *)s2;
2511
2512     PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
2513
2514     assert(len >= 0);
2515
2516     while (len--) {
2517         if (*a != *b && *a != PL_fold_locale[*b])
2518             return 0;
2519         a++,b++;
2520     }
2521     return 1;
2522 }
2523
2524 /*
2525 =for apidoc my_strnlen
2526
2527 The C library C<strnlen> if available, or a Perl implementation of it.
2528
2529 C<my_strnlen()> computes the length of the string, up to C<maxlen>
2530 characters.  It will never attempt to address more than C<maxlen>
2531 characters, making it suitable for use with strings that are not
2532 guaranteed to be NUL-terminated.
2533
2534 =cut
2535
2536 Description stolen from http://man.openbsd.org/strnlen.3,
2537 implementation stolen from PostgreSQL.
2538 */
2539 #ifndef HAS_STRNLEN
2540
2541 PERL_STATIC_INLINE Size_t
2542 Perl_my_strnlen(const char *str, Size_t maxlen)
2543 {
2544     const char *end = (char *) memchr(str, '\0', maxlen);
2545
2546     PERL_ARGS_ASSERT_MY_STRNLEN;
2547
2548     if (end == NULL) return maxlen;
2549     return end - str;
2550 }
2551
2552 #endif
2553
2554 #if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
2555
2556 PERL_STATIC_INLINE void *
2557 S_my_memrchr(const char * s, const char c, const STRLEN len)
2558 {
2559     /* memrchr(), since many platforms lack it */
2560
2561     const char * t = s + len - 1;
2562
2563     PERL_ARGS_ASSERT_MY_MEMRCHR;
2564
2565     while (t >= s) {
2566         if (*t == c) {
2567             return (void *) t;
2568         }
2569         t--;
2570     }
2571
2572     return NULL;
2573 }
2574
2575 #endif
2576
2577 PERL_STATIC_INLINE char *
2578 Perl_mortal_getenv(const char * str)
2579 {
2580     /* This implements a (mostly) thread-safe, sequential-call-safe getenv().
2581      *
2582      * It's (mostly) thread-safe because it uses a mutex to prevent
2583      * simultaneous access from other threads that use the same mutex, and
2584      * makes a copy of the result before releasing that mutex.  All of the Perl
2585      * core uses that mutex, but, like all mutexes, everything has to cooperate
2586      * for it to completely work.  It is possible for code from, say XS, to not
2587      * use this mutex, defeating the safety.
2588      *
2589      * On some platforms, getenv() is not sequential-call-safe, because
2590      * subsequent calls destroy the static storage inside the C library
2591      * returned by an earlier call.  The result must be copied or completely
2592      * acted upon before a subsequent getenv call.  Those calls could come from
2593      * another thread.  Again, making a copy while controlling the mutex
2594      * prevents these problems..
2595      *
2596      * To prevent leaks, the copy is made by creating a new SV containing it,
2597      * mortalizing the SV, and returning the SV's string (the copy).  Thus this
2598      * is a drop-in replacement for getenv().
2599      *
2600      * A complication is that this can be called during phases where the
2601      * mortalization process isn't available.  These are in interpreter
2602      * destruction or early in construction.  khw believes that at these times
2603      * there shouldn't be anything else going on, so plain getenv is safe AS
2604      * LONG AS the caller acts on the return before calling it again. */
2605
2606     char * ret;
2607     dTHX;
2608
2609     PERL_ARGS_ASSERT_MORTAL_GETENV;
2610
2611     /* Can't mortalize without stacks.  khw believes that no other threads
2612      * should be running, so no need to lock things, and this may be during a
2613      * phase when locking isn't even available */
2614     if (UNLIKELY(PL_scopestack_ix == 0)) {
2615         return getenv(str);
2616     }
2617
2618     ENV_LOCK;
2619
2620     ret = getenv(str);
2621
2622     if (ret != NULL) {
2623         ret = SvPVX(sv_2mortal(newSVpv(ret, 0)));
2624     }
2625
2626     ENV_UNLOCK;
2627     return ret;
2628 }
2629
2630 /*
2631  * ex: set ts=8 sts=4 sw=4 et:
2632  */