inline.h

   1 /*    inline.h
   2  *
   3  *    Copyright (C) 2012 by Larry Wall and others
   4  *
   5  *    You may distribute under the terms of either the GNU General Public
   6  *    License or the Artistic License, as specified in the README file.
   7  *
   8  * This file is a home for static inline functions that cannot go in other
   9  * headers files, because they depend on proto.h (included after most other
  10  * headers) or struct definitions.
  11  *
  12  * Each section names the header file that the functions "belong" to.
  13  */
  14
  15 /* ------------------------------- av.h ------------------------------- */
  16
  17 PERL_STATIC_INLINE SSize_t
  18 S_av_top_index(pTHX_ AV *av)
  19 {
  20     PERL_ARGS_ASSERT_AV_TOP_INDEX;
  21     assert(SvTYPE(av) == SVt_PVAV);
  22
  23     return AvFILL(av);
  24 }
  25
  26 /* ------------------------------- cv.h ------------------------------- */
  27
  28 PERL_STATIC_INLINE GV *
  29 S_CvGV(pTHX_ CV *sv)
  30 {
  31     return CvNAMED(sv)
  32         ? Perl_cvgv_from_hek(aTHX_ sv)
  33         : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
  34 }
  35
  36 PERL_STATIC_INLINE I32 *
  37 S_CvDEPTHp(const CV * const sv)
  38 {
  39     assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
  40     return &((XPVCV*)SvANY(sv))->xcv_depth;
  41 }
  42
  43 /*
  44  CvPROTO returns the prototype as stored, which is not necessarily what
  45  the interpreter should be using. Specifically, the interpreter assumes
  46  that spaces have been stripped, which has been the case if the prototype
  47  was added by toke.c, but is generally not the case if it was added elsewhere.
  48  Since we can't enforce the spacelessness at assignment time, this routine
  49  provides a temporary copy at parse time with spaces removed.
  50  I<orig> is the start of the original buffer, I<len> is the length of the
  51  prototype and will be updated when this returns.
  52  */
  53
  54 #ifdef PERL_CORE
  55 PERL_STATIC_INLINE char *
  56 S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
  57 {
  58     SV * tmpsv;
  59     char * tmps;
  60     tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
  61     tmps = SvPVX(tmpsv);
  62     while ((*len)--) {
  63         if (!isSPACE(*orig))
  64             *tmps++ = *orig;
  65         orig++;
  66     }
  67     *tmps = '\0';
  68     *len = tmps - SvPVX(tmpsv);
  69                 return SvPVX(tmpsv);
  70 }
  71 #endif
  72
  73 /* ------------------------------- mg.h ------------------------------- */
  74
  75 #if defined(PERL_CORE) || defined(PERL_EXT)
  76 /* assumes get-magic and stringification have already occurred */
  77 PERL_STATIC_INLINE STRLEN
  78 S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
  79 {
  80     assert(mg->mg_type == PERL_MAGIC_regex_global);
  81     assert(mg->mg_len != -1);
  82     if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv))
  83         return (STRLEN)mg->mg_len;
  84     else {
  85         const STRLEN pos = (STRLEN)mg->mg_len;
  86         /* Without this check, we may read past the end of the buffer: */
  87         if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
  88         return sv_or_pv_pos_u2b(sv, s, pos, NULL);
  89     }
  90 }
  91 #endif
  92
  93 /* ------------------------------- pad.h ------------------------------ */
  94
  95 #if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
  96 PERL_STATIC_INLINE bool
  97 PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
  98 {
  99     /* is seq within the range _LOW to _HIGH ?
 100      * This is complicated by the fact that PL_cop_seqmax
 101      * may have wrapped around at some point */
 102     if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
 103         return FALSE; /* not yet introduced */
 104
 105     if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
 106     /* in compiling scope */
 107         if (
 108             (seq >  COP_SEQ_RANGE_LOW(pn))
 109             ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
 110             : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
 111         )
 112             return TRUE;
 113     }
 114     else if (
 115         (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
 116         ?
 117             (  seq >  COP_SEQ_RANGE_LOW(pn)
 118             || seq <= COP_SEQ_RANGE_HIGH(pn))
 119
 120         :    (  seq >  COP_SEQ_RANGE_LOW(pn)
 121              && seq <= COP_SEQ_RANGE_HIGH(pn))
 122     )
 123         return TRUE;
 124     return FALSE;
 125 }
 126 #endif
 127
 128 /* ------------------------------- pp.h ------------------------------- */
 129
 130 PERL_STATIC_INLINE I32
 131 S_TOPMARK(pTHX)
 132 {
 133     DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
 134                                  "MARK top  %p %" IVdf "\n",
 135                                   PL_markstack_ptr,
 136                                   (IV)*PL_markstack_ptr)));
 137     return *PL_markstack_ptr;
 138 }
 139
 140 PERL_STATIC_INLINE I32
 141 S_POPMARK(pTHX)
 142 {
 143     DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
 144                                  "MARK pop  %p %" IVdf "\n",
 145                                   (PL_markstack_ptr-1),
 146                                   (IV)*(PL_markstack_ptr-1))));
 147     assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
 148     return *PL_markstack_ptr--;
 149 }
 150
 151 /* ----------------------------- regexp.h ----------------------------- */
 152
 153 PERL_STATIC_INLINE struct regexp *
 154 S_ReANY(const REGEXP * const re)
 155 {
 156     XPV* const p = (XPV*)SvANY(re);
 157     assert(isREGEXP(re));
 158     return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
 159                                    : (struct regexp *)p;
 160 }
 161
 162 /* ------------------------------- sv.h ------------------------------- */
 163
 164 PERL_STATIC_INLINE SV *
 165 S_SvREFCNT_inc(SV *sv)
 166 {
 167     if (LIKELY(sv != NULL))
 168         SvREFCNT(sv)++;
 169     return sv;
 170 }
 171 PERL_STATIC_INLINE SV *
 172 S_SvREFCNT_inc_NN(SV *sv)
 173 {
 174     SvREFCNT(sv)++;
 175     return sv;
 176 }
 177 PERL_STATIC_INLINE void
 178 S_SvREFCNT_inc_void(SV *sv)
 179 {
 180     if (LIKELY(sv != NULL))
 181         SvREFCNT(sv)++;
 182 }
 183 PERL_STATIC_INLINE void
 184 S_SvREFCNT_dec(pTHX_ SV *sv)
 185 {
 186     if (LIKELY(sv != NULL)) {
 187         U32 rc = SvREFCNT(sv);
 188         if (LIKELY(rc > 1))
 189             SvREFCNT(sv) = rc - 1;
 190         else
 191             Perl_sv_free2(aTHX_ sv, rc);
 192     }
 193 }
 194
 195 PERL_STATIC_INLINE void
 196 S_SvREFCNT_dec_NN(pTHX_ SV *sv)
 197 {
 198     U32 rc = SvREFCNT(sv);
 199     if (LIKELY(rc > 1))
 200         SvREFCNT(sv) = rc - 1;
 201     else
 202         Perl_sv_free2(aTHX_ sv, rc);
 203 }
 204
 205 PERL_STATIC_INLINE void
 206 SvAMAGIC_on(SV *sv)
 207 {
 208     assert(SvROK(sv));
 209     if (SvOBJECT(SvRV(sv))) HvAMAGIC_on(SvSTASH(SvRV(sv)));
 210 }
 211 PERL_STATIC_INLINE void
 212 SvAMAGIC_off(SV *sv)
 213 {
 214     if (SvROK(sv) && SvOBJECT(SvRV(sv)))
 215         HvAMAGIC_off(SvSTASH(SvRV(sv)));
 216 }
 217
 218 PERL_STATIC_INLINE U32
 219 S_SvPADSTALE_on(SV *sv)
 220 {
 221     assert(!(SvFLAGS(sv) & SVs_PADTMP));
 222     return SvFLAGS(sv) |= SVs_PADSTALE;
 223 }
 224 PERL_STATIC_INLINE U32
 225 S_SvPADSTALE_off(SV *sv)
 226 {
 227     assert(!(SvFLAGS(sv) & SVs_PADTMP));
 228     return SvFLAGS(sv) &= ~SVs_PADSTALE;
 229 }
 230 #if defined(PERL_CORE) || defined (PERL_EXT)
 231 PERL_STATIC_INLINE STRLEN
 232 S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp)
 233 {
 234     PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B;
 235     if (SvGAMAGIC(sv)) {
 236         U8 *hopped = utf8_hop((U8 *)pv, pos);
 237         if (lenp) *lenp = (STRLEN)(utf8_hop(hopped, *lenp) - hopped);
 238         return (STRLEN)(hopped - (U8 *)pv);
 239     }
 240     return sv_pos_u2b_flags(sv,pos,lenp,SV_CONST_RETURN);
 241 }
 242 #endif
 243
 244 /* ------------------------------- handy.h ------------------------------- */
 245
 246 /* saves machine code for a common noreturn idiom typically used in Newx*() */
 247 #ifdef GCC_DIAG_PRAGMA
 248 GCC_DIAG_IGNORE(-Wunused-function) /* Intentionally left semicolonless. */
 249 #endif
 250 static void
 251 S_croak_memory_wrap(void)
 252 {
 253     Perl_croak_nocontext("%s",PL_memory_wrap);
 254 }
 255 #ifdef GCC_DIAG_PRAGMA
 256 GCC_DIAG_RESTORE /* Intentionally left semicolonless. */
 257 #endif
 258
 259 /* ------------------------------- utf8.h ------------------------------- */
 260
 261 /*
 262 =head1 Unicode Support
 263 */
 264
 265 PERL_STATIC_INLINE void
 266 S_append_utf8_from_native_byte(const U8 byte, U8** dest)
 267 {
 268     /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
 269      * encoded string at '*dest', updating '*dest' to include it */
 270
 271     PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
 272
 273     if (NATIVE_BYTE_IS_INVARIANT(byte))
 274         *((*dest)++) = byte;
 275     else {
 276         *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
 277         *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
 278     }
 279 }
 280
 281 /*
 282 =for apidoc valid_utf8_to_uvchr
 283 Like C<L</utf8_to_uvchr_buf>>, but should only be called when it is known that
 284 the next character in the input UTF-8 string C<s> is well-formed (I<e.g.>,
 285 it passes C<L</isUTF8_CHAR>>.  Surrogates, non-character code points, and
 286 non-Unicode code points are allowed.
 287
 288 =cut
 289
 290  */
 291
 292 PERL_STATIC_INLINE UV
 293 Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
 294 {
 295     const UV expectlen = UTF8SKIP(s);
 296     const U8* send = s + expectlen;
 297     UV uv = *s;
 298
 299     PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
 300
 301     if (retlen) {
 302         *retlen = expectlen;
 303     }
 304
 305     /* An invariant is trivially returned */
 306     if (expectlen == 1) {
 307         return uv;
 308     }
 309
 310     /* Remove the leading bits that indicate the number of bytes, leaving just
 311      * the bits that are part of the value */
 312     uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
 313
 314     /* Now, loop through the remaining bytes, accumulating each into the
 315      * working total as we go.  (I khw tried unrolling the loop for up to 4
 316      * bytes, but there was no performance improvement) */
 317     for (++s; s < send; s++) {
 318         uv = UTF8_ACCUMULATE(uv, *s);
 319     }
 320
 321     return UNI_TO_NATIVE(uv);
 322
 323 }
 324
 325 /*
 326 =for apidoc is_utf8_invariant_string
 327
 328 Returns TRUE if the first C<len> bytes of the string C<s> are the same
 329 regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
 330 EBCDIC machines); otherwise it returns FALSE.  That is, it returns TRUE if they
 331 are UTF-8 invariant.  On ASCII-ish machines, all the ASCII characters and only
 332 the ASCII characters fit this definition.  On EBCDIC machines, the ASCII-range
 333 characters are invariant, but so also are the C1 controls.
 334
 335 If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
 336 use this option, that C<s> can't have embedded C<NUL> characters and has to
 337 have a terminating C<NUL> byte).
 338
 339 See also
 340 C<L</is_utf8_string>>,
 341 C<L</is_utf8_string_flags>>,
 342 C<L</is_utf8_string_loc>>,
 343 C<L</is_utf8_string_loc_flags>>,
 344 C<L</is_utf8_string_loclen>>,
 345 C<L</is_utf8_string_loclen_flags>>,
 346 C<L</is_utf8_fixed_width_buf_flags>>,
 347 C<L</is_utf8_fixed_width_buf_loc_flags>>,
 348 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
 349 C<L</is_strict_utf8_string>>,
 350 C<L</is_strict_utf8_string_loc>>,
 351 C<L</is_strict_utf8_string_loclen>>,
 352 C<L</is_c9strict_utf8_string>>,
 353 C<L</is_c9strict_utf8_string_loc>>,
 354 and
 355 C<L</is_c9strict_utf8_string_loclen>>.
 356
 357 =cut
 358
 359 */
 360
 361 #define is_utf8_invariant_string(s, len)                                    \
 362                                 is_utf8_invariant_string_loc(s, len, NULL)
 363
 364 /*
 365 =for apidoc is_utf8_invariant_string_loc
 366
 367 Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
 368 the first UTF-8 variant character in the C<ep> pointer; if all characters are
 369 UTF-8 invariant, this function does not change the contents of C<*ep>.
 370
 371 =cut
 372
 373 */
 374
 375 PERL_STATIC_INLINE bool
 376 S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
 377 {
 378     const U8* send;
 379     const U8* x = s;
 380
 381     PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
 382
 383     if (len == 0) {
 384         len = strlen((const char *)s);
 385     }
 386
 387     send = s + len;
 388
 389 #ifndef EBCDIC
 390     /* Try to get the widest word on this platform */
 391 #  ifdef HAS_LONG_LONG
 392 #    define PERL_WORDCAST unsigned long long
 393 #    define PERL_WORDSIZE LONGLONGSIZE
 394 #  else
 395 #    define PERL_WORDCAST UV
 396 #    define PERL_WORDSIZE UVSIZE
 397 #  endif
 398
 399 #  if PERL_WORDSIZE == 4
 400 #    define PERL_VARIANTS_WORD_MASK 0x80808080
 401 #    define PERL_WORD_BOUNDARY_MASK 0x3
 402 #  elif PERL_WORDSIZE == 8
 403 #    define PERL_VARIANTS_WORD_MASK UINT64_C(0x8080808080808080)
 404 #    define PERL_WORD_BOUNDARY_MASK 0x7
 405 #  else
 406 #    error Unexpected word size
 407 #  endif
 408
 409     /* Process per-byte until reach word boundary.  XXX This loop could be
 410      * eliminated if we knew that this platform had fast unaligned reads */
 411     while (x < send && (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK)) {
 412         if (! UTF8_IS_INVARIANT(*x)) {
 413             if (ep) {
 414                 *ep = x;
 415             }
 416
 417             return FALSE;
 418         }
 419         x++;
 420     }
 421
 422     /* Process per-word as long as we have at least a full word left */
 423     while (x + PERL_WORDSIZE <= send) {
 424         if ((* (PERL_WORDCAST *) x) & PERL_VARIANTS_WORD_MASK)  {
 425
 426             /* Found a variant.  Just return if caller doesn't want its exact
 427              * position */
 428             if (! ep) {
 429                 return FALSE;
 430             }
 431
 432             /* Otherwise fall into final loop to find which byte it is */
 433             break;
 434         }
 435         x += PERL_WORDSIZE;
 436     }
 437
 438 #  undef PERL_WORDCAST
 439 #  undef PERL_WORDSIZE
 440 #  undef PERL_WORD_BOUNDARY_MASK
 441 #  undef PERL_VARIANTS_WORD_MASK
 442 #endif
 443
 444     /* Process per-byte */
 445     while (x < send) {
 446         if (! UTF8_IS_INVARIANT(*x)) {
 447             if (ep) {
 448                 *ep = x;
 449             }
 450
 451             return FALSE;
 452         }
 453
 454         x++;
 455     }
 456
 457     return TRUE;
 458 }
 459
 460 /*
 461 =for apidoc is_utf8_string
 462
 463 Returns TRUE if the first C<len> bytes of string C<s> form a valid
 464 Perl-extended-UTF-8 string; returns FALSE otherwise.  If C<len> is 0, it will
 465 be calculated using C<strlen(s)> (which means if you use this option, that C<s>
 466 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
 467 byte).  Note that all characters being ASCII constitute 'a valid UTF-8 string'.
 468
 469 This function considers Perl's extended UTF-8 to be valid.  That means that
 470 code points above Unicode, surrogates, and non-character code points are
 471 considered valid by this function.  Use C<L</is_strict_utf8_string>>,
 472 C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
 473 code points are considered valid.
 474
 475 See also
 476 C<L</is_utf8_invariant_string>>,
 477 C<L</is_utf8_invariant_string_loc>>,
 478 C<L</is_utf8_string_loc>>,
 479 C<L</is_utf8_string_loclen>>,
 480 C<L</is_utf8_fixed_width_buf_flags>>,
 481 C<L</is_utf8_fixed_width_buf_loc_flags>>,
 482 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
 483
 484 =cut
 485 */
 486
 487 #define is_utf8_string(s, len)  is_utf8_string_loclen(s, len, NULL, NULL)
 488
 489 #if defined(PERL_CORE) || defined (PERL_EXT)
 490
 491 /*
 492 =for apidoc is_utf8_non_invariant_string
 493
 494 Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
 495 C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
 496 UTF-8; otherwise returns FALSE.
 497
 498 A TRUE return means that at least one code point represented by the sequence
 499 either is a wide character not representable as a single byte, or the
 500 representation differs depending on whether the sequence is encoded in UTF-8 or
 501 not.
 502
 503 See also
 504 C<L<perlapi/is_utf8_invariant_string>>,
 505 C<L<perlapi/is_utf8_string>>
 506
 507 =cut
 508
 509 This is commonly used to determine if a SV's UTF-8 flag should be turned on.
 510 It needn't be if its string is entirely UTF-8 invariant, and it shouldn't be if
 511 it otherwise contains invalid UTF-8.
 512
 513 It is an internal function because khw thinks that XS code shouldn't be working
 514 at this low a level.  A valid use case could change that.
 515
 516 */
 517
 518 PERL_STATIC_INLINE bool
 519 S_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
 520 {
 521     const U8 * first_variant;
 522
 523     PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
 524
 525     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
 526         return FALSE;
 527     }
 528
 529     return is_utf8_string(first_variant, len - (first_variant - s));
 530 }
 531
 532 #endif
 533
 534 /*
 535 =for apidoc is_strict_utf8_string
 536
 537 Returns TRUE if the first C<len> bytes of string C<s> form a valid
 538 UTF-8-encoded string that is fully interchangeable by any application using
 539 Unicode rules; otherwise it returns FALSE.  If C<len> is 0, it will be
 540 calculated using C<strlen(s)> (which means if you use this option, that C<s>
 541 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
 542 byte).  Note that all characters being ASCII constitute 'a valid UTF-8 string'.
 543
 544 This function returns FALSE for strings containing any
 545 code points above the Unicode max of 0x10FFFF, surrogate code points, or
 546 non-character code points.
 547
 548 See also
 549 C<L</is_utf8_invariant_string>>,
 550 C<L</is_utf8_invariant_string_loc>>,
 551 C<L</is_utf8_string>>,
 552 C<L</is_utf8_string_flags>>,
 553 C<L</is_utf8_string_loc>>,
 554 C<L</is_utf8_string_loc_flags>>,
 555 C<L</is_utf8_string_loclen>>,
 556 C<L</is_utf8_string_loclen_flags>>,
 557 C<L</is_utf8_fixed_width_buf_flags>>,
 558 C<L</is_utf8_fixed_width_buf_loc_flags>>,
 559 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
 560 C<L</is_strict_utf8_string_loc>>,
 561 C<L</is_strict_utf8_string_loclen>>,
 562 C<L</is_c9strict_utf8_string>>,
 563 C<L</is_c9strict_utf8_string_loc>>,
 564 and
 565 C<L</is_c9strict_utf8_string_loclen>>.
 566
 567 =cut
 568 */
 569
 570 #define is_strict_utf8_string(s, len)  is_strict_utf8_string_loclen(s, len, NULL, NULL)
 571
 572 /*
 573 =for apidoc is_c9strict_utf8_string
 574
 575 Returns TRUE if the first C<len> bytes of string C<s> form a valid
 576 UTF-8-encoded string that conforms to
 577 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
 578 otherwise it returns FALSE.  If C<len> is 0, it will be calculated using
 579 C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
 580 C<NUL> characters and has to have a terminating C<NUL> byte).  Note that all
 581 characters being ASCII constitute 'a valid UTF-8 string'.
 582
 583 This function returns FALSE for strings containing any code points above the
 584 Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
 585 code points per
 586 L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
 587
 588 See also
 589 C<L</is_utf8_invariant_string>>,
 590 C<L</is_utf8_invariant_string_loc>>,
 591 C<L</is_utf8_string>>,
 592 C<L</is_utf8_string_flags>>,
 593 C<L</is_utf8_string_loc>>,
 594 C<L</is_utf8_string_loc_flags>>,
 595 C<L</is_utf8_string_loclen>>,
 596 C<L</is_utf8_string_loclen_flags>>,
 597 C<L</is_utf8_fixed_width_buf_flags>>,
 598 C<L</is_utf8_fixed_width_buf_loc_flags>>,
 599 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
 600 C<L</is_strict_utf8_string>>,
 601 C<L</is_strict_utf8_string_loc>>,
 602 C<L</is_strict_utf8_string_loclen>>,
 603 C<L</is_c9strict_utf8_string_loc>>,
 604 and
 605 C<L</is_c9strict_utf8_string_loclen>>.
 606
 607 =cut
 608 */
 609
 610 #define is_c9strict_utf8_string(s, len)  is_c9strict_utf8_string_loclen(s, len, NULL, 0)
 611
 612 /*
 613 =for apidoc is_utf8_string_flags
 614
 615 Returns TRUE if the first C<len> bytes of string C<s> form a valid
 616 UTF-8 string, subject to the restrictions imposed by C<flags>;
 617 returns FALSE otherwise.  If C<len> is 0, it will be calculated
 618 using C<strlen(s)> (which means if you use this option, that C<s> can't have
 619 embedded C<NUL> characters and has to have a terminating C<NUL> byte).  Note
 620 that all characters being ASCII constitute 'a valid UTF-8 string'.
 621
 622 If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
 623 C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
 624 as C<L</is_strict_utf8_string>>; and if C<flags> is
 625 C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
 626 C<L</is_c9strict_utf8_string>>.  Otherwise C<flags> may be any
 627 combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
 628 C<L</utf8n_to_uvchr>>, with the same meanings.
 629
 630 See also
 631 C<L</is_utf8_invariant_string>>,
 632 C<L</is_utf8_invariant_string_loc>>,
 633 C<L</is_utf8_string>>,
 634 C<L</is_utf8_string_loc>>,
 635 C<L</is_utf8_string_loc_flags>>,
 636 C<L</is_utf8_string_loclen>>,
 637 C<L</is_utf8_string_loclen_flags>>,
 638 C<L</is_utf8_fixed_width_buf_flags>>,
 639 C<L</is_utf8_fixed_width_buf_loc_flags>>,
 640 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
 641 C<L</is_strict_utf8_string>>,
 642 C<L</is_strict_utf8_string_loc>>,
 643 C<L</is_strict_utf8_string_loclen>>,
 644 C<L</is_c9strict_utf8_string>>,
 645 C<L</is_c9strict_utf8_string_loc>>,
 646 and
 647 C<L</is_c9strict_utf8_string_loclen>>.
 648
 649 =cut
 650 */
 651
 652 PERL_STATIC_INLINE bool
 653 S_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
 654 {
 655     const U8 * first_variant;
 656
 657     PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
 658     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
 659                           |UTF8_DISALLOW_PERL_EXTENDED)));
 660
 661     if (len == 0) {
 662         len = strlen((const char *)s);
 663     }
 664
 665     if (flags == 0) {
 666         return is_utf8_string(s, len);
 667     }
 668
 669     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
 670                                         == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
 671     {
 672         return is_strict_utf8_string(s, len);
 673     }
 674
 675     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
 676                                        == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
 677     {
 678         return is_c9strict_utf8_string(s, len);
 679     }
 680
 681     if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
 682         const U8* const send = s + len;
 683         const U8* x = first_variant;
 684
 685         while (x < send) {
 686             STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
 687             if (UNLIKELY(! cur_len)) {
 688                 return FALSE;
 689             }
 690             x += cur_len;
 691         }
 692     }
 693
 694     return TRUE;
 695 }
 696
 697 /*
 698
 699 =for apidoc is_utf8_string_loc
 700
 701 Like C<L</is_utf8_string>> but stores the location of the failure (in the
 702 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
 703 "utf8ness success") in the C<ep> pointer.
 704
 705 See also C<L</is_utf8_string_loclen>>.
 706
 707 =cut
 708 */
 709
 710 #define is_utf8_string_loc(s, len, ep)  is_utf8_string_loclen(s, len, ep, 0)
 711
 712 /*
 713
 714 =for apidoc is_utf8_string_loclen
 715
 716 Like C<L</is_utf8_string>> but stores the location of the failure (in the
 717 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
 718 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
 719 encoded characters in the C<el> pointer.
 720
 721 See also C<L</is_utf8_string_loc>>.
 722
 723 =cut
 724 */
 725
 726 PERL_STATIC_INLINE bool
 727 Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
 728 {
 729     const U8 * first_variant;
 730
 731     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
 732
 733     if (len == 0) {
 734         len = strlen((const char *) s);
 735     }
 736
 737     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
 738         if (el)
 739             *el = len;
 740
 741         if (ep) {
 742             *ep = s + len;
 743         }
 744
 745         return TRUE;
 746     }
 747
 748     {
 749         const U8* const send = s + len;
 750         const U8* x = first_variant;
 751         STRLEN outlen = first_variant - s;
 752
 753         while (x < send) {
 754             const STRLEN cur_len = isUTF8_CHAR(x, send);
 755             if (UNLIKELY(! cur_len)) {
 756                 break;
 757             }
 758             x += cur_len;
 759             outlen++;
 760         }
 761
 762         if (el)
 763             *el = outlen;
 764
 765         if (ep) {
 766             *ep = x;
 767         }
 768
 769         return (x == send);
 770     }
 771 }
 772
 773 /*
 774
 775 =for apidoc is_strict_utf8_string_loc
 776
 777 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
 778 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
 779 "utf8ness success") in the C<ep> pointer.
 780
 781 See also C<L</is_strict_utf8_string_loclen>>.
 782
 783 =cut
 784 */
 785
 786 #define is_strict_utf8_string_loc(s, len, ep)                               \
 787                                 is_strict_utf8_string_loclen(s, len, ep, 0)
 788
 789 /*
 790
 791 =for apidoc is_strict_utf8_string_loclen
 792
 793 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
 794 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
 795 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
 796 encoded characters in the C<el> pointer.
 797
 798 See also C<L</is_strict_utf8_string_loc>>.
 799
 800 =cut
 801 */
 802
 803 PERL_STATIC_INLINE bool
 804 S_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
 805 {
 806     const U8 * first_variant;
 807
 808     PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
 809
 810     if (len == 0) {
 811         len = strlen((const char *) s);
 812     }
 813
 814     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
 815         if (el)
 816             *el = len;
 817
 818         if (ep) {
 819             *ep = s + len;
 820         }
 821
 822         return TRUE;
 823     }
 824
 825     {
 826         const U8* const send = s + len;
 827         const U8* x = first_variant;
 828         STRLEN outlen = first_variant - s;
 829
 830         while (x < send) {
 831             const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
 832             if (UNLIKELY(! cur_len)) {
 833                 break;
 834             }
 835             x += cur_len;
 836             outlen++;
 837         }
 838
 839         if (el)
 840             *el = outlen;
 841
 842         if (ep) {
 843             *ep = x;
 844         }
 845
 846         return (x == send);
 847     }
 848 }
 849
 850 /*
 851
 852 =for apidoc is_c9strict_utf8_string_loc
 853
 854 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
 855 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
 856 "utf8ness success") in the C<ep> pointer.
 857
 858 See also C<L</is_c9strict_utf8_string_loclen>>.
 859
 860 =cut
 861 */
 862
 863 #define is_c9strict_utf8_string_loc(s, len, ep)                             \
 864                             is_c9strict_utf8_string_loclen(s, len, ep, 0)
 865
 866 /*
 867
 868 =for apidoc is_c9strict_utf8_string_loclen
 869
 870 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
 871 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
 872 "utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
 873 characters in the C<el> pointer.
 874
 875 See also C<L</is_c9strict_utf8_string_loc>>.
 876
 877 =cut
 878 */
 879
 880 PERL_STATIC_INLINE bool
 881 S_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
 882 {
 883     const U8 * first_variant;
 884
 885     PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
 886
 887     if (len == 0) {
 888         len = strlen((const char *) s);
 889     }
 890
 891     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
 892         if (el)
 893             *el = len;
 894
 895         if (ep) {
 896             *ep = s + len;
 897         }
 898
 899         return TRUE;
 900     }
 901
 902     {
 903         const U8* const send = s + len;
 904         const U8* x = first_variant;
 905         STRLEN outlen = first_variant - s;
 906
 907         while (x < send) {
 908             const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
 909             if (UNLIKELY(! cur_len)) {
 910                 break;
 911             }
 912             x += cur_len;
 913             outlen++;
 914         }
 915
 916         if (el)
 917             *el = outlen;
 918
 919         if (ep) {
 920             *ep = x;
 921         }
 922
 923         return (x == send);
 924     }
 925 }
 926
 927 /*
 928
 929 =for apidoc is_utf8_string_loc_flags
 930
 931 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
 932 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
 933 "utf8ness success") in the C<ep> pointer.
 934
 935 See also C<L</is_utf8_string_loclen_flags>>.
 936
 937 =cut
 938 */
 939
 940 #define is_utf8_string_loc_flags(s, len, ep, flags)                         \
 941                         is_utf8_string_loclen_flags(s, len, ep, 0, flags)
 942
 943
 944 /* The above 3 actual functions could have been moved into the more general one
 945  * just below, and made #defines that call it with the right 'flags'.  They are
 946  * currently kept separate to increase their chances of getting inlined */
 947
 948 /*
 949
 950 =for apidoc is_utf8_string_loclen_flags
 951
 952 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
 953 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
 954 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
 955 encoded characters in the C<el> pointer.
 956
 957 See also C<L</is_utf8_string_loc_flags>>.
 958
 959 =cut
 960 */
 961
 962 PERL_STATIC_INLINE bool
 963 S_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
 964 {
 965     const U8 * first_variant;
 966
 967     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
 968     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
 969                           |UTF8_DISALLOW_PERL_EXTENDED)));
 970
 971     if (len == 0) {
 972         len = strlen((const char *) s);
 973     }
 974
 975     if (flags == 0) {
 976         return is_utf8_string_loclen(s, len, ep, el);
 977     }
 978
 979     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
 980                                         == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
 981     {
 982         return is_strict_utf8_string_loclen(s, len, ep, el);
 983     }
 984
 985     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
 986                                     == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
 987     {
 988         return is_c9strict_utf8_string_loclen(s, len, ep, el);
 989     }
 990
 991     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
 992         if (el)
 993             *el = len;
 994
 995         if (ep) {
 996             *ep = s + len;
 997         }
 998
 999         return TRUE;
1000     }
1001
1002     {
1003         const U8* send = s + len;
1004         const U8* x = first_variant;
1005         STRLEN outlen = first_variant - s;
1006
1007         while (x < send) {
1008             const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1009             if (UNLIKELY(! cur_len)) {
1010                 break;
1011             }
1012             x += cur_len;
1013             outlen++;
1014         }
1015
1016         if (el)
1017             *el = outlen;
1018
1019         if (ep) {
1020             *ep = x;
1021         }
1022
1023         return (x == send);
1024     }
1025 }
1026
1027 /*
1028 =for apidoc utf8_distance
1029
1030 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
1031 and C<b>.
1032
1033 WARNING: use only if you *know* that the pointers point inside the
1034 same UTF-8 buffer.
1035
1036 =cut
1037 */
1038
1039 PERL_STATIC_INLINE IV
1040 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
1041 {
1042     PERL_ARGS_ASSERT_UTF8_DISTANCE;
1043
1044     return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
1045 }
1046
1047 /*
1048 =for apidoc utf8_hop
1049
1050 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
1051 forward or backward.
1052
1053 WARNING: do not use the following unless you *know* C<off> is within
1054 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
1055 on the first byte of character or just after the last byte of a character.
1056
1057 =cut
1058 */
1059
1060 PERL_STATIC_INLINE U8 *
1061 Perl_utf8_hop(const U8 *s, SSize_t off)
1062 {
1063     PERL_ARGS_ASSERT_UTF8_HOP;
1064
1065     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1066      * the bitops (especially ~) can create illegal UTF-8.
1067      * In other words: in Perl UTF-8 is not just for Unicode. */
1068
1069     if (off >= 0) {
1070         while (off--)
1071             s += UTF8SKIP(s);
1072     }
1073     else {
1074         while (off++) {
1075             s--;
1076             while (UTF8_IS_CONTINUATION(*s))
1077                 s--;
1078         }
1079     }
1080     GCC_DIAG_IGNORE(-Wcast-qual);
1081     return (U8 *)s;
1082     GCC_DIAG_RESTORE;
1083 }
1084
1085 /*
1086 =for apidoc utf8_hop_forward
1087
1088 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1089 forward.
1090
1091 C<off> must be non-negative.
1092
1093 C<s> must be before or equal to C<end>.
1094
1095 When moving forward it will not move beyond C<end>.
1096
1097 Will not exceed this limit even if the string is not valid "UTF-8".
1098
1099 =cut
1100 */
1101
1102 PERL_STATIC_INLINE U8 *
1103 Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
1104 {
1105     PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
1106
1107     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1108      * the bitops (especially ~) can create illegal UTF-8.
1109      * In other words: in Perl UTF-8 is not just for Unicode. */
1110
1111     assert(s <= end);
1112     assert(off >= 0);
1113
1114     while (off--) {
1115         STRLEN skip = UTF8SKIP(s);
1116         if ((STRLEN)(end - s) <= skip) {
1117             GCC_DIAG_IGNORE(-Wcast-qual);
1118             return (U8 *)end;
1119             GCC_DIAG_RESTORE;
1120         }
1121         s += skip;
1122     }
1123
1124     GCC_DIAG_IGNORE(-Wcast-qual);
1125     return (U8 *)s;
1126     GCC_DIAG_RESTORE;
1127 }
1128
1129 /*
1130 =for apidoc utf8_hop_back
1131
1132 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1133 backward.
1134
1135 C<off> must be non-positive.
1136
1137 C<s> must be after or equal to C<start>.
1138
1139 When moving backward it will not move before C<start>.
1140
1141 Will not exceed this limit even if the string is not valid "UTF-8".
1142
1143 =cut
1144 */
1145
1146 PERL_STATIC_INLINE U8 *
1147 Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
1148 {
1149     PERL_ARGS_ASSERT_UTF8_HOP_BACK;
1150
1151     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1152      * the bitops (especially ~) can create illegal UTF-8.
1153      * In other words: in Perl UTF-8 is not just for Unicode. */
1154
1155     assert(start <= s);
1156     assert(off <= 0);
1157
1158     while (off++ && s > start) {
1159         s--;
1160         while (UTF8_IS_CONTINUATION(*s) && s > start)
1161             s--;
1162     }
1163
1164     GCC_DIAG_IGNORE(-Wcast-qual);
1165     return (U8 *)s;
1166     GCC_DIAG_RESTORE;
1167 }
1168
1169 /*
1170 =for apidoc utf8_hop_safe
1171
1172 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1173 either forward or backward.
1174
1175 When moving backward it will not move before C<start>.
1176
1177 When moving forward it will not move beyond C<end>.
1178
1179 Will not exceed those limits even if the string is not valid "UTF-8".
1180
1181 =cut
1182 */
1183
1184 PERL_STATIC_INLINE U8 *
1185 Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
1186 {
1187     PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
1188
1189     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1190      * the bitops (especially ~) can create illegal UTF-8.
1191      * In other words: in Perl UTF-8 is not just for Unicode. */
1192
1193     assert(start <= s && s <= end);
1194
1195     if (off >= 0) {
1196         return utf8_hop_forward(s, off, end);
1197     }
1198     else {
1199         return utf8_hop_back(s, off, start);
1200     }
1201 }
1202
1203 /*
1204
1205 =for apidoc is_utf8_valid_partial_char
1206
1207 Returns 0 if the sequence of bytes starting at C<s> and looking no further than
1208 S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
1209 points.  Otherwise, it returns 1 if there exists at least one non-empty
1210 sequence of bytes that when appended to sequence C<s>, starting at position
1211 C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
1212 otherwise returns 0.
1213
1214 In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
1215 point.
1216
1217 This is useful when a fixed-length buffer is being tested for being well-formed
1218 UTF-8, but the final few bytes in it don't comprise a full character; that is,
1219 it is split somewhere in the middle of the final code point's UTF-8
1220 representation.  (Presumably when the buffer is refreshed with the next chunk
1221 of data, the new first bytes will complete the partial code point.)   This
1222 function is used to verify that the final bytes in the current buffer are in
1223 fact the legal beginning of some code point, so that if they aren't, the
1224 failure can be signalled without having to wait for the next read.
1225
1226 =cut
1227 */
1228 #define is_utf8_valid_partial_char(s, e)                                    \
1229                                 is_utf8_valid_partial_char_flags(s, e, 0)
1230
1231 /*
1232
1233 =for apidoc is_utf8_valid_partial_char_flags
1234
1235 Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
1236 or not the input is a valid UTF-8 encoded partial character, but it takes an
1237 extra parameter, C<flags>, which can further restrict which code points are
1238 considered valid.
1239
1240 If C<flags> is 0, this behaves identically to
1241 C<L</is_utf8_valid_partial_char>>.  Otherwise C<flags> can be any combination
1242 of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>.  If
1243 there is any sequence of bytes that can complete the input partial character in
1244 such a way that a non-prohibited character is formed, the function returns
1245 TRUE; otherwise FALSE.  Non character code points cannot be determined based on
1246 partial character input.  But many  of the other possible excluded types can be
1247 determined from just the first one or two bytes.
1248
1249 =cut
1250  */
1251
1252 PERL_STATIC_INLINE bool
1253 S_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
1254 {
1255     PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
1256
1257     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1258                           |UTF8_DISALLOW_PERL_EXTENDED)));
1259
1260     if (s >= e || s + UTF8SKIP(s) <= e) {
1261         return FALSE;
1262     }
1263
1264     return cBOOL(_is_utf8_char_helper(s, e, flags));
1265 }
1266
1267 /*
1268
1269 =for apidoc is_utf8_fixed_width_buf_flags
1270
1271 Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
1272 is entirely valid UTF-8, subject to the restrictions given by C<flags>;
1273 otherwise it returns FALSE.
1274
1275 If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
1276 without restriction.  If the final few bytes of the buffer do not form a
1277 complete code point, this will return TRUE anyway, provided that
1278 C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
1279
1280 If C<flags> in non-zero, it can be any combination of the
1281 C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
1282 same meanings.
1283
1284 This function differs from C<L</is_utf8_string_flags>> only in that the latter
1285 returns FALSE if the final few bytes of the string don't form a complete code
1286 point.
1287
1288 =cut
1289  */
1290 #define is_utf8_fixed_width_buf_flags(s, len, flags)                        \
1291                 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
1292
1293 /*
1294
1295 =for apidoc is_utf8_fixed_width_buf_loc_flags
1296
1297 Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
1298 failure in the C<ep> pointer.  If the function returns TRUE, C<*ep> will point
1299 to the beginning of any partial character at the end of the buffer; if there is
1300 no partial character C<*ep> will contain C<s>+C<len>.
1301
1302 See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
1303
1304 =cut
1305 */
1306
1307 #define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags)               \
1308                 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
1309
1310 /*
1311
1312 =for apidoc is_utf8_fixed_width_buf_loclen_flags
1313
1314 Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
1315 complete, valid characters found in the C<el> pointer.
1316
1317 =cut
1318 */
1319
1320 PERL_STATIC_INLINE bool
1321 S_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
1322                                        STRLEN len,
1323                                        const U8 **ep,
1324                                        STRLEN *el,
1325                                        const U32 flags)
1326 {
1327     const U8 * maybe_partial;
1328
1329     PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
1330
1331     if (! ep) {
1332         ep  = &maybe_partial;
1333     }
1334
1335     /* If it's entirely valid, return that; otherwise see if the only error is
1336      * that the final few bytes are for a partial character */
1337     return    is_utf8_string_loclen_flags(s, len, ep, el, flags)
1338            || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
1339 }
1340
1341 /* ------------------------------- perl.h ----------------------------- */
1342
1343 /*
1344 =head1 Miscellaneous Functions
1345
1346 =for apidoc AiR|bool|is_safe_syscall|const char *pv|STRLEN len|const char *what|const char *op_name
1347
1348 Test that the given C<pv> doesn't contain any internal C<NUL> characters.
1349 If it does, set C<errno> to C<ENOENT>, optionally warn, and return FALSE.
1350
1351 Return TRUE if the name is safe.
1352
1353 Used by the C<IS_SAFE_SYSCALL()> macro.
1354
1355 =cut
1356 */
1357
1358 PERL_STATIC_INLINE bool
1359 S_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name) {
1360     /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
1361      * perl itself uses xce*() functions which accept 8-bit strings.
1362      */
1363
1364     PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
1365
1366     if (len > 1) {
1367         char *null_at;
1368         if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
1369                 SETERRNO(ENOENT, LIB_INVARG);
1370                 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
1371                                    "Invalid \\0 character in %s for %s: %s\\0%s",
1372                                    what, op_name, pv, null_at+1);
1373                 return FALSE;
1374         }
1375     }
1376
1377     return TRUE;
1378 }
1379
1380 /*
1381
1382 Return true if the supplied filename has a newline character
1383 immediately before the first (hopefully only) NUL.
1384
1385 My original look at this incorrectly used the len from SvPV(), but
1386 that's incorrect, since we allow for a NUL in pv[len-1].
1387
1388 So instead, strlen() and work from there.
1389
1390 This allow for the user reading a filename, forgetting to chomp it,
1391 then calling:
1392
1393   open my $foo, "$file\0";
1394
1395 */
1396
1397 #ifdef PERL_CORE
1398
1399 PERL_STATIC_INLINE bool
1400 S_should_warn_nl(const char *pv) {
1401     STRLEN len;
1402
1403     PERL_ARGS_ASSERT_SHOULD_WARN_NL;
1404
1405     len = strlen(pv);
1406
1407     return len > 0 && pv[len-1] == '\n';
1408 }
1409
1410 #endif
1411
1412 /* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
1413
1414 #define MAX_CHARSET_NAME_LENGTH 2
1415
1416 PERL_STATIC_INLINE const char *
1417 get_regex_charset_name(const U32 flags, STRLEN* const lenp)
1418 {
1419     /* Returns a string that corresponds to the name of the regex character set
1420      * given by 'flags', and *lenp is set the length of that string, which
1421      * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
1422
1423     *lenp = 1;
1424     switch (get_regex_charset(flags)) {
1425         case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
1426         case REGEX_LOCALE_CHARSET:  return LOCALE_PAT_MODS;
1427         case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
1428         case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
1429         case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
1430             *lenp = 2;
1431             return ASCII_MORE_RESTRICT_PAT_MODS;
1432     }
1433     /* The NOT_REACHED; hides an assert() which has a rather complex
1434      * definition in perl.h. */
1435     NOT_REACHED; /* NOTREACHED */
1436     return "?";     /* Unknown */
1437 }
1438
1439 /*
1440
1441 Return false if any get magic is on the SV other than taint magic.
1442
1443 */
1444
1445 PERL_STATIC_INLINE bool
1446 S_sv_only_taint_gmagic(SV *sv) {
1447     MAGIC *mg = SvMAGIC(sv);
1448
1449     PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
1450
1451     while (mg) {
1452         if (mg->mg_type != PERL_MAGIC_taint
1453             && !(mg->mg_flags & MGf_GSKIP)
1454             && mg->mg_virtual->svt_get) {
1455             return FALSE;
1456         }
1457         mg = mg->mg_moremagic;
1458     }
1459
1460     return TRUE;
1461 }
1462
1463 /* ------------------ cop.h ------------------------------------------- */
1464
1465
1466 /* Enter a block. Push a new base context and return its address. */
1467
1468 PERL_STATIC_INLINE PERL_CONTEXT *
1469 S_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
1470 {
1471     PERL_CONTEXT * cx;
1472
1473     PERL_ARGS_ASSERT_CX_PUSHBLOCK;
1474
1475     CXINC;
1476     cx = CX_CUR();
1477     cx->cx_type        = type;
1478     cx->blk_gimme      = gimme;
1479     cx->blk_oldsaveix  = saveix;
1480     cx->blk_oldsp      = (I32)(sp - PL_stack_base);
1481     cx->blk_oldcop     = PL_curcop;
1482     cx->blk_oldmarksp  = (I32)(PL_markstack_ptr - PL_markstack);
1483     cx->blk_oldscopesp = PL_scopestack_ix;
1484     cx->blk_oldpm      = PL_curpm;
1485     cx->blk_old_tmpsfloor = PL_tmps_floor;
1486
1487     PL_tmps_floor        = PL_tmps_ix;
1488     CX_DEBUG(cx, "PUSH");
1489     return cx;
1490 }
1491
1492
1493 /* Exit a block (RETURN and LAST). */
1494
1495 PERL_STATIC_INLINE void
1496 S_cx_popblock(pTHX_ PERL_CONTEXT *cx)
1497 {
1498     PERL_ARGS_ASSERT_CX_POPBLOCK;
1499
1500     CX_DEBUG(cx, "POP");
1501     /* these 3 are common to cx_popblock and cx_topblock */
1502     PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
1503     PL_scopestack_ix = cx->blk_oldscopesp;
1504     PL_curpm         = cx->blk_oldpm;
1505
1506     /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
1507      * and leaves a CX entry lying around for repeated use, so
1508      * skip for multicall */                  \
1509     assert(   (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
1510             || PL_savestack_ix == cx->blk_oldsaveix);
1511     PL_curcop     = cx->blk_oldcop;
1512     PL_tmps_floor = cx->blk_old_tmpsfloor;
1513 }
1514
1515 /* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
1516  * Whereas cx_popblock() restores the state to the point just before
1517  * cx_pushblock() was called,  cx_topblock() restores it to the point just
1518  * *after* cx_pushblock() was called. */
1519
1520 PERL_STATIC_INLINE void
1521 S_cx_topblock(pTHX_ PERL_CONTEXT *cx)
1522 {
1523     PERL_ARGS_ASSERT_CX_TOPBLOCK;
1524
1525     CX_DEBUG(cx, "TOP");
1526     /* these 3 are common to cx_popblock and cx_topblock */
1527     PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
1528     PL_scopestack_ix = cx->blk_oldscopesp;
1529     PL_curpm         = cx->blk_oldpm;
1530
1531     PL_stack_sp      = PL_stack_base + cx->blk_oldsp;
1532 }
1533
1534
1535 PERL_STATIC_INLINE void
1536 S_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
1537 {
1538     U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
1539
1540     PERL_ARGS_ASSERT_CX_PUSHSUB;
1541
1542     PERL_DTRACE_PROBE_ENTRY(cv);
1543     cx->blk_sub.cv = cv;
1544     cx->blk_sub.olddepth = CvDEPTH(cv);
1545     cx->blk_sub.prevcomppad = PL_comppad;
1546     cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
1547     cx->blk_sub.retop = retop;
1548     SvREFCNT_inc_simple_void_NN(cv);
1549     cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
1550 }
1551
1552
1553 /* subsets of cx_popsub() */
1554
1555 PERL_STATIC_INLINE void
1556 S_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
1557 {
1558     CV *cv;
1559
1560     PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
1561     assert(CxTYPE(cx) == CXt_SUB);
1562
1563     PL_comppad = cx->blk_sub.prevcomppad;
1564     PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
1565     cv = cx->blk_sub.cv;
1566     CvDEPTH(cv) = cx->blk_sub.olddepth;
1567     cx->blk_sub.cv = NULL;
1568     SvREFCNT_dec(cv);
1569 }
1570
1571
1572 /* handle the @_ part of leaving a sub */
1573
1574 PERL_STATIC_INLINE void
1575 S_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
1576 {
1577     AV *av;
1578
1579     PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
1580     assert(CxTYPE(cx) == CXt_SUB);
1581     assert(AvARRAY(MUTABLE_AV(
1582         PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
1583                 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
1584
1585     CX_POP_SAVEARRAY(cx);
1586     av = MUTABLE_AV(PAD_SVl(0));
1587     if (UNLIKELY(AvREAL(av)))
1588         /* abandon @_ if it got reified */
1589         clear_defarray(av, 0);
1590     else {
1591         CLEAR_ARGARRAY(av);
1592     }
1593 }
1594
1595
1596 PERL_STATIC_INLINE void
1597 S_cx_popsub(pTHX_ PERL_CONTEXT *cx)
1598 {
1599     PERL_ARGS_ASSERT_CX_POPSUB;
1600     assert(CxTYPE(cx) == CXt_SUB);
1601
1602     PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
1603
1604     if (CxHASARGS(cx))
1605         cx_popsub_args(cx);
1606     cx_popsub_common(cx);
1607 }
1608
1609
1610 PERL_STATIC_INLINE void
1611 S_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
1612 {
1613     PERL_ARGS_ASSERT_CX_PUSHFORMAT;
1614
1615     cx->blk_format.cv          = cv;
1616     cx->blk_format.retop       = retop;
1617     cx->blk_format.gv          = gv;
1618     cx->blk_format.dfoutgv     = PL_defoutgv;
1619     cx->blk_format.prevcomppad = PL_comppad;
1620     cx->blk_u16                = 0;
1621
1622     SvREFCNT_inc_simple_void_NN(cv);
1623     CvDEPTH(cv)++;
1624     SvREFCNT_inc_void(cx->blk_format.dfoutgv);
1625 }
1626
1627
1628 PERL_STATIC_INLINE void
1629 S_cx_popformat(pTHX_ PERL_CONTEXT *cx)
1630 {
1631     CV *cv;
1632     GV *dfout;
1633
1634     PERL_ARGS_ASSERT_CX_POPFORMAT;
1635     assert(CxTYPE(cx) == CXt_FORMAT);
1636
1637     dfout = cx->blk_format.dfoutgv;
1638     setdefout(dfout);
1639     cx->blk_format.dfoutgv = NULL;
1640     SvREFCNT_dec_NN(dfout);
1641
1642     PL_comppad = cx->blk_format.prevcomppad;
1643     PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
1644     cv = cx->blk_format.cv;
1645     cx->blk_format.cv = NULL;
1646     --CvDEPTH(cv);
1647     SvREFCNT_dec_NN(cv);
1648 }
1649
1650
1651 PERL_STATIC_INLINE void
1652 S_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
1653 {
1654     PERL_ARGS_ASSERT_CX_PUSHEVAL;
1655
1656     cx->blk_eval.retop         = retop;
1657     cx->blk_eval.old_namesv    = namesv;
1658     cx->blk_eval.old_eval_root = PL_eval_root;
1659     cx->blk_eval.cur_text      = PL_parser ? PL_parser->linestr : NULL;
1660     cx->blk_eval.cv            = NULL; /* later set by doeval_compile() */
1661     cx->blk_eval.cur_top_env   = PL_top_env;
1662
1663     assert(!(PL_in_eval     & ~ 0x3F));
1664     assert(!(PL_op->op_type & ~0x1FF));
1665     cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
1666 }
1667
1668
1669 PERL_STATIC_INLINE void
1670 S_cx_popeval(pTHX_ PERL_CONTEXT *cx)
1671 {
1672     SV *sv;
1673
1674     PERL_ARGS_ASSERT_CX_POPEVAL;
1675     assert(CxTYPE(cx) == CXt_EVAL);
1676
1677     PL_in_eval = CxOLD_IN_EVAL(cx);
1678     assert(!(PL_in_eval & 0xc0));
1679     PL_eval_root = cx->blk_eval.old_eval_root;
1680     sv = cx->blk_eval.cur_text;
1681     if (sv && CxEVAL_TXT_REFCNTED(cx)) {
1682         cx->blk_eval.cur_text = NULL;
1683         SvREFCNT_dec_NN(sv);
1684     }
1685
1686     sv = cx->blk_eval.old_namesv;
1687     if (sv) {
1688         cx->blk_eval.old_namesv = NULL;
1689         SvREFCNT_dec_NN(sv);
1690     }
1691 }
1692
1693
1694 /* push a plain loop, i.e.
1695  *     { block }
1696  *     while (cond) { block }
1697  *     for (init;cond;continue) { block }
1698  * This loop can be last/redo'ed etc.
1699  */
1700
1701 PERL_STATIC_INLINE void
1702 S_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
1703 {
1704     PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
1705     cx->blk_loop.my_op = cLOOP;
1706 }
1707
1708
1709 /* push a true for loop, i.e.
1710  *     for var (list) { block }
1711  */
1712
1713 PERL_STATIC_INLINE void
1714 S_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
1715 {
1716     PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
1717
1718     /* this one line is common with cx_pushloop_plain */
1719     cx->blk_loop.my_op = cLOOP;
1720
1721     cx->blk_loop.itervar_u.svp = (SV**)itervarp;
1722     cx->blk_loop.itersave      = itersave;
1723 #ifdef USE_ITHREADS
1724     cx->blk_loop.oldcomppad = PL_comppad;
1725 #endif
1726 }
1727
1728
1729 /* pop all loop types, including plain */
1730
1731 PERL_STATIC_INLINE void
1732 S_cx_poploop(pTHX_ PERL_CONTEXT *cx)
1733 {
1734     PERL_ARGS_ASSERT_CX_POPLOOP;
1735
1736     assert(CxTYPE_is_LOOP(cx));
1737     if (  CxTYPE(cx) == CXt_LOOP_ARY
1738        || CxTYPE(cx) == CXt_LOOP_LAZYSV)
1739     {
1740         /* Free ary or cur. This assumes that state_u.ary.ary
1741          * aligns with state_u.lazysv.cur. See cx_dup() */
1742         SV *sv = cx->blk_loop.state_u.lazysv.cur;
1743         cx->blk_loop.state_u.lazysv.cur = NULL;
1744         SvREFCNT_dec_NN(sv);
1745         if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
1746             sv = cx->blk_loop.state_u.lazysv.end;
1747             cx->blk_loop.state_u.lazysv.end = NULL;
1748             SvREFCNT_dec_NN(sv);
1749         }
1750     }
1751     if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
1752         SV *cursv;
1753         SV **svp = (cx)->blk_loop.itervar_u.svp;
1754         if ((cx->cx_type & CXp_FOR_GV))
1755             svp = &GvSV((GV*)svp);
1756         cursv = *svp;
1757         *svp = cx->blk_loop.itersave;
1758         cx->blk_loop.itersave = NULL;
1759         SvREFCNT_dec(cursv);
1760     }
1761 }
1762
1763
1764 PERL_STATIC_INLINE void
1765 S_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
1766 {
1767     PERL_ARGS_ASSERT_CX_PUSHWHEN;
1768
1769     cx->blk_givwhen.leave_op = cLOGOP->op_other;
1770 }
1771
1772
1773 PERL_STATIC_INLINE void
1774 S_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
1775 {
1776     PERL_ARGS_ASSERT_CX_POPWHEN;
1777     assert(CxTYPE(cx) == CXt_WHEN);
1778
1779     PERL_UNUSED_ARG(cx);
1780     PERL_UNUSED_CONTEXT;
1781     /* currently NOOP */
1782 }
1783
1784
1785 PERL_STATIC_INLINE void
1786 S_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
1787 {
1788     PERL_ARGS_ASSERT_CX_PUSHGIVEN;
1789
1790     cx->blk_givwhen.leave_op = cLOGOP->op_other;
1791     cx->blk_givwhen.defsv_save = orig_defsv;
1792 }
1793
1794
1795 PERL_STATIC_INLINE void
1796 S_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
1797 {
1798     SV *sv;
1799
1800     PERL_ARGS_ASSERT_CX_POPGIVEN;
1801     assert(CxTYPE(cx) == CXt_GIVEN);
1802
1803     sv = GvSV(PL_defgv);
1804     GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
1805     cx->blk_givwhen.defsv_save = NULL;
1806     SvREFCNT_dec(sv);
1807 }
1808
1809 /* ------------------ util.h ------------------------------------------- */
1810
1811 /*
1812 =head1 Miscellaneous Functions
1813
1814 =for apidoc foldEQ
1815
1816 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
1817 same
1818 case-insensitively; false otherwise.  Uppercase and lowercase ASCII range bytes
1819 match themselves and their opposite case counterparts.  Non-cased and non-ASCII
1820 range bytes match only themselves.
1821
1822 =cut
1823 */
1824
1825 PERL_STATIC_INLINE I32
1826 Perl_foldEQ(const char *s1, const char *s2, I32 len)
1827 {
1828     const U8 *a = (const U8 *)s1;
1829     const U8 *b = (const U8 *)s2;
1830
1831     PERL_ARGS_ASSERT_FOLDEQ;
1832
1833     assert(len >= 0);
1834
1835     while (len--) {
1836         if (*a != *b && *a != PL_fold[*b])
1837             return 0;
1838         a++,b++;
1839     }
1840     return 1;
1841 }
1842
1843 PERL_STATIC_INLINE I32
1844 Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len)
1845 {
1846     /* Compare non-utf8 using Unicode (Latin1) semantics.  Does not work on
1847      * MICRO_SIGN, LATIN_SMALL_LETTER_SHARP_S, nor
1848      * LATIN_SMALL_LETTER_Y_WITH_DIAERESIS, and does not check for these.  Nor
1849      * does it check that the strings each have at least 'len' characters */
1850
1851     const U8 *a = (const U8 *)s1;
1852     const U8 *b = (const U8 *)s2;
1853
1854     PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
1855
1856     assert(len >= 0);
1857
1858     while (len--) {
1859         if (*a != *b && *a != PL_fold_latin1[*b]) {
1860             return 0;
1861         }
1862         a++, b++;
1863     }
1864     return 1;
1865 }
1866
1867 /*
1868 =for apidoc foldEQ_locale
1869
1870 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
1871 same case-insensitively in the current locale; false otherwise.
1872
1873 =cut
1874 */
1875
1876 PERL_STATIC_INLINE I32
1877 Perl_foldEQ_locale(const char *s1, const char *s2, I32 len)
1878 {
1879     dVAR;
1880     const U8 *a = (const U8 *)s1;
1881     const U8 *b = (const U8 *)s2;
1882
1883     PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
1884
1885     assert(len >= 0);
1886
1887     while (len--) {
1888         if (*a != *b && *a != PL_fold_locale[*b])
1889             return 0;
1890         a++,b++;
1891     }
1892     return 1;
1893 }
1894
1895 #if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
1896
1897 PERL_STATIC_INLINE void *
1898 S_my_memrchr(const char * s, const char c, const STRLEN len)
1899 {
1900     /* memrchr(), since many platforms lack it */
1901
1902     const char * t = s + len - 1;
1903
1904     PERL_ARGS_ASSERT_MY_MEMRCHR;
1905
1906     while (t >= s) {
1907         if (*t == c) {
1908             return (void *) t;
1909         }
1910         t--;
1911     }
1912
1913     return NULL;
1914 }
1915
1916 #endif
1917
1918 /*
1919  * ex: set ts=8 sts=4 sw=4 et:
1920  */