* header files, because they depend on proto.h (included after most other
* headers) or struct definitions.
*
+ * Note also perlstatic.h for functions that can't or shouldn't be inlined, but
+ * whose details should be exposed to the compiler, for such things as tail
+ * call optimization.
+ *
* Each section names the header file that the functions "belong" to.
*/
assert(key > -1);
if ( (key > AvFILLp(av)) || !AvARRAY(av)[key]) {
- return lval ? av_store_simple(av,key,newSV(0)) : NULL;
+ return lval ? av_store_simple(av,key,newSV_type(SVt_NULL)) : NULL;
} else {
return &AvARRAY(av)[key];
}
}
+/*
+=for apidoc av_push_simple
+
+This is a cut-down version of av_push that assumes that the array is very
+straightforward - no magic, not readonly, and AvREAL - and that C<key> is
+not less than -1. This function MUST NOT be used in situations where any
+of those assumptions may not hold.
+
+Pushes an SV (transferring control of one reference count) onto the end of the
+array. The array will grow automatically to accommodate the addition.
+
+Perl equivalent: C<push @myarray, $val;>.
+
+=cut
+*/
+
+PERL_STATIC_INLINE void
+Perl_av_push_simple(pTHX_ AV *av, SV *val)
+{
+ PERL_ARGS_ASSERT_AV_PUSH_SIMPLE;
+ assert(SvTYPE(av) == SVt_PVAV);
+ assert(!SvMAGICAL(av));
+ assert(!SvREADONLY(av));
+ assert(AvREAL(av));
+ assert(AvFILLp(av) > -2);
+
+ (void)av_store_simple(av,AvFILLp(av)+1,val);
+}
+
+/*
+=for apidoc av_new_alloc
+
+This implements L<perlapi/C<newAV_alloc_x>>
+and L<perlapi/C<newAV_alloc_xz>>, which are the public API for this
+functionality.
+
+Creates a new AV and allocates its SV* array.
+
+This is similar to, but more efficient than doing:
+
+ AV *av = newAV();
+ av_extend(av, key);
+
+The size parameter is used to pre-allocate a SV* array large enough to
+hold at least elements C<0..(size-1)>. C<size> must be at least 1.
+
+The C<zeroflag> parameter controls whether or not the array is NULL
+initialized.
+
+=cut
+*/
+
+PERL_STATIC_INLINE AV *
+Perl_av_new_alloc(pTHX_ SSize_t size, bool zeroflag)
+{
+ AV * const av = newAV();
+ SV** ary;
+ PERL_ARGS_ASSERT_AV_NEW_ALLOC;
+ assert(size > 0);
+
+ Newx(ary, size, SV*); /* Newx performs the memwrap check */
+ AvALLOC(av) = ary;
+ AvARRAY(av) = ary;
+ AvMAX(av) = size - 1;
+
+ if (zeroflag)
+ Zero(ary, size, SV*);
+
+ return av;
+}
+
+
/* ------------------------------- cv.h ------------------------------- */
/*
: ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
}
+/*
+=for apidoc CvDEPTH
+Returns the recursion level of the CV C<sv>. Hence >= 2 indicates we are in a
+recursive call.
+
+=cut
+*/
PERL_STATIC_INLINE I32 *
Perl_CvDEPTH(const CV * const sv)
{
/* ----------------------------- regexp.h ----------------------------- */
+/* PVLVs need to act as a superset of all scalar types - they are basically
+ * PVMGs with a few extra fields.
+ * REGEXPs are first class scalars, but have many fields that can't be copied
+ * into a PVLV body.
+ *
+ * Hence we take a different approach - instead of a copy, PVLVs store a pointer
+ * back to the original body. To avoid increasing the size of PVLVs just for the
+ * rare case of REGEXP assignment, this pointer is stored in the memory usually
+ * used for SvLEN(). Hence the check for SVt_PVLV below, and the ? : ternary to
+ * read the pointer from the two possible locations. The macro SvLEN() wraps the
+ * access to the union's member xpvlenu_len, but there is no equivalent macro
+ * for wrapping the union's member xpvlenu_rx, hence the direct reference here.
+ *
+ * See commit df6b4bd56551f2d3 for more details. */
+
PERL_STATIC_INLINE struct regexp *
Perl_ReANY(const REGEXP * const re)
{
: (struct regexp *)p;
}
-/* ------------------------------- sv.h ------------------------------- */
-
-PERL_STATIC_INLINE bool
-Perl_SvTRUE(pTHX_ SV *sv)
-{
- PERL_ARGS_ASSERT_SVTRUE;
-
- if (UNLIKELY(sv == NULL))
- return FALSE;
- SvGETMAGIC(sv);
- return SvTRUE_nomg_NN(sv);
-}
-
-PERL_STATIC_INLINE bool
-Perl_SvTRUE_nomg(pTHX_ SV *sv)
-{
- PERL_ARGS_ASSERT_SVTRUE_NOMG;
-
- if (UNLIKELY(sv == NULL))
- return FALSE;
- return SvTRUE_nomg_NN(sv);
-}
-
-PERL_STATIC_INLINE bool
-Perl_SvTRUE_NN(pTHX_ SV *sv)
-{
- PERL_ARGS_ASSERT_SVTRUE_NN;
-
- SvGETMAGIC(sv);
- return SvTRUE_nomg_NN(sv);
-}
-
-PERL_STATIC_INLINE bool
-Perl_SvTRUE_common(pTHX_ SV * sv, const bool sv_2bool_is_fallback)
-{
- PERL_ARGS_ASSERT_SVTRUE_COMMON;
-
- if (UNLIKELY(SvIMMORTAL_INTERP(sv)))
- return SvIMMORTAL_TRUE(sv);
-
- if (! SvOK(sv))
- return FALSE;
-
- if (SvPOK(sv))
- return SvPVXtrue(sv);
-
- if (SvIOK(sv))
- return SvIVX(sv) != 0; /* casts to bool */
-
- if (SvROK(sv) && !(SvOBJECT(SvRV(sv)) && HvAMAGIC(SvSTASH(SvRV(sv)))))
- return TRUE;
-
- if (sv_2bool_is_fallback)
- return sv_2bool_nomg(sv);
-
- return isGV_with_GP(sv);
-}
-
-
-PERL_STATIC_INLINE SV *
-Perl_SvREFCNT_inc(SV *sv)
-{
- if (LIKELY(sv != NULL))
- SvREFCNT(sv)++;
- return sv;
-}
-PERL_STATIC_INLINE SV *
-Perl_SvREFCNT_inc_NN(SV *sv)
-{
- PERL_ARGS_ASSERT_SVREFCNT_INC_NN;
-
- SvREFCNT(sv)++;
- return sv;
-}
-PERL_STATIC_INLINE void
-Perl_SvREFCNT_inc_void(SV *sv)
-{
- if (LIKELY(sv != NULL))
- SvREFCNT(sv)++;
-}
-PERL_STATIC_INLINE void
-Perl_SvREFCNT_dec(pTHX_ SV *sv)
-{
- if (LIKELY(sv != NULL)) {
- U32 rc = SvREFCNT(sv);
- if (LIKELY(rc > 1))
- SvREFCNT(sv) = rc - 1;
- else
- Perl_sv_free2(aTHX_ sv, rc);
- }
-}
-
-PERL_STATIC_INLINE void
-Perl_SvREFCNT_dec_NN(pTHX_ SV *sv)
-{
- U32 rc = SvREFCNT(sv);
-
- PERL_ARGS_ASSERT_SVREFCNT_DEC_NN;
-
- if (LIKELY(rc > 1))
- SvREFCNT(sv) = rc - 1;
- else
- Perl_sv_free2(aTHX_ sv, rc);
-}
-
-PERL_STATIC_INLINE void
-Perl_SvAMAGIC_on(SV *sv)
-{
- PERL_ARGS_ASSERT_SVAMAGIC_ON;
- assert(SvROK(sv));
-
- if (SvOBJECT(SvRV(sv))) HvAMAGIC_on(SvSTASH(SvRV(sv)));
-}
-PERL_STATIC_INLINE void
-Perl_SvAMAGIC_off(SV *sv)
-{
- PERL_ARGS_ASSERT_SVAMAGIC_OFF;
-
- if (SvROK(sv) && SvOBJECT(SvRV(sv)))
- HvAMAGIC_off(SvSTASH(SvRV(sv)));
-}
-
-PERL_STATIC_INLINE U32
-Perl_SvPADSTALE_on(SV *sv)
-{
- assert(!(SvFLAGS(sv) & SVs_PADTMP));
- return SvFLAGS(sv) |= SVs_PADSTALE;
-}
-PERL_STATIC_INLINE U32
-Perl_SvPADSTALE_off(SV *sv)
-{
- assert(!(SvFLAGS(sv) & SVs_PADTMP));
- return SvFLAGS(sv) &= ~SVs_PADSTALE;
-}
-#if defined(PERL_CORE) || defined (PERL_EXT)
-PERL_STATIC_INLINE STRLEN
-S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp)
-{
- PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B;
- if (SvGAMAGIC(sv)) {
- U8 *hopped = utf8_hop((U8 *)pv, pos);
- if (lenp) *lenp = (STRLEN)(utf8_hop(hopped, *lenp) - hopped);
- return (STRLEN)(hopped - (U8 *)pv);
- }
- return sv_pos_u2b_flags(sv,pos,lenp,SV_CONST_RETURN);
-}
-#endif
-
/* ------------------------------- utf8.h ------------------------------- */
/*
# endif
#endif
-#if defined(_MSC_VER) && _MSC_VER >= 1400
+#if defined(_MSC_VER)
# include <intrin.h>
# pragma intrinsic(_BitScanForward)
# pragma intrinsic(_BitScanReverse)
return (unsigned) PERL_CTZ_64(word);
-# elif U64SIZE == 8 && defined(_MSC_VER) && _MSC_VER >= 1400
+# elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER)
# define PERL_HAS_FAST_GET_LSB_POS64
{
return (unsigned) PERL_CTZ_32(word);
-#elif U32SIZE == 4 && defined(_MSC_VER) && _MSC_VER >= 1400
+#elif U32SIZE == 4 && defined(_MSC_VER)
# define PERL_HAS_FAST_GET_LSB_POS32
{
return (unsigned) LZC_TO_MSBIT_POS_(U64, PERL_CLZ_64(word));
-# elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER) && _MSC_VER >= 1400
+# elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER)
# define PERL_HAS_FAST_GET_MSB_POS64
{
return (unsigned) LZC_TO_MSBIT_POS_(U32, PERL_CLZ_32(word));
-#elif U32SIZE == 4 && defined(_MSC_VER) && _MSC_VER >= 1400
+#elif U32SIZE == 4 && defined(_MSC_VER)
# define PERL_HAS_FAST_GET_MSB_POS32
{
#endif
-#ifndef PERL_IN_REGEXEC_C /* Keep these around for that file */
+ /* Keep these around for these files */
+#if ! defined(PERL_IN_REGEXEC_C) && ! defined(PERL_IN_UTF8_C)
# undef PERL_WORDSIZE
# undef PERL_COUNT_MULTIPLIER
# undef PERL_WORD_BOUNDARY_MASK
}
}
+/* The perl core arranges to never call the DFA below without there being at
+ * least one byte available to look at. This allows the DFA to use a do {}
+ * while loop which means that calling it with a UTF-8 invariant has a single
+ * conditional, same as the calling code checking for invariance ahead of time.
+ * And having the calling code remove that conditional speeds up by that
+ * conditional, the case where it wasn't invariant. So there's no reason to
+ * check before caling this.
+ *
+ * But we don't know this for non-core calls, so have to retain the check for
+ * them. */
+#ifdef PERL_CORE
+# define PERL_NON_CORE_CHECK_EMPTY(s,e) assert((e) > (s))
+#else
+# define PERL_NON_CORE_CHECK_EMPTY(s,e) if ((e) <= (s)) return FALSE
+#endif
+
+/*
+ * DFA for checking input is valid UTF-8 syntax.
+ *
+ * This uses adaptations of the table and algorithm given in
+ * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
+ * documentation of the original version. A copyright notice for the original
+ * version is given at the beginning of this file. The Perl adapations are
+ * documented at the definition of PL_extended_utf8_dfa_tab[].
+ *
+ * This dfa is fast. There are three exit conditions:
+ * 1) a well-formed code point, acceptable to the table
+ * 2) the beginning bytes of an incomplete character, whose completion might
+ * or might not be acceptable
+ * 3) unacceptable to the table. Some of the adaptations have certain,
+ * hopefully less likely to occur, legal inputs be unacceptable to the
+ * table, so these must be sorted out afterwards.
+ *
+ * This macro is a complete implementation of the code executing the DFA. It
+ * is passed the input sequence bounds and the table to use, and what to do
+ * for each of the exit conditions. There are three canned actions, likely to
+ * be the ones you want:
+ * DFA_RETURN_SUCCESS_
+ * DFA_RETURN_FAILURE_
+ * DFA_GOTO_TEASE_APART_FF_
+ *
+ * You pass a parameter giving the action to take for each of the three
+ * possible exit conditions:
+ *
+ * 'accept_action' This is executed when the DFA accepts the input.
+ * DFA_RETURN_SUCCESS_ is the most likely candidate.
+ * 'reject_action' This is executed when the DFA rejects the input.
+ * DFA_RETURN_FAILURE_ is a candidate, or 'goto label' where
+ * you have written code to distinguish the rejecting state
+ * results. Because it happens in several places, and
+ * involves #ifdefs, the special action
+ * DFA_GOTO_TEASE_APART_FF_ is what you want with
+ * PL_extended_utf8_dfa_tab. On platforms without
+ * EXTRA_LONG_UTF8, there is no need to tease anything apart,
+ * so this evaluates to DFA_RETURN_FAILURE_; otherwise you
+ * need to have a label 'tease_apart_FF' that it will transfer
+ * to.
+ * 'incomplete_char_action' This is executed when the DFA ran off the end
+ * before accepting or rejecting the input.
+ * DFA_RETURN_FAILURE_ is the likely action, but you could
+ * have a 'goto', or NOOP. In the latter case the DFA drops
+ * off the end, and you place your code to handle this case
+ * immediately after it.
+ */
+
+#define DFA_RETURN_SUCCESS_ return s - s0
+#define DFA_RETURN_FAILURE_ return 0
+#ifdef HAS_EXTRA_LONG_UTF8
+# define DFA_TEASE_APART_FF_ goto tease_apart_FF
+#else
+# define DFA_TEASE_APART_FF_ DFA_RETURN_FAILURE_
+#endif
+
+#define PERL_IS_UTF8_CHAR_DFA(s0, e, dfa_tab, \
+ accept_action, \
+ reject_action, \
+ incomplete_char_action) \
+ STMT_START { \
+ const U8 * s = s0; \
+ const U8 * e_ = e; \
+ UV state = 0; \
+ \
+ PERL_NON_CORE_CHECK_EMPTY(s, e_); \
+ \
+ do { \
+ state = dfa_tab[256 + state + dfa_tab[*s]]; \
+ s++; \
+ \
+ if (state == 0) { /* Accepting state */ \
+ accept_action; \
+ } \
+ \
+ if (UNLIKELY(state == 1)) { /* Rejecting state */ \
+ reject_action; \
+ } \
+ } while (s < e_); \
+ \
+ /* Here, dropped out of loop before end-of-char */ \
+ incomplete_char_action; \
+ } STMT_END
+
+
/*
=for apidoc isUTF8_CHAR
documentation of the original version. A copyright notice for the original
version is given at the beginning of this file. The Perl adapation is
documented at the definition of PL_extended_utf8_dfa_tab[].
-
*/
PERL_STATIC_INLINE Size_t
Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
{
- const U8 * s = s0;
- UV state = 0;
-
PERL_ARGS_ASSERT_ISUTF8_CHAR;
- /* This dfa is fast. If it accepts the input, it was for a well-formed,
- * code point, which can be returned immediately. Otherwise, it is either
- * malformed, or for the start byte FF which the dfa doesn't handle (except
- * on 32-bit ASCII platforms where it trivially is an error). Call a
- * helper function for the other platforms. */
-
- while (s < e) {
- state = PL_extended_utf8_dfa_tab[ 256
- + state
- + PL_extended_utf8_dfa_tab[*s]];
- s++;
-
- if (state == 0) {
- return s - s0;
- }
+ PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
+ DFA_RETURN_SUCCESS_,
+ DFA_TEASE_APART_FF_,
+ DFA_RETURN_FAILURE_);
- if (UNLIKELY(state == 1)) {
- break;
- }
- }
+ /* Here, we didn't return success, but dropped out of the loop. In the
+ * case of PL_extended_utf8_dfa_tab, this means the input is either
+ * malformed, or the start byte was FF on a platform that the dfa doesn't
+ * handle FF's. Call a helper function. */
+
+#ifdef HAS_EXTRA_LONG_UTF8
-#if defined(UV_IS_QUAD) || defined(EBCDIC)
+ tease_apart_FF:
- if (e - s0 >= UTF8_MAXBYTES && NATIVE_UTF8_TO_I8(*s0) == 0xFF) {
- return is_utf8_char_helper(s0, e, 0);
+ /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
+ * either malformed, or was for the largest possible start byte, which we
+ * now check, not inline */
+ if (*s0 != I8_TO_NATIVE_UTF8(0xFF)) {
+ return 0;
}
+ return is_utf8_FF_helper_(s0, e,
+ FALSE /* require full, not partial char */
+ );
#endif
- return 0;
}
/*
PERL_STATIC_INLINE Size_t
Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
{
- const U8 * s = s0;
- UV state = 0;
-
PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
- while (s < e) {
- state = PL_strict_utf8_dfa_tab[ 256
- + state
- + PL_strict_utf8_dfa_tab[*s]];
- s++;
-
- if (state == 0) {
- return s - s0;
- }
+ PERL_IS_UTF8_CHAR_DFA(s0, e, PL_strict_utf8_dfa_tab,
+ DFA_RETURN_SUCCESS_,
+ goto check_hanguls,
+ DFA_RETURN_FAILURE_);
+ check_hanguls:
- if (UNLIKELY(state == 1)) {
- break;
- }
- }
+ /* Here, we didn't return success, but dropped out of the loop. In the
+ * case of PL_strict_utf8_dfa_tab, this means the input is either
+ * malformed, or was for certain Hanguls; handle them specially */
/* The dfa above drops out for incomplete or illegal inputs, and certain
* legal Hanguls; check and return accordingly */
PERL_STATIC_INLINE Size_t
Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
{
- const U8 * s = s0;
- UV state = 0;
-
PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
- while (s < e) {
- state = PL_c9_utf8_dfa_tab[256 + state + PL_c9_utf8_dfa_tab[*s]];
- s++;
-
- if (state == 0) {
- return s - s0;
- }
-
- if (UNLIKELY(state == 1)) {
- break;
- }
- }
-
- return 0;
+ PERL_IS_UTF8_CHAR_DFA(s0, e, PL_c9_utf8_dfa_tab,
+ DFA_RETURN_SUCCESS_,
+ DFA_RETURN_FAILURE_,
+ DFA_RETURN_FAILURE_);
}
/*
=for apidoc utf8_hop
Return the UTF-8 pointer C<s> displaced by C<off> characters, either
-forward or backward.
+forward (if C<off> is positive) or backward (if negative). C<s> does not need
+to be pointing to the starting byte of a character. If it isn't, one count of
+C<off> will be used up to get to the start of the next character for forward
+hops, and to the start of the current character for negative ones.
-WARNING: do not use the following unless you *know* C<off> is within
-the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
-on the first byte of character or just after the last byte of a character.
+WARNING: Prefer L</utf8_hop_safe> to this one.
+
+Do NOT use this function unless you B<know> C<off> is within
+the UTF-8 data pointed to by C<s> B<and> that on entry C<s> is aligned
+on the first byte of a character or just after the last byte of a character.
=cut
*/
PERL_ARGS_ASSERT_UTF8_HOP;
/* Note: cannot use UTF8_IS_...() too eagerly here since e.g
- * the bitops (especially ~) can create illegal UTF-8.
+ * the XXX bitops (especially ~) can create illegal UTF-8.
* In other words: in Perl UTF-8 is not just for Unicode. */
- if (off >= 0) {
+ if (off > 0) {
+
+ /* Get to next non-continuation byte */
+ if (UNLIKELY(UTF8_IS_CONTINUATION(*s))) {
+ do {
+ s++;
+ }
+ while (UTF8_IS_CONTINUATION(*s));
+ off--;
+ }
+
while (off--)
s += UTF8SKIP(s);
}
s--;
}
}
+
GCC_DIAG_IGNORE(-Wcast-qual)
return (U8 *)s;
GCC_DIAG_RESTORE
=for apidoc utf8_hop_forward
Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
-forward.
+forward. C<s> does not need to be pointing to the starting byte of a
+character. If it isn't, one count of C<off> will be used up to get to the
+start of the next character.
C<off> must be non-negative.
assert(s <= end);
assert(off >= 0);
+ if (off && UNLIKELY(UTF8_IS_CONTINUATION(*s))) {
+ /* Get to next non-continuation byte */
+ do {
+ s++;
+ }
+ while (UTF8_IS_CONTINUATION(*s));
+ off--;
+ }
+
while (off--) {
STRLEN skip = UTF8SKIP(s);
if ((STRLEN)(end - s) <= skip) {
=for apidoc utf8_hop_back
Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
-backward.
+backward. C<s> does not need to be pointing to the starting byte of a
+character. If it isn't, one count of C<off> will be used up to get to that
+start.
C<off> must be non-positive.
assert(start <= s);
assert(off <= 0);
+ /* Note: if we know that the input is well-formed, we can do per-word
+ * hop-back. Commit d6ad3b72778369a84a215b498d8d60d5b03aa1af implemented
+ * that. But it was reverted because doing per-word has some
+ * start-up/tear-down overhead, so only makes sense if the distance to be
+ * moved is large, and core perl doesn't currently move more than a few
+ * characters at a time. You can reinstate it if it does become
+ * advantageous. */
while (off++ && s > start) {
do {
s--;
=for apidoc utf8_hop_safe
Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
-either forward or backward.
+either forward or backward. C<s> does not need to be pointing to the starting
+byte of a character. If it isn't, one count of C<off> will be used up to get
+to the start of the next character for forward hops, and to the start of the
+current character for negative ones.
When moving backward it will not move before C<start>.
/*
+=for apidoc isUTF8_CHAR_flags
+
+Evaluates to non-zero if the first few bytes of the string starting at C<s> and
+looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
+that represents some code point, subject to the restrictions given by C<flags>;
+otherwise it evaluates to 0. If non-zero, the value gives how many bytes
+starting at C<s> comprise the code point's representation. Any bytes remaining
+before C<e>, but beyond the ones needed to form the first code point in C<s>,
+are not examined.
+
+If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>;
+if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
+as C<L</isSTRICT_UTF8_CHAR>>;
+and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives
+the same results as C<L</isC9_STRICT_UTF8_CHAR>>.
+Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags
+understood by C<L</utf8n_to_uvchr>>, with the same meanings.
+
+The three alternative macros are for the most commonly needed validations; they
+are likely to run somewhat faster than this more general one, as they can be
+inlined into your code.
+
+Use L</is_utf8_string_flags>, L</is_utf8_string_loc_flags>, and
+L</is_utf8_string_loclen_flags> to check entire strings.
+
+=cut
+*/
+
+PERL_STATIC_INLINE STRLEN
+Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags)
+{
+ PERL_ARGS_ASSERT_ISUTF8_CHAR_FLAGS;
+ assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+ |UTF8_DISALLOW_PERL_EXTENDED)));
+
+ PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
+ goto check_success,
+ DFA_TEASE_APART_FF_,
+ DFA_RETURN_FAILURE_);
+
+ check_success:
+
+ return is_utf8_char_helper_(s0, e, flags);
+
+#ifdef HAS_EXTRA_LONG_UTF8
+
+ tease_apart_FF:
+
+ /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
+ * either malformed, or was for the largest possible start byte, which
+ * indicates perl extended UTF-8, well above the Unicode maximum */
+ if ( *s0 != I8_TO_NATIVE_UTF8(0xFF)
+ || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
+ {
+ return 0;
+ }
+
+ /* Otherwise examine the sequence not inline */
+ return is_utf8_FF_helper_(s0, e,
+ FALSE /* require full, not partial char */
+ );
+#endif
+
+}
+
+/*
+
=for apidoc is_utf8_valid_partial_char
Returns 0 if the sequence of bytes starting at C<s> and looking no further than
*/
PERL_STATIC_INLINE bool
-Perl_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
+Perl_is_utf8_valid_partial_char_flags(const U8 * const s0, const U8 * const e, const U32 flags)
{
PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
-
assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
|UTF8_DISALLOW_PERL_EXTENDED)));
- if (s >= e || s + UTF8SKIP(s) <= e) {
- return FALSE;
+ PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
+ DFA_RETURN_FAILURE_,
+ DFA_TEASE_APART_FF_,
+ NOOP);
+
+ /* The NOOP above causes the DFA to drop down here iff the input was a
+ * partial character. flags=0 => can return TRUE immediately; otherwise we
+ * need to check (not inline) if the partial character is the beginning of
+ * a disallowed one */
+ if (flags == 0) {
+ return TRUE;
}
- return cBOOL(is_utf8_char_helper(s, e, flags));
+ return cBOOL(is_utf8_char_helper_(s0, e, flags));
+
+#ifdef HAS_EXTRA_LONG_UTF8
+
+ tease_apart_FF:
+
+ /* Getting here means the input is either malformed, or, in the case of
+ * PL_extended_utf8_dfa_tab, was for the largest possible start byte. The
+ * latter case has to be extended UTF-8, so can fail immediately if that is
+ * forbidden */
+
+ if ( *s0 != I8_TO_NATIVE_UTF8(0xFF)
+ || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
+ {
+ return 0;
+ }
+
+ return is_utf8_FF_helper_(s0, e,
+ TRUE /* Require to be a partial character */
+ );
+#endif
+
}
/*
PERL_STATIC_INLINE UV
Perl_utf8n_to_uvchr_msgs(const U8 *s,
- STRLEN curlen,
- STRLEN *retlen,
- const U32 flags,
- U32 * errors,
- AV ** msgs)
+ STRLEN curlen,
+ STRLEN *retlen,
+ const U32 flags,
+ U32 * errors,
+ AV ** msgs)
{
/* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the
* simple cases, and, if necessary calls a helper function to deal with the
const U8 * const s0 = s;
const U8 * send = s0 + curlen;
- UV uv = 0; /* The 0 silences some stupid compilers */
- UV state = 0;
+ UV type;
+ UV uv;
PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
* Otherwise we call a helper function to figure out the more complicated
* cases. */
- while (s < send && LIKELY(state != 1)) {
- UV type = PL_strict_utf8_dfa_tab[*s];
+ /* No calls from core pass in an empty string; non-core need a check */
+#ifdef PERL_CORE
+ assert(curlen > 0);
+#else
+ if (curlen == 0) return _utf8n_to_uvchr_msgs_helper(s0, 0, retlen,
+ flags, errors, msgs);
+#endif
- uv = (state == 0)
- ? ((0xff >> type) & NATIVE_UTF8_TO_I8(*s))
- : UTF8_ACCUMULATE(uv, *s);
- state = PL_strict_utf8_dfa_tab[256 + state + type];
+ type = PL_strict_utf8_dfa_tab[*s];
- if (state != 0) {
- s++;
- continue;
- }
+ /* The table is structured so that 'type' is 0 iff the input byte is
+ * represented identically regardless of the UTF-8ness of the string */
+ if (type == 0) { /* UTF-8 invariants are returned unchanged */
+ uv = *s;
+ }
+ else {
+ UV state = PL_strict_utf8_dfa_tab[256 + type];
+ uv = (0xff >> type) & NATIVE_UTF8_TO_I8(*s);
- if (retlen) {
- *retlen = s - s0 + 1;
- }
- if (errors) {
- *errors = 0;
- }
- if (msgs) {
- *msgs = NULL;
+ while (++s < send) {
+ type = PL_strict_utf8_dfa_tab[*s];
+ state = PL_strict_utf8_dfa_tab[256 + state + type];
+
+ uv = UTF8_ACCUMULATE(uv, *s);
+
+ if (state == 0) {
+#ifdef EBCDIC
+ uv = UNI_TO_NATIVE(uv);
+#endif
+ goto success;
+ }
+
+ if (UNLIKELY(state == 1)) {
+ break;
+ }
}
- return UNI_TO_NATIVE(uv);
+ /* Here is potentially problematic. Use the full mechanism */
+ return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags,
+ errors, msgs);
}
- /* Here is potentially problematic. Use the full mechanism */
- return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags, errors, msgs);
+ success:
+ if (retlen) {
+ *retlen = s - s0 + 1;
+ }
+ if (errors) {
+ *errors = 0;
+ }
+ if (msgs) {
+ *msgs = NULL;
+ }
+
+ return uv;
}
PERL_STATIC_INLINE UV
}
else {
UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
- if (retlen && ret == 0 && *s != '\0') {
+ if (retlen && ret == 0 && (send <= s || *s != '\0')) {
*retlen = (STRLEN) -1;
}
/* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
-#if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C)
+#if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C)
#define MAX_CHARSET_NAME_LENGTH 2
SvREFCNT_dec(sv);
}
+/*
+=for apidoc newPADxVOP
+
+Constructs, checks and returns an op containing a pad offset. C<type> is
+the opcode, which should be one of C<OP_PADSV>, C<OP_PADAV>, C<OP_PADHV>
+or C<OP_PADCV>. The returned op will have the C<op_targ> field set by
+the C<padix> argument.
+
+This is convenient when constructing a large optree in nested function
+calls, as it avoids needing to store the pad op directly to set the
+C<op_targ> field as a side-effect. For example
+
+ o = op_append_elem(OP_LINESEQ, o,
+ newPADxVOP(OP_PADSV, 0, padix));
+
+=cut
+*/
+
+PERL_STATIC_INLINE OP *
+Perl_newPADxVOP(pTHX_ I32 type, I32 flags, PADOFFSET padix)
+{
+ PERL_ARGS_ASSERT_NEWPADXVOP;
+
+ assert(type == OP_PADSV || type == OP_PADAV || type == OP_PADHV
+ || type == OP_PADCV);
+ OP *o = newOP(type, flags);
+ o->op_targ = padix;
+ return o;
+}
+
/* ------------------ util.h ------------------------------------------- */
/*
*/
PERL_STATIC_INLINE I32
-Perl_foldEQ(const char *s1, const char *s2, I32 len)
+Perl_foldEQ(pTHX_ const char *s1, const char *s2, I32 len)
{
const U8 *a = (const U8 *)s1;
const U8 *b = (const U8 *)s2;
}
PERL_STATIC_INLINE I32
-Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len)
+Perl_foldEQ_latin1(pTHX_ const char *s1, const char *s2, I32 len)
{
/* Compare non-UTF-8 using Unicode (Latin1) semantics. Works on all folds
* representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and
*/
PERL_STATIC_INLINE I32
-Perl_foldEQ_locale(const char *s1, const char *s2, I32 len)
+Perl_foldEQ_locale(pTHX_ const char *s1, const char *s2, I32 len)
{
const U8 *a = (const U8 *)s1;
const U8 *b = (const U8 *)s2;
assert(len >= 0);
while (len--) {
- if (*a != *b && *a != PL_fold_locale[*b])
+ if (*a != *b && *a != PL_fold_locale[*b]) {
+ DEBUG_Lv(PerlIO_printf(Perl_debug_log,
+ "%s:%d: Our records indicate %02x is not a fold of %02x"
+ " or its mate %02x\n",
+ __FILE__, __LINE__, *a, *b, PL_fold_locale[*b]));
+
return 0;
+ }
a++,b++;
}
return 1;
* mutex is defined accordingly.
*
* But in all cases, using the mutex prevents these problems, as long as
- * all code uses the same mutex..
+ * all code uses the same mutex.
*
* A complication is that this can be called during phases where the
* mortalization process isn't available. These are in interpreter
}
}
- /* Then each of the three significant characters */
+ /* Then each of the four significant characters */
if (strchr(ret, 'm')) {
*mem_log_meat++ = 'm';
}
if (strchr(ret, 't')) {
*mem_log_meat++ = 't';
}
+ if (strchr(ret, 'c')) {
+ *mem_log_meat++ = 'c';
+ }
*mem_log_meat = '\0';
assert(mem_log_meat < PL_mem_log + sizeof(PL_mem_log));
ret = getenv(str);
if (ret != NULL) {
- ret = SvPVX(sv_2mortal(newSVpv(ret, 0)));
+ ret = SvPVX( newSVpvn_flags(ret, strlen(ret) ,SVs_TEMP) );
}
GETENV_UNLOCK;
return ret;
}
+PERL_STATIC_INLINE bool
+Perl_sv_isbool(pTHX_ const SV *sv)
+{
+ /* change to the following in 5.37, logically the same but
+ * more efficient and more future proof */
+#if 0
+ return (SvBoolFlagsOK(sv) && BOOL_INTERNALS_sv_isbool(sv));
+#else
+ return SvIOK(sv) && SvPOK(sv) && SvIsCOW_static(sv) &&
+ (SvPVX_const(sv) == PL_Yes || SvPVX_const(sv) == PL_No);
+#endif
+
+}
+
+#ifdef USE_ITHREADS
+
+PERL_STATIC_INLINE AV *
+Perl_cop_file_avn(pTHX_ const COP *cop) {
+
+ PERL_ARGS_ASSERT_COP_FILE_AVN;
+
+ const char *file = CopFILE(cop);
+ if (file) {
+ GV *gv = gv_fetchfile_flags(file, strlen(file), GVF_NOADD);
+ if (gv) {
+ return GvAVn(gv);
+ }
+ else
+ return NULL;
+ }
+ else
+ return NULL;
+}
+
+#endif
+
+PERL_STATIC_INLINE PADNAME *
+Perl_padname_refcnt_inc(PADNAME *pn)
+{
+ PadnameREFCNT(pn)++;
+ return pn;
+}
+
/*
* ex: set ts=8 sts=4 sw=4 et:
*/