+PERL_STATIC_INLINE void
+S_SvREFCNT_dec_NN(pTHX_ SV *sv)
+{
+ U32 rc = SvREFCNT(sv);
+ if (LIKELY(rc > 1))
+ SvREFCNT(sv) = rc - 1;
+ else
+ Perl_sv_free2(aTHX_ sv, rc);
+}
+
+PERL_STATIC_INLINE void
+SvAMAGIC_on(SV *sv)
+{
+ assert(SvROK(sv));
+ if (SvOBJECT(SvRV(sv))) HvAMAGIC_on(SvSTASH(SvRV(sv)));
+}
+PERL_STATIC_INLINE void
+SvAMAGIC_off(SV *sv)
+{
+ if (SvROK(sv) && SvOBJECT(SvRV(sv)))
+ HvAMAGIC_off(SvSTASH(SvRV(sv)));
+}
+
+PERL_STATIC_INLINE U32
+S_SvPADSTALE_on(SV *sv)
+{
+ assert(!(SvFLAGS(sv) & SVs_PADTMP));
+ return SvFLAGS(sv) |= SVs_PADSTALE;
+}
+PERL_STATIC_INLINE U32
+S_SvPADSTALE_off(SV *sv)
+{
+ assert(!(SvFLAGS(sv) & SVs_PADTMP));
+ return SvFLAGS(sv) &= ~SVs_PADSTALE;
+}
+#if defined(PERL_CORE) || defined (PERL_EXT)
+PERL_STATIC_INLINE STRLEN
+S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp)
+{
+ PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B;
+ if (SvGAMAGIC(sv)) {
+ U8 *hopped = utf8_hop((U8 *)pv, pos);
+ if (lenp) *lenp = (STRLEN)(utf8_hop(hopped, *lenp) - hopped);
+ return (STRLEN)(hopped - (U8 *)pv);
+ }
+ return sv_pos_u2b_flags(sv,pos,lenp,SV_CONST_RETURN);
+}
+#endif
+
+/* ------------------------------- handy.h ------------------------------- */
+
+/* saves machine code for a common noreturn idiom typically used in Newx*() */
+GCC_DIAG_IGNORE_DECL(-Wunused-function);
+static void
+S_croak_memory_wrap(void)
+{
+ Perl_croak_nocontext("%s",PL_memory_wrap);
+}
+GCC_DIAG_RESTORE_DECL;
+
+/* ------------------------------- utf8.h ------------------------------- */
+
+/*
+=head1 Unicode Support
+*/
+
+PERL_STATIC_INLINE void
+S_append_utf8_from_native_byte(const U8 byte, U8** dest)
+{
+ /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
+ * encoded string at '*dest', updating '*dest' to include it */
+
+ PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
+
+ if (NATIVE_BYTE_IS_INVARIANT(byte))
+ *((*dest)++) = byte;
+ else {
+ *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
+ *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
+ }
+}
+
+/*
+=for apidoc valid_utf8_to_uvchr
+Like C<L</utf8_to_uvchr_buf>>, but should only be called when it is known that
+the next character in the input UTF-8 string C<s> is well-formed (I<e.g.>,
+it passes C<L</isUTF8_CHAR>>. Surrogates, non-character code points, and
+non-Unicode code points are allowed.
+
+=cut
+
+ */
+
+PERL_STATIC_INLINE UV
+Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
+{
+ const UV expectlen = UTF8SKIP(s);
+ const U8* send = s + expectlen;
+ UV uv = *s;
+
+ PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
+
+ if (retlen) {
+ *retlen = expectlen;
+ }
+
+ /* An invariant is trivially returned */
+ if (expectlen == 1) {
+ return uv;
+ }
+
+ /* Remove the leading bits that indicate the number of bytes, leaving just
+ * the bits that are part of the value */
+ uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
+
+ /* Now, loop through the remaining bytes, accumulating each into the
+ * working total as we go. (I khw tried unrolling the loop for up to 4
+ * bytes, but there was no performance improvement) */
+ for (++s; s < send; s++) {
+ uv = UTF8_ACCUMULATE(uv, *s);
+ }
+
+ return UNI_TO_NATIVE(uv);
+
+}
+
+/*
+=for apidoc is_utf8_invariant_string
+
+Returns TRUE if the first C<len> bytes of the string C<s> are the same
+regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
+EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they
+are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only
+the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range
+characters are invariant, but so also are the C1 controls.
+
+If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
+use this option, that C<s> can't have embedded C<NUL> characters and has to
+have a terminating C<NUL> byte).
+
+See also
+C<L</is_utf8_string>>,
+C<L</is_utf8_string_flags>>,
+C<L</is_utf8_string_loc>>,
+C<L</is_utf8_string_loc_flags>>,
+C<L</is_utf8_string_loclen>>,
+C<L</is_utf8_string_loclen_flags>>,
+C<L</is_utf8_fixed_width_buf_flags>>,
+C<L</is_utf8_fixed_width_buf_loc_flags>>,
+C<L</is_utf8_fixed_width_buf_loclen_flags>>,
+C<L</is_strict_utf8_string>>,
+C<L</is_strict_utf8_string_loc>>,
+C<L</is_strict_utf8_string_loclen>>,
+C<L</is_c9strict_utf8_string>>,
+C<L</is_c9strict_utf8_string_loc>>,
+and
+C<L</is_c9strict_utf8_string_loclen>>.
+
+=cut
+
+*/
+
+#define is_utf8_invariant_string(s, len) \
+ is_utf8_invariant_string_loc(s, len, NULL)
+
+/*
+=for apidoc is_utf8_invariant_string_loc
+
+Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
+the first UTF-8 variant character in the C<ep> pointer; if all characters are
+UTF-8 invariant, this function does not change the contents of C<*ep>.
+
+=cut
+
+*/
+
+PERL_STATIC_INLINE bool
+S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
+{
+ const U8* send;
+ const U8* x = s;
+
+ PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
+
+ if (len == 0) {
+ len = strlen((const char *)s);
+ }
+
+ send = s + len;
+
+/* This looks like 0x010101... */
+# define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF)
+
+/* This looks like 0x808080... */
+# define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
+# define PERL_WORDSIZE sizeof(PERL_UINTMAX_T)
+# define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
+
+/* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
+ * or'ing together the lowest bits of 'x'. Hopefully the final term gets
+ * optimized out completely on a 32-bit system, and its mask gets optimized out
+ * on a 64-bit system */
+# define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \
+ | ( PTR2nat(x) >> 1) \
+ | ( ( (PTR2nat(x) \
+ & PERL_WORD_BOUNDARY_MASK) >> 2))))
+
+#ifndef EBCDIC
+
+ /* Do the word-at-a-time iff there is at least one usable full word. That
+ * means that after advancing to a word boundary, there still is at least a
+ * full word left. The number of bytes needed to advance is 'wordsize -
+ * offset' unless offset is 0. */
+ if ((STRLEN) (send - x) >= PERL_WORDSIZE
+
+ /* This term is wordsize if subword; 0 if not */
+ + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
+
+ /* 'offset' */
+ - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
+ {
+
+ /* Process per-byte until reach word boundary. XXX This loop could be
+ * eliminated if we knew that this platform had fast unaligned reads */
+ while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
+ if (! UTF8_IS_INVARIANT(*x)) {
+ if (ep) {
+ *ep = x;
+ }
+
+ return FALSE;
+ }
+ x++;
+ }
+
+ /* Here, we know we have at least one full word to process. Process
+ * per-word as long as we have at least a full word left */
+ do {
+ if ((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) {
+
+ /* Found a variant. Just return if caller doesn't want its
+ * exact position */
+ if (! ep) {
+ return FALSE;
+ }
+
+# if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \
+ || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
+
+ *ep = x + _variant_byte_number(* (PERL_UINTMAX_T *) x);
+ assert(*ep >= s && *ep < send);
+
+ return FALSE;
+
+# else /* If weird byte order, drop into next loop to do byte-at-a-time
+ checks. */
+
+ break;
+# endif
+ }
+
+ x += PERL_WORDSIZE;
+
+ } while (x + PERL_WORDSIZE <= send);
+ }
+
+#endif /* End of ! EBCDIC */
+
+ /* Process per-byte */
+ while (x < send) {
+ if (! UTF8_IS_INVARIANT(*x)) {
+ if (ep) {
+ *ep = x;
+ }
+
+ return FALSE;
+ }
+
+ x++;
+ }
+
+ return TRUE;
+}
+
+#ifndef EBCDIC
+
+PERL_STATIC_INLINE unsigned int
+S__variant_byte_number(PERL_UINTMAX_T word)
+{
+
+ /* This returns the position in a word (0..7) of the first variant byte in
+ * it. This is a helper function. Note that there are no branches */
+
+ assert(word);
+
+ /* Get just the msb bits of each byte */
+ word &= PERL_VARIANTS_WORD_MASK;
+
+# ifdef USING_MSVC6 /* VC6 has some issues with the normal code, and the
+ easiest thing is to hide that from the callers */
+ {
+ unsigned int i;
+ const U8 * s = (U8 *) &word;
+ dTHX;
+
+ for (i = 0; i < sizeof(word); i++ ) {
+ if (s[i]) {
+ return i;
+ }
+ }
+
+ Perl_croak(aTHX_ "panic: %s: %d: unexpected zero word\n",
+ __FILE__, __LINE__);
+ }
+
+# elif BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
+
+ /* Bytes are stored like
+ * Byte8 ... Byte2 Byte1
+ * 63..56...15...8 7...0
+ *
+ * Isolate the lsb;
+ * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
+ *
+ * The word will look this this, with a rightmost set bit in position 's':
+ * ('x's are don't cares)
+ * s
+ * x..x100..0
+ * x..xx10..0 Right shift (rightmost 0 is shifted off)
+ * x..xx01..1 Subtract 1, turns all the trailing zeros into 1's and
+ * the 1 just to their left into a 0; the remainder is
+ * untouched
+ * 0..0011..1 The xor with the original, x..xx10..0, clears that
+ * remainder, sets the bottom to all 1
+ * 0..0100..0 Add 1 to clear the word except for the bit in 's'
+ *
+ * Another method is to do 'word &= -word'; but it generates a compiler
+ * message on some platforms about taking the negative of an unsigned */
+
+ word >>= 1;
+ word = 1 + (word ^ (word - 1));
+
+# elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
+
+ /* Bytes are stored like
+ * Byte1 Byte2 ... Byte8
+ * 63..56 55..47 ... 7...0
+ *
+ * Isolate the msb; http://codeforces.com/blog/entry/10330
+ *
+ * Only the most significant set bit matters. Or'ing word with its right
+ * shift of 1 makes that bit and the next one to its right both 1. Then
+ * right shifting by 2 makes for 4 1-bits in a row. ... We end with the
+ * msb and all to the right being 1. */
+ word |= word >> 1;
+ word |= word >> 2;
+ word |= word >> 4;
+ word |= word >> 8;
+ word |= word >> 16;
+ word |= word >> 32; /* This should get optimized out on 32-bit systems. */
+
+ /* Then subtracting the right shift by 1 clears all but the left-most of
+ * the 1 bits, which is our desired result */
+ word -= (word >> 1);
+
+# else
+# error Unexpected byte order
+# endif
+
+ /* Here 'word' has a single bit set: the msb of the first byte in which it
+ * is set. Calculate that position in the word. We can use this
+ * specialized solution: https://stackoverflow.com/a/32339674/1626653,
+ * assumes an 8-bit byte. (On a 32-bit machine, the larger numbers should
+ * just get shifted off at compile time) */
+ word = (word >> 7) * ((UINTMAX_C( 7) << 56) | (UINTMAX_C(15) << 48)
+ | (UINTMAX_C(23) << 40) | (UINTMAX_C(31) << 32)
+ | (39 << 24) | (47 << 16)
+ | (55 << 8) | (63 << 0));
+ word >>= PERL_WORDSIZE * 7; /* >> by either 56 or 24 */
+
+ /* Here, word contains the position 7..63 of that bit. Convert to 0..7 */
+ word = ((word + 1) >> 3) - 1;
+
+# if BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
+
+ /* And invert the result */
+ word = CHARBITS - word - 1;
+
+# endif
+
+ return (unsigned int) word;
+}
+
+#endif
+#if defined(PERL_CORE) || defined(PERL_EXT)
+
+/*
+=for apidoc variant_under_utf8_count
+
+This function looks at the sequence of bytes between C<s> and C<e>, which are
+assumed to be encoded in ASCII/Latin1, and returns how many of them would
+change should the string be translated into UTF-8. Due to the nature of UTF-8,
+each of these would occupy two bytes instead of the single one in the input
+string. Thus, this function returns the precise number of bytes the string
+would expand by when translated to UTF-8.
+
+Unlike most of the other functions that have C<utf8> in their name, the input
+to this function is NOT a UTF-8-encoded string. The function name is slightly
+I<odd> to emphasize this.
+
+This function is internal to Perl because khw thinks that any XS code that
+would want this is probably operating too close to the internals. Presenting a
+valid use case could change that.
+
+See also
+C<L<perlapi/is_utf8_invariant_string>>
+and
+C<L<perlapi/is_utf8_invariant_string_loc>>,
+
+=cut
+
+*/
+
+PERL_STATIC_INLINE Size_t
+S_variant_under_utf8_count(const U8* const s, const U8* const e)
+{
+ const U8* x = s;
+ Size_t count = 0;
+
+ PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
+
+# ifndef EBCDIC
+
+ /* Test if the string is long enough to use word-at-a-time. (Logic is the
+ * same as for is_utf8_invariant_string()) */
+ if ((STRLEN) (e - x) >= PERL_WORDSIZE
+ + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
+ - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
+ {
+
+ /* Process per-byte until reach word boundary. XXX This loop could be
+ * eliminated if we knew that this platform had fast unaligned reads */
+ while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
+ count += ! UTF8_IS_INVARIANT(*x++);
+ }
+
+ /* Process per-word as long as we have at least a full word left */
+ do { /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
+ explanation of how this works */
+ PERL_UINTMAX_T increment
+ = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
+ * PERL_COUNT_MULTIPLIER)
+ >> ((PERL_WORDSIZE - 1) * CHARBITS);
+ count += (Size_t) increment;
+ x += PERL_WORDSIZE;
+ } while (x + PERL_WORDSIZE <= e);
+ }
+
+# endif
+
+ /* Process per-byte */
+ while (x < e) {
+ if (! UTF8_IS_INVARIANT(*x)) {
+ count++;
+ }
+
+ x++;
+ }
+
+ return count;
+}
+
+#endif
+
+#ifndef PERL_IN_REGEXEC_C /* Keep these around for that file */
+# undef PERL_WORDSIZE
+# undef PERL_COUNT_MULTIPLIER
+# undef PERL_WORD_BOUNDARY_MASK
+# undef PERL_VARIANTS_WORD_MASK
+#endif
+
+/*
+=for apidoc is_utf8_string
+
+Returns TRUE if the first C<len> bytes of string C<s> form a valid
+Perl-extended-UTF-8 string; returns FALSE otherwise. If C<len> is 0, it will
+be calculated using C<strlen(s)> (which means if you use this option, that C<s>
+can't have embedded C<NUL> characters and has to have a terminating C<NUL>
+byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
+
+This function considers Perl's extended UTF-8 to be valid. That means that
+code points above Unicode, surrogates, and non-character code points are
+considered valid by this function. Use C<L</is_strict_utf8_string>>,
+C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
+code points are considered valid.
+
+See also
+C<L</is_utf8_invariant_string>>,
+C<L</is_utf8_invariant_string_loc>>,
+C<L</is_utf8_string_loc>>,
+C<L</is_utf8_string_loclen>>,
+C<L</is_utf8_fixed_width_buf_flags>>,
+C<L</is_utf8_fixed_width_buf_loc_flags>>,
+C<L</is_utf8_fixed_width_buf_loclen_flags>>,
+
+=cut
+*/
+
+#define is_utf8_string(s, len) is_utf8_string_loclen(s, len, NULL, NULL)
+
+#if defined(PERL_CORE) || defined (PERL_EXT)
+
+/*
+=for apidoc is_utf8_non_invariant_string
+
+Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
+C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
+UTF-8; otherwise returns FALSE.
+
+A TRUE return means that at least one code point represented by the sequence
+either is a wide character not representable as a single byte, or the
+representation differs depending on whether the sequence is encoded in UTF-8 or
+not.
+
+See also
+C<L<perlapi/is_utf8_invariant_string>>,
+C<L<perlapi/is_utf8_string>>
+
+=cut
+
+This is commonly used to determine if a SV's UTF-8 flag should be turned on.
+It generally needn't be if its string is entirely UTF-8 invariant, and it
+shouldn't be if it otherwise contains invalid UTF-8.
+
+It is an internal function because khw thinks that XS code shouldn't be working
+at this low a level. A valid use case could change that.
+
+*/
+
+PERL_STATIC_INLINE bool
+S_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
+{
+ const U8 * first_variant;
+
+ PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
+
+ if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
+ return FALSE;
+ }
+
+ return is_utf8_string(first_variant, len - (first_variant - s));
+}
+
+#endif
+
+/*
+=for apidoc is_strict_utf8_string
+
+Returns TRUE if the first C<len> bytes of string C<s> form a valid
+UTF-8-encoded string that is fully interchangeable by any application using
+Unicode rules; otherwise it returns FALSE. If C<len> is 0, it will be
+calculated using C<strlen(s)> (which means if you use this option, that C<s>
+can't have embedded C<NUL> characters and has to have a terminating C<NUL>
+byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
+
+This function returns FALSE for strings containing any
+code points above the Unicode max of 0x10FFFF, surrogate code points, or
+non-character code points.
+
+See also
+C<L</is_utf8_invariant_string>>,
+C<L</is_utf8_invariant_string_loc>>,
+C<L</is_utf8_string>>,
+C<L</is_utf8_string_flags>>,
+C<L</is_utf8_string_loc>>,
+C<L</is_utf8_string_loc_flags>>,
+C<L</is_utf8_string_loclen>>,
+C<L</is_utf8_string_loclen_flags>>,
+C<L</is_utf8_fixed_width_buf_flags>>,
+C<L</is_utf8_fixed_width_buf_loc_flags>>,
+C<L</is_utf8_fixed_width_buf_loclen_flags>>,
+C<L</is_strict_utf8_string_loc>>,
+C<L</is_strict_utf8_string_loclen>>,
+C<L</is_c9strict_utf8_string>>,
+C<L</is_c9strict_utf8_string_loc>>,
+and
+C<L</is_c9strict_utf8_string_loclen>>.
+
+=cut
+*/
+
+#define is_strict_utf8_string(s, len) is_strict_utf8_string_loclen(s, len, NULL, NULL)
+
+/*
+=for apidoc is_c9strict_utf8_string
+
+Returns TRUE if the first C<len> bytes of string C<s> form a valid
+UTF-8-encoded string that conforms to
+L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
+otherwise it returns FALSE. If C<len> is 0, it will be calculated using
+C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
+C<NUL> characters and has to have a terminating C<NUL> byte). Note that all
+characters being ASCII constitute 'a valid UTF-8 string'.
+
+This function returns FALSE for strings containing any code points above the
+Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
+code points per
+L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
+
+See also
+C<L</is_utf8_invariant_string>>,
+C<L</is_utf8_invariant_string_loc>>,
+C<L</is_utf8_string>>,
+C<L</is_utf8_string_flags>>,
+C<L</is_utf8_string_loc>>,
+C<L</is_utf8_string_loc_flags>>,
+C<L</is_utf8_string_loclen>>,
+C<L</is_utf8_string_loclen_flags>>,
+C<L</is_utf8_fixed_width_buf_flags>>,
+C<L</is_utf8_fixed_width_buf_loc_flags>>,
+C<L</is_utf8_fixed_width_buf_loclen_flags>>,
+C<L</is_strict_utf8_string>>,
+C<L</is_strict_utf8_string_loc>>,
+C<L</is_strict_utf8_string_loclen>>,
+C<L</is_c9strict_utf8_string_loc>>,
+and
+C<L</is_c9strict_utf8_string_loclen>>.
+
+=cut
+*/
+
+#define is_c9strict_utf8_string(s, len) is_c9strict_utf8_string_loclen(s, len, NULL, 0)
+
+/*
+=for apidoc is_utf8_string_flags
+
+Returns TRUE if the first C<len> bytes of string C<s> form a valid
+UTF-8 string, subject to the restrictions imposed by C<flags>;
+returns FALSE otherwise. If C<len> is 0, it will be calculated
+using C<strlen(s)> (which means if you use this option, that C<s> can't have
+embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note
+that all characters being ASCII constitute 'a valid UTF-8 string'.
+
+If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
+C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
+as C<L</is_strict_utf8_string>>; and if C<flags> is
+C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
+C<L</is_c9strict_utf8_string>>. Otherwise C<flags> may be any
+combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
+C<L</utf8n_to_uvchr>>, with the same meanings.
+
+See also
+C<L</is_utf8_invariant_string>>,
+C<L</is_utf8_invariant_string_loc>>,
+C<L</is_utf8_string>>,
+C<L</is_utf8_string_loc>>,
+C<L</is_utf8_string_loc_flags>>,
+C<L</is_utf8_string_loclen>>,
+C<L</is_utf8_string_loclen_flags>>,
+C<L</is_utf8_fixed_width_buf_flags>>,
+C<L</is_utf8_fixed_width_buf_loc_flags>>,
+C<L</is_utf8_fixed_width_buf_loclen_flags>>,
+C<L</is_strict_utf8_string>>,
+C<L</is_strict_utf8_string_loc>>,
+C<L</is_strict_utf8_string_loclen>>,
+C<L</is_c9strict_utf8_string>>,
+C<L</is_c9strict_utf8_string_loc>>,
+and
+C<L</is_c9strict_utf8_string_loclen>>.
+
+=cut
+*/
+
+PERL_STATIC_INLINE bool
+S_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
+{
+ const U8 * first_variant;
+
+ PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
+ assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+ |UTF8_DISALLOW_PERL_EXTENDED)));
+
+ if (len == 0) {
+ len = strlen((const char *)s);
+ }
+
+ if (flags == 0) {
+ return is_utf8_string(s, len);
+ }
+
+ if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
+ == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
+ {
+ return is_strict_utf8_string(s, len);
+ }
+
+ if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
+ == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
+ {
+ return is_c9strict_utf8_string(s, len);
+ }
+
+ if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
+ const U8* const send = s + len;
+ const U8* x = first_variant;
+
+ while (x < send) {
+ STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
+ if (UNLIKELY(! cur_len)) {
+ return FALSE;
+ }
+ x += cur_len;
+ }
+ }
+
+ return TRUE;
+}
+
+/*
+
+=for apidoc is_utf8_string_loc
+
+Like C<L</is_utf8_string>> but stores the location of the failure (in the
+case of "utf8ness failure") or the location C<s>+C<len> (in the case of
+"utf8ness success") in the C<ep> pointer.
+
+See also C<L</is_utf8_string_loclen>>.
+
+=cut
+*/
+
+#define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0)
+
+/*
+
+=for apidoc is_utf8_string_loclen
+
+Like C<L</is_utf8_string>> but stores the location of the failure (in the
+case of "utf8ness failure") or the location C<s>+C<len> (in the case of
+"utf8ness success") in the C<ep> pointer, and the number of UTF-8
+encoded characters in the C<el> pointer.
+
+See also C<L</is_utf8_string_loc>>.
+
+=cut
+*/
+
+PERL_STATIC_INLINE bool
+Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
+{
+ const U8 * first_variant;
+
+ PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
+
+ if (len == 0) {
+ len = strlen((const char *) s);
+ }
+
+ if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
+ if (el)
+ *el = len;
+
+ if (ep) {
+ *ep = s + len;
+ }
+
+ return TRUE;
+ }
+
+ {
+ const U8* const send = s + len;
+ const U8* x = first_variant;
+ STRLEN outlen = first_variant - s;
+
+ while (x < send) {
+ const STRLEN cur_len = isUTF8_CHAR(x, send);
+ if (UNLIKELY(! cur_len)) {
+ break;
+ }
+ x += cur_len;
+ outlen++;
+ }
+
+ if (el)
+ *el = outlen;
+
+ if (ep) {
+ *ep = x;
+ }
+
+ return (x == send);
+ }
+}
+
+/*
+
+=for apidoc isUTF8_CHAR
+
+Evaluates to non-zero if the first few bytes of the string starting at C<s> and
+looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
+that represents some code point; otherwise it evaluates to 0. If non-zero, the
+value gives how many bytes starting at C<s> comprise the code point's
+representation. Any bytes remaining before C<e>, but beyond the ones needed to
+form the first code point in C<s>, are not examined.
+
+The code point can be any that will fit in an IV on this machine, using Perl's
+extension to official UTF-8 to represent those higher than the Unicode maximum
+of 0x10FFFF. That means that this macro is used to efficiently decide if the
+next few bytes in C<s> is legal UTF-8 for a single character.
+
+Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
+defined by Unicode to be fully interchangeable across applications;
+C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
+#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
+code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
+
+Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
+C<L</is_utf8_string_loclen>> to check entire strings.
+
+Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
+machines) is a valid UTF-8 character.
+
+=cut
+
+This uses an adaptation of the table and algorithm given in
+http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
+documentation of the original version. A copyright notice for the original
+version is given at the beginning of this file. The Perl adapation is
+documented at the definition of PL_extended_utf8_dfa_tab[].
+
+*/
+
+PERL_STATIC_INLINE Size_t
+S_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
+{
+ const U8 * s = s0;
+ UV state = 0;
+
+ PERL_ARGS_ASSERT_ISUTF8_CHAR;
+
+ /* This dfa is fast. If it accepts the input, it was for a well-formed,
+ * code point, which can be returned immediately. Otherwise, it is either
+ * malformed, or for the start byte FF which the dfa doesn't handle (except
+ * on 32-bit ASCII platforms where it trivially is an error). Call a
+ * helper function for the other platforms. */
+
+ while (s < e && LIKELY(state != 1)) {
+ state = PL_extended_utf8_dfa_tab[256
+ + state
+ + PL_extended_utf8_dfa_tab[*s]];
+ if (state != 0) {
+ s++;
+ continue;
+ }
+
+ return s - s0 + 1;
+ }
+
+#if defined(UV_IS_QUAD) || defined(EBCDIC)
+
+ if (NATIVE_UTF8_TO_I8(*s0) == 0xFF && e - s0 >= UTF8_MAXBYTES) {
+ return _is_utf8_char_helper(s0, e, 0);
+ }
+
+#endif
+
+ return 0;
+}
+
+/*
+
+=for apidoc isSTRICT_UTF8_CHAR
+
+Evaluates to non-zero if the first few bytes of the string starting at C<s> and
+looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
+Unicode code point completely acceptable for open interchange between all
+applications; otherwise it evaluates to 0. If non-zero, the value gives how
+many bytes starting at C<s> comprise the code point's representation. Any
+bytes remaining before C<e>, but beyond the ones needed to form the first code
+point in C<s>, are not examined.
+
+The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
+be a surrogate nor a non-character code point. Thus this excludes any code
+point from Perl's extended UTF-8.
+
+This is used to efficiently decide if the next few bytes in C<s> is
+legal Unicode-acceptable UTF-8 for a single character.
+
+Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
+#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
+code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
+and C<L</isUTF8_CHAR_flags>> for a more customized definition.
+
+Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
+C<L</is_strict_utf8_string_loclen>> to check entire strings.
+
+=cut
+
+This uses an adaptation of the tables and algorithm given in
+http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
+documentation of the original version. A copyright notice for the original
+version is given at the beginning of this file. The Perl adapation is
+documented at the definition of strict_extended_utf8_dfa_tab[].
+
+*/
+
+PERL_STATIC_INLINE Size_t
+S_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
+{
+ const U8 * s = s0;
+ UV state = 0;
+
+ PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
+
+ while (s < e && LIKELY(state != 1)) {
+ state = PL_strict_utf8_dfa_tab[256 + state + PL_strict_utf8_dfa_tab[*s]];
+
+ if (state != 0) {
+ s++;
+ continue;
+ }
+
+ return s - s0 + 1;
+ }
+
+#ifndef EBCDIC
+
+ /* The dfa above drops out for certain Hanguls; handle them specially */
+ if (is_HANGUL_ED_utf8_safe(s0, e)) {
+ return 3;
+ }
+
+#endif
+
+ return 0;
+}
+
+/*
+
+=for apidoc isC9_STRICT_UTF8_CHAR
+
+Evaluates to non-zero if the first few bytes of the string starting at C<s> and
+looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
+Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero,
+the value gives how many bytes starting at C<s> comprise the code point's
+representation. Any bytes remaining before C<e>, but beyond the ones needed to
+form the first code point in C<s>, are not examined.
+
+The largest acceptable code point is the Unicode maximum 0x10FFFF. This
+differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
+code points. This corresponds to
+L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
+which said that non-character code points are merely discouraged rather than
+completely forbidden in open interchange. See
+L<perlunicode/Noncharacter code points>.
+
+Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
+C<L</isUTF8_CHAR_flags>> for a more customized definition.
+
+Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
+C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
+
+=cut
+
+This uses an adaptation of the tables and algorithm given in
+http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
+documentation of the original version. A copyright notice for the original
+version is given at the beginning of this file. The Perl adapation is
+documented at the definition of PL_c9_utf8_dfa_tab[].
+
+*/
+
+PERL_STATIC_INLINE Size_t
+S_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
+{
+ const U8 * s = s0;
+ UV state = 0;
+
+ PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
+
+ while (s < e && LIKELY(state != 1)) {
+ state = PL_c9_utf8_dfa_tab[256 + state + PL_c9_utf8_dfa_tab[*s]];
+
+ if (state != 0) {
+ s++;
+ continue;
+ }
+
+ return s - s0 + 1;
+ }
+
+ return 0;
+}
+
+/*
+
+=for apidoc is_strict_utf8_string_loc
+
+Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
+case of "utf8ness failure") or the location C<s>+C<len> (in the case of
+"utf8ness success") in the C<ep> pointer.
+
+See also C<L</is_strict_utf8_string_loclen>>.
+
+=cut
+*/
+
+#define is_strict_utf8_string_loc(s, len, ep) \
+ is_strict_utf8_string_loclen(s, len, ep, 0)
+
+/*
+
+=for apidoc is_strict_utf8_string_loclen
+
+Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
+case of "utf8ness failure") or the location C<s>+C<len> (in the case of
+"utf8ness success") in the C<ep> pointer, and the number of UTF-8
+encoded characters in the C<el> pointer.
+
+See also C<L</is_strict_utf8_string_loc>>.
+
+=cut
+*/
+
+PERL_STATIC_INLINE bool
+S_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
+{
+ const U8 * first_variant;
+
+ PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
+
+ if (len == 0) {
+ len = strlen((const char *) s);
+ }
+
+ if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
+ if (el)
+ *el = len;
+
+ if (ep) {
+ *ep = s + len;
+ }
+
+ return TRUE;
+ }
+
+ {
+ const U8* const send = s + len;
+ const U8* x = first_variant;
+ STRLEN outlen = first_variant - s;
+
+ while (x < send) {
+ const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
+ if (UNLIKELY(! cur_len)) {
+ break;
+ }
+ x += cur_len;
+ outlen++;
+ }
+
+ if (el)
+ *el = outlen;
+
+ if (ep) {
+ *ep = x;
+ }
+
+ return (x == send);
+ }
+}
+
+/*
+
+=for apidoc is_c9strict_utf8_string_loc
+
+Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
+the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
+"utf8ness success") in the C<ep> pointer.
+
+See also C<L</is_c9strict_utf8_string_loclen>>.
+
+=cut
+*/
+
+#define is_c9strict_utf8_string_loc(s, len, ep) \
+ is_c9strict_utf8_string_loclen(s, len, ep, 0)
+
+/*
+
+=for apidoc is_c9strict_utf8_string_loclen
+
+Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
+the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
+"utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
+characters in the C<el> pointer.
+
+See also C<L</is_c9strict_utf8_string_loc>>.
+
+=cut
+*/
+
+PERL_STATIC_INLINE bool
+S_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
+{
+ const U8 * first_variant;
+
+ PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
+
+ if (len == 0) {
+ len = strlen((const char *) s);
+ }
+
+ if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
+ if (el)
+ *el = len;
+
+ if (ep) {
+ *ep = s + len;
+ }
+
+ return TRUE;
+ }
+
+ {
+ const U8* const send = s + len;
+ const U8* x = first_variant;
+ STRLEN outlen = first_variant - s;
+
+ while (x < send) {
+ const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
+ if (UNLIKELY(! cur_len)) {
+ break;
+ }
+ x += cur_len;
+ outlen++;
+ }
+
+ if (el)
+ *el = outlen;
+
+ if (ep) {
+ *ep = x;
+ }
+
+ return (x == send);
+ }
+}
+
+/*
+
+=for apidoc is_utf8_string_loc_flags
+
+Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
+case of "utf8ness failure") or the location C<s>+C<len> (in the case of
+"utf8ness success") in the C<ep> pointer.
+
+See also C<L</is_utf8_string_loclen_flags>>.
+
+=cut
+*/
+
+#define is_utf8_string_loc_flags(s, len, ep, flags) \
+ is_utf8_string_loclen_flags(s, len, ep, 0, flags)
+
+
+/* The above 3 actual functions could have been moved into the more general one
+ * just below, and made #defines that call it with the right 'flags'. They are
+ * currently kept separate to increase their chances of getting inlined */
+
+/*
+
+=for apidoc is_utf8_string_loclen_flags
+
+Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
+case of "utf8ness failure") or the location C<s>+C<len> (in the case of
+"utf8ness success") in the C<ep> pointer, and the number of UTF-8
+encoded characters in the C<el> pointer.
+
+See also C<L</is_utf8_string_loc_flags>>.
+
+=cut
+*/
+
+PERL_STATIC_INLINE bool
+S_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
+{
+ const U8 * first_variant;
+
+ PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
+ assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+ |UTF8_DISALLOW_PERL_EXTENDED)));
+
+ if (len == 0) {
+ len = strlen((const char *) s);
+ }
+
+ if (flags == 0) {
+ return is_utf8_string_loclen(s, len, ep, el);
+ }
+
+ if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
+ == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
+ {
+ return is_strict_utf8_string_loclen(s, len, ep, el);
+ }
+
+ if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
+ == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
+ {
+ return is_c9strict_utf8_string_loclen(s, len, ep, el);
+ }
+
+ if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
+ if (el)
+ *el = len;
+
+ if (ep) {
+ *ep = s + len;
+ }
+
+ return TRUE;
+ }
+
+ {
+ const U8* send = s + len;
+ const U8* x = first_variant;
+ STRLEN outlen = first_variant - s;
+
+ while (x < send) {
+ const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
+ if (UNLIKELY(! cur_len)) {
+ break;
+ }
+ x += cur_len;
+ outlen++;
+ }
+
+ if (el)
+ *el = outlen;
+
+ if (ep) {
+ *ep = x;
+ }
+
+ return (x == send);
+ }
+}
+
+/*
+=for apidoc utf8_distance