inline.h: Use 'do {} while' instead of 'while {}'

[perl5.git] / inline.h
diff --git a/inline.h b/inline.h

index f709572..0d43656 100644 (file)
--- a/inline.h
+++ b/inline.h
@@ -5,8 +5,33 @@
   *    You may distribute under the terms of either the GNU General Public
   *    License or the Artistic License, as specified in the README file.
   *
+ *    This file contains tables and code adapted from
+ *    http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this
+ *    copyright notice:
+
+Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+ *
   * This file is a home for static inline functions that cannot go in other
- * headers files, because they depend on proto.h (included after most other
+ * header files, because they depend on proto.h (included after most other
   * headers) or struct definitions.
   *
   * Each section names the header file that the functions "belong" to.
@@ -131,7 +156,7 @@ PERL_STATIC_INLINE I32
  S_TOPMARK(pTHX)
  {
      DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
-                                "MARK top  %p %"IVdf"\n",
+                                "MARK top  %p %" IVdf "\n",
                                   PL_markstack_ptr,
                                   (IV)*PL_markstack_ptr)));
      return *PL_markstack_ptr;
@@ -141,7 +166,7 @@ PERL_STATIC_INLINE I32
  S_POPMARK(pTHX)
  {
      DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
-                                "MARK pop  %p %"IVdf"\n",
+                                "MARK pop  %p %" IVdf "\n",
                                   (PL_markstack_ptr-1),
                                   (IV)*(PL_markstack_ptr-1))));
      assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
@@ -153,8 +178,10 @@ S_POPMARK(pTHX)
  PERL_STATIC_INLINE struct regexp *
  S_ReANY(const REGEXP * const re)
  {
+    XPV* const p = (XPV*)SvANY(re);
      assert(isREGEXP(re));
-    return re->sv_u.svu_rx;
+    return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
+                                   : (struct regexp *)p;
  }
  
  /* ------------------------------- sv.h ------------------------------- */
@@ -237,295 +264,1582 @@ S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp)
      }
      return sv_pos_u2b_flags(sv,pos,lenp,SV_CONST_RETURN);
  }
-#endif
+#endif
+
+/* ------------------------------- handy.h ------------------------------- */
+
+/* saves machine code for a common noreturn idiom typically used in Newx*() */
+GCC_DIAG_IGNORE_DECL(-Wunused-function);
+static void
+S_croak_memory_wrap(void)
+{
+    Perl_croak_nocontext("%s",PL_memory_wrap);
+}
+GCC_DIAG_RESTORE_DECL;
+
+/* ------------------------------- utf8.h ------------------------------- */
+
+/*
+=head1 Unicode Support
+*/
+
+PERL_STATIC_INLINE void
+S_append_utf8_from_native_byte(const U8 byte, U8** dest)
+{
+    /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
+     * encoded string at '*dest', updating '*dest' to include it */
+
+    PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
+
+    if (NATIVE_BYTE_IS_INVARIANT(byte))
+        *((*dest)++) = byte;
+    else {
+        *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
+        *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
+    }
+}
+
+/*
+=for apidoc valid_utf8_to_uvchr
+Like C<L</utf8_to_uvchr_buf>>, but should only be called when it is known that
+the next character in the input UTF-8 string C<s> is well-formed (I<e.g.>,
+it passes C<L</isUTF8_CHAR>>.  Surrogates, non-character code points, and
+non-Unicode code points are allowed.
+
+=cut
+
+ */
+
+PERL_STATIC_INLINE UV
+Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
+{
+    const UV expectlen = UTF8SKIP(s);
+    const U8* send = s + expectlen;
+    UV uv = *s;
+
+    PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
+
+    if (retlen) {
+        *retlen = expectlen;
+    }
+
+    /* An invariant is trivially returned */
+    if (expectlen == 1) {
+       return uv;
+    }
+
+    /* Remove the leading bits that indicate the number of bytes, leaving just
+     * the bits that are part of the value */
+    uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
+
+    /* Now, loop through the remaining bytes, accumulating each into the
+     * working total as we go.  (I khw tried unrolling the loop for up to 4
+     * bytes, but there was no performance improvement) */
+    for (++s; s < send; s++) {
+        uv = UTF8_ACCUMULATE(uv, *s);
+    }
+
+    return UNI_TO_NATIVE(uv);
+
+}
+
+/*
+=for apidoc is_utf8_invariant_string
+
+Returns TRUE if the first C<len> bytes of the string C<s> are the same
+regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
+EBCDIC machines); otherwise it returns FALSE.  That is, it returns TRUE if they
+are UTF-8 invariant.  On ASCII-ish machines, all the ASCII characters and only
+the ASCII characters fit this definition.  On EBCDIC machines, the ASCII-range
+characters are invariant, but so also are the C1 controls.
+
+If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
+use this option, that C<s> can't have embedded C<NUL> characters and has to
+have a terminating C<NUL> byte).
+
+See also
+C<L</is_utf8_string>>,
+C<L</is_utf8_string_flags>>,
+C<L</is_utf8_string_loc>>,
+C<L</is_utf8_string_loc_flags>>,
+C<L</is_utf8_string_loclen>>,
+C<L</is_utf8_string_loclen_flags>>,
+C<L</is_utf8_fixed_width_buf_flags>>,
+C<L</is_utf8_fixed_width_buf_loc_flags>>,
+C<L</is_utf8_fixed_width_buf_loclen_flags>>,
+C<L</is_strict_utf8_string>>,
+C<L</is_strict_utf8_string_loc>>,
+C<L</is_strict_utf8_string_loclen>>,
+C<L</is_c9strict_utf8_string>>,
+C<L</is_c9strict_utf8_string_loc>>,
+and
+C<L</is_c9strict_utf8_string_loclen>>.
+
+=cut
+
+*/
+
+#define is_utf8_invariant_string(s, len)                                    \
+                                is_utf8_invariant_string_loc(s, len, NULL)
+
+/*
+=for apidoc is_utf8_invariant_string_loc
+
+Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
+the first UTF-8 variant character in the C<ep> pointer; if all characters are
+UTF-8 invariant, this function does not change the contents of C<*ep>.
+
+=cut
+
+*/
+
+PERL_STATIC_INLINE bool
+S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
+{
+    const U8* send;
+    const U8* x = s;
+
+    PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
+
+    if (len == 0) {
+        len = strlen((const char *)s);
+    }
+
+    send = s + len;
+
+/* This looks like 0x010101... */
+#  define PERL_COUNT_MULTIPLIER   (~ (UINTMAX_C(0)) / 0xFF)
+
+/* This looks like 0x808080... */
+#  define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
+#  define PERL_WORDSIZE            sizeof(PERL_UINTMAX_T)
+#  define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
+
+/* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
+ * or'ing together the lowest bits of 'x'.  Hopefully the final term gets
+ * optimized out completely on a 32-bit system, and its mask gets optimized out
+ * on a 64-bit system */
+#  define PERL_IS_SUBWORD_ADDR(x) (1 & (       PTR2nat(x)                     \
+                                      |   (  PTR2nat(x) >> 1)                 \
+                                      | ( ( (PTR2nat(x)                       \
+                                           & PERL_WORD_BOUNDARY_MASK) >> 2))))
+
+#ifndef EBCDIC
+
+    /* Do the word-at-a-time iff there is at least one usable full word.  That
+     * means that after advancing to a word boundary, there still is at least a
+     * full word left.  The number of bytes needed to advance is 'wordsize -
+     * offset' unless offset is 0. */
+    if ((STRLEN) (send - x) >= PERL_WORDSIZE
+
+                            /* This term is wordsize if subword; 0 if not */
+                          + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
+
+                            /* 'offset' */
+                          - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
+    {
+
+        /* Process per-byte until reach word boundary.  XXX This loop could be
+         * eliminated if we knew that this platform had fast unaligned reads */
+        while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
+            if (! UTF8_IS_INVARIANT(*x)) {
+                if (ep) {
+                    *ep = x;
+                }
+
+                return FALSE;
+            }
+            x++;
+        }
+
+        /* Here, we know we have at least one full word to process.  Process
+         * per-word as long as we have at least a full word left */
+        do {
+            if ((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK)  {
+
+                /* Found a variant.  Just return if caller doesn't want its
+                 * exact position */
+                if (! ep) {
+                    return FALSE;
+                }
+
+#  if   BYTEORDER == 0x1234 || BYTEORDER == 0x12345678    \
+     || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
+
+                *ep = x + _variant_byte_number(* (PERL_UINTMAX_T *) x);
+                assert(*ep >= s && *ep < send);
+
+                return FALSE;
+
+#  else   /* If weird byte order, drop into next loop to do byte-at-a-time
+           checks. */
+
+                break;
+#  endif
+            }
+
+            x += PERL_WORDSIZE;
+
+        } while (x + PERL_WORDSIZE <= send);
+    }
+
+#endif      /* End of ! EBCDIC */
+
+    /* Process per-byte */
+    while (x < send) {
+       if (! UTF8_IS_INVARIANT(*x)) {
+            if (ep) {
+                *ep = x;
+            }
+
+            return FALSE;
+        }
+
+        x++;
+    }
+
+    return TRUE;
+}
+
+#ifndef EBCDIC
+
+PERL_STATIC_INLINE unsigned int
+S__variant_byte_number(PERL_UINTMAX_T word)
+{
+
+    /* This returns the position in a word (0..7) of the first variant byte in
+     * it.  This is a helper function.  Note that there are no branches */
+
+    assert(word);
+
+    /* Get just the msb bits of each byte */
+    word &= PERL_VARIANTS_WORD_MASK;
+
+#  ifdef USING_MSVC6    /* VC6 has some issues with the normal code, and the
+                           easiest thing is to hide that from the callers */
+    {
+        unsigned int i;
+        const U8 * s = (U8 *) &word;
+        dTHX;
+
+        for (i = 0; i < sizeof(word); i++ ) {
+            if (s[i]) {
+                return i;
+            }
+        }
+
+        Perl_croak(aTHX_ "panic: %s: %d: unexpected zero word\n",
+                                 __FILE__, __LINE__);
+    }
+
+#  elif BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
+
+    /* Bytes are stored like
+     *  Byte8 ... Byte2 Byte1
+     *  63..56...15...8 7...0
+     *
+     *  Isolate the lsb;
+     * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
+     *
+     * The word will look this this, with a rightmost set bit in position 's':
+     * ('x's are don't cares)
+     *      s
+     *  x..x100..0
+     *  x..xx10..0      Right shift (rightmost 0 is shifted off)
+     *  x..xx01..1      Subtract 1, turns all the trailing zeros into 1's and
+     *                  the 1 just to their left into a 0; the remainder is
+     *                  untouched
+     *  0..0011..1      The xor with x..xx10..0 clears that remainder, sets
+     *                  bottom to all 1
+     *  0..0100..0      Add 1 to clear the word except for the bit in 's'
+     *
+     * Another method is to do 'word &= -word'; but it generates a compiler
+     * message on some platforms about taking the negative of an unsigned */
+
+    word >>= 1;
+    word = 1 + (word ^ (word - 1));
+
+#  elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
+
+    /* Bytes are stored like
+     *  Byte1 Byte2  ... Byte8
+     * 63..56 55..47 ... 7...0
+     *
+     * Isolate the msb; http://codeforces.com/blog/entry/10330
+     *
+     * Only the most significant set bit matters.  Or'ing word with its right
+     * shift of 1 makes that bit and the next one to its right both 1.  Then
+     * right shifting by 2 makes for 4 1-bits in a row. ...  We end with the
+     * msb and all to the right being 1. */
+    word |= word >>  1;
+    word |= word >>  2;
+    word |= word >>  4;
+    word |= word >>  8;
+    word |= word >> 16;
+    word |= word >> 32;  /* This should get optimized out on 32-bit systems. */
+
+    /* Then subtracting the right shift by 1 clears all but the left-most of
+     * the 1 bits, which is our desired result */
+    word -= (word >> 1);
+
+#  else
+#    error Unexpected byte order
+#  endif
+
+    /* Here 'word' has a single bit set: the  msb of the first byte in which it
+     * is set.  Calculate that position in the word.  We can use this
+     * specialized solution: https://stackoverflow.com/a/32339674/1626653,
+     * assumes an 8-bit byte.  (On a 32-bit machine, the larger numbers should
+     * just get shifted off at compile time) */
+    word = (word >> 7) * ((UINTMAX_C( 7) << 56) | (UINTMAX_C(15) << 48)
+                        | (UINTMAX_C(23) << 40) | (UINTMAX_C(31) << 32)
+                        |           (39 <<  24) |           (47 <<  16)
+                        |           (55 <<   8) |           (63 <<   0));
+    word >>= PERL_WORDSIZE * 7; /* >> by either 56 or 24 */
+
+    /* Here, word contains the position 7..63 of that bit.  Convert to 0..7 */
+    word = ((word + 1) >> 3) - 1;
+
+#  if BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
+
+    /* And invert the result */
+    word = CHARBITS - word - 1;
+
+#  endif
+
+    return (unsigned int) word;
+}
+
+#endif
+#if defined(PERL_CORE) || defined(PERL_EXT)
+
+/*
+=for apidoc variant_under_utf8_count
+
+This function looks at the sequence of bytes between C<s> and C<e>, which are
+assumed to be encoded in ASCII/Latin1, and returns how many of them would
+change should the string be translated into UTF-8.  Due to the nature of UTF-8,
+each of these would occupy two bytes instead of the single one in the input
+string.  Thus, this function returns the precise number of bytes the string
+would expand by when translated to UTF-8.
+
+Unlike most of the other functions that have C<utf8> in their name, the input
+to this function is NOT a UTF-8-encoded string.  The function name is slightly
+I<odd> to emphasize this.
+
+This function is internal to Perl because khw thinks that any XS code that
+would want this is probably operating too close to the internals.  Presenting a
+valid use case could change that.
+
+See also
+C<L<perlapi/is_utf8_invariant_string>>
+and
+C<L<perlapi/is_utf8_invariant_string_loc>>,
+
+=cut
+
+*/
+
+PERL_STATIC_INLINE Size_t
+S_variant_under_utf8_count(const U8* const s, const U8* const e)
+{
+    const U8* x = s;
+    Size_t count = 0;
+
+    PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
+
+#  ifndef EBCDIC
+
+    /* Test if the string is long enough to use word-at-a-time.  (Logic is the
+     * same as for is_utf8_invariant_string()) */
+    if ((STRLEN) (e - x) >= PERL_WORDSIZE
+                          + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
+                          - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
+    {
+
+        /* Process per-byte until reach word boundary.  XXX This loop could be
+         * eliminated if we knew that this platform had fast unaligned reads */
+        while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
+            count += ! UTF8_IS_INVARIANT(*x++);
+        }
+
+        /* Process per-word as long as we have at least a full word left */
+        do {    /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
+                   explanation of how this works */
+            PERL_UINTMAX_T increment
+                = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
+                      * PERL_COUNT_MULTIPLIER)
+                    >> ((PERL_WORDSIZE - 1) * CHARBITS);
+            count += (Size_t) increment;
+            x += PERL_WORDSIZE;
+        } while (x + PERL_WORDSIZE <= e);
+    }
+
+#  endif
+
+    /* Process per-byte */
+    while (x < e) {
+       if (! UTF8_IS_INVARIANT(*x)) {
+            count++;
+        }
+
+        x++;
+    }
+
+    return count;
+}
+
+#endif
+
+#ifndef PERL_IN_REGEXEC_C   /* Keep  these around for that file */
+#  undef PERL_WORDSIZE
+#  undef PERL_COUNT_MULTIPLIER
+#  undef PERL_WORD_BOUNDARY_MASK
+#  undef PERL_VARIANTS_WORD_MASK
+#endif
+
+/*
+=for apidoc is_utf8_string
+
+Returns TRUE if the first C<len> bytes of string C<s> form a valid
+Perl-extended-UTF-8 string; returns FALSE otherwise.  If C<len> is 0, it will
+be calculated using C<strlen(s)> (which means if you use this option, that C<s>
+can't have embedded C<NUL> characters and has to have a terminating C<NUL>
+byte).  Note that all characters being ASCII constitute 'a valid UTF-8 string'.
+
+This function considers Perl's extended UTF-8 to be valid.  That means that
+code points above Unicode, surrogates, and non-character code points are
+considered valid by this function.  Use C<L</is_strict_utf8_string>>,
+C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
+code points are considered valid.
+
+See also
+C<L</is_utf8_invariant_string>>,
+C<L</is_utf8_invariant_string_loc>>,
+C<L</is_utf8_string_loc>>,
+C<L</is_utf8_string_loclen>>,
+C<L</is_utf8_fixed_width_buf_flags>>,
+C<L</is_utf8_fixed_width_buf_loc_flags>>,
+C<L</is_utf8_fixed_width_buf_loclen_flags>>,
+
+=cut
+*/
+
+#define is_utf8_string(s, len)  is_utf8_string_loclen(s, len, NULL, NULL)
+
+#if defined(PERL_CORE) || defined (PERL_EXT)
+
+/*
+=for apidoc is_utf8_non_invariant_string
+
+Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
+C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
+UTF-8; otherwise returns FALSE.
+
+A TRUE return means that at least one code point represented by the sequence
+either is a wide character not representable as a single byte, or the
+representation differs depending on whether the sequence is encoded in UTF-8 or
+not.
+
+See also
+C<L<perlapi/is_utf8_invariant_string>>,
+C<L<perlapi/is_utf8_string>>
+
+=cut
+
+This is commonly used to determine if a SV's UTF-8 flag should be turned on.
+It generally needn't be if its string is entirely UTF-8 invariant, and it
+shouldn't be if it otherwise contains invalid UTF-8.
+
+It is an internal function because khw thinks that XS code shouldn't be working
+at this low a level.  A valid use case could change that.
+
+*/
+
+PERL_STATIC_INLINE bool
+S_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
+{
+    const U8 * first_variant;
+
+    PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
+
+    if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
+        return FALSE;
+    }
+
+    return is_utf8_string(first_variant, len - (first_variant - s));
+}
+
+#endif
+
+/*
+=for apidoc is_strict_utf8_string
+
+Returns TRUE if the first C<len> bytes of string C<s> form a valid
+UTF-8-encoded string that is fully interchangeable by any application using
+Unicode rules; otherwise it returns FALSE.  If C<len> is 0, it will be
+calculated using C<strlen(s)> (which means if you use this option, that C<s>
+can't have embedded C<NUL> characters and has to have a terminating C<NUL>
+byte).  Note that all characters being ASCII constitute 'a valid UTF-8 string'.
+
+This function returns FALSE for strings containing any
+code points above the Unicode max of 0x10FFFF, surrogate code points, or
+non-character code points.
+
+See also
+C<L</is_utf8_invariant_string>>,
+C<L</is_utf8_invariant_string_loc>>,
+C<L</is_utf8_string>>,
+C<L</is_utf8_string_flags>>,
+C<L</is_utf8_string_loc>>,
+C<L</is_utf8_string_loc_flags>>,
+C<L</is_utf8_string_loclen>>,
+C<L</is_utf8_string_loclen_flags>>,
+C<L</is_utf8_fixed_width_buf_flags>>,
+C<L</is_utf8_fixed_width_buf_loc_flags>>,
+C<L</is_utf8_fixed_width_buf_loclen_flags>>,
+C<L</is_strict_utf8_string_loc>>,
+C<L</is_strict_utf8_string_loclen>>,
+C<L</is_c9strict_utf8_string>>,
+C<L</is_c9strict_utf8_string_loc>>,
+and
+C<L</is_c9strict_utf8_string_loclen>>.
+
+=cut
+*/
+
+#define is_strict_utf8_string(s, len)  is_strict_utf8_string_loclen(s, len, NULL, NULL)
+
+/*
+=for apidoc is_c9strict_utf8_string
+
+Returns TRUE if the first C<len> bytes of string C<s> form a valid
+UTF-8-encoded string that conforms to
+L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
+otherwise it returns FALSE.  If C<len> is 0, it will be calculated using
+C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
+C<NUL> characters and has to have a terminating C<NUL> byte).  Note that all
+characters being ASCII constitute 'a valid UTF-8 string'.
+
+This function returns FALSE for strings containing any code points above the
+Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
+code points per
+L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
+
+See also
+C<L</is_utf8_invariant_string>>,
+C<L</is_utf8_invariant_string_loc>>,
+C<L</is_utf8_string>>,
+C<L</is_utf8_string_flags>>,
+C<L</is_utf8_string_loc>>,
+C<L</is_utf8_string_loc_flags>>,
+C<L</is_utf8_string_loclen>>,
+C<L</is_utf8_string_loclen_flags>>,
+C<L</is_utf8_fixed_width_buf_flags>>,
+C<L</is_utf8_fixed_width_buf_loc_flags>>,
+C<L</is_utf8_fixed_width_buf_loclen_flags>>,
+C<L</is_strict_utf8_string>>,
+C<L</is_strict_utf8_string_loc>>,
+C<L</is_strict_utf8_string_loclen>>,
+C<L</is_c9strict_utf8_string_loc>>,
+and
+C<L</is_c9strict_utf8_string_loclen>>.
+
+=cut
+*/
+
+#define is_c9strict_utf8_string(s, len)  is_c9strict_utf8_string_loclen(s, len, NULL, 0)
+
+/*
+=for apidoc is_utf8_string_flags
+
+Returns TRUE if the first C<len> bytes of string C<s> form a valid
+UTF-8 string, subject to the restrictions imposed by C<flags>;
+returns FALSE otherwise.  If C<len> is 0, it will be calculated
+using C<strlen(s)> (which means if you use this option, that C<s> can't have
+embedded C<NUL> characters and has to have a terminating C<NUL> byte).  Note
+that all characters being ASCII constitute 'a valid UTF-8 string'.
+
+If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
+C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
+as C<L</is_strict_utf8_string>>; and if C<flags> is
+C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
+C<L</is_c9strict_utf8_string>>.  Otherwise C<flags> may be any
+combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
+C<L</utf8n_to_uvchr>>, with the same meanings.
+
+See also
+C<L</is_utf8_invariant_string>>,
+C<L</is_utf8_invariant_string_loc>>,
+C<L</is_utf8_string>>,
+C<L</is_utf8_string_loc>>,
+C<L</is_utf8_string_loc_flags>>,
+C<L</is_utf8_string_loclen>>,
+C<L</is_utf8_string_loclen_flags>>,
+C<L</is_utf8_fixed_width_buf_flags>>,
+C<L</is_utf8_fixed_width_buf_loc_flags>>,
+C<L</is_utf8_fixed_width_buf_loclen_flags>>,
+C<L</is_strict_utf8_string>>,
+C<L</is_strict_utf8_string_loc>>,
+C<L</is_strict_utf8_string_loclen>>,
+C<L</is_c9strict_utf8_string>>,
+C<L</is_c9strict_utf8_string_loc>>,
+and
+C<L</is_c9strict_utf8_string_loclen>>.
+
+=cut
+*/
+
+PERL_STATIC_INLINE bool
+S_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
+{
+    const U8 * first_variant;
+
+    PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
+    assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+                          |UTF8_DISALLOW_PERL_EXTENDED)));
+
+    if (len == 0) {
+        len = strlen((const char *)s);
+    }
+
+    if (flags == 0) {
+        return is_utf8_string(s, len);
+    }
+
+    if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
+                                        == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
+    {
+        return is_strict_utf8_string(s, len);
+    }
+
+    if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
+                                       == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
+    {
+        return is_c9strict_utf8_string(s, len);
+    }
+
+    if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
+        const U8* const send = s + len;
+        const U8* x = first_variant;
+
+        while (x < send) {
+            STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
+            if (UNLIKELY(! cur_len)) {
+                return FALSE;
+            }
+            x += cur_len;
+        }
+    }
+
+    return TRUE;
+}
+
+/*
+
+=for apidoc is_utf8_string_loc
+
+Like C<L</is_utf8_string>> but stores the location of the failure (in the
+case of "utf8ness failure") or the location C<s>+C<len> (in the case of
+"utf8ness success") in the C<ep> pointer.
+
+See also C<L</is_utf8_string_loclen>>.
+
+=cut
+*/
+
+#define is_utf8_string_loc(s, len, ep)  is_utf8_string_loclen(s, len, ep, 0)
+
+/*
+
+=for apidoc is_utf8_string_loclen
+
+Like C<L</is_utf8_string>> but stores the location of the failure (in the
+case of "utf8ness failure") or the location C<s>+C<len> (in the case of
+"utf8ness success") in the C<ep> pointer, and the number of UTF-8
+encoded characters in the C<el> pointer.
+
+See also C<L</is_utf8_string_loc>>.
+
+=cut
+*/
+
+PERL_STATIC_INLINE bool
+Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
+{
+    const U8 * first_variant;
+
+    PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
+
+    if (len == 0) {
+        len = strlen((const char *) s);
+    }
+
+    if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
+        if (el)
+            *el = len;
+
+        if (ep) {
+            *ep = s + len;
+        }
+
+        return TRUE;
+    }
+
+    {
+        const U8* const send = s + len;
+        const U8* x = first_variant;
+        STRLEN outlen = first_variant - s;
+
+        while (x < send) {
+            const STRLEN cur_len = isUTF8_CHAR(x, send);
+            if (UNLIKELY(! cur_len)) {
+                break;
+            }
+            x += cur_len;
+            outlen++;
+        }
+
+        if (el)
+            *el = outlen;
+
+        if (ep) {
+            *ep = x;
+        }
+
+        return (x == send);
+    }
+}
+
+/*
+
+=for apidoc Am|STRLEN|isUTF8_CHAR|const U8 *s|const U8 *e
+
+Evaluates to non-zero if the first few bytes of the string starting at C<s> and
+looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
+that represents some code point; otherwise it evaluates to 0.  If non-zero, the
+value gives how many bytes starting at C<s> comprise the code point's
+representation.  Any bytes remaining before C<e>, but beyond the ones needed to
+form the first code point in C<s>, are not examined.
+
+The code point can be any that will fit in an IV on this machine, using Perl's
+extension to official UTF-8 to represent those higher than the Unicode maximum
+of 0x10FFFF.  That means that this macro is used to efficiently decide if the
+next few bytes in C<s> is legal UTF-8 for a single character.
+
+Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
+defined by Unicode to be fully interchangeable across applications;
+C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
+#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
+code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
+
+Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
+C<L</is_utf8_string_loclen>> to check entire strings.
+
+Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
+machines) is a valid UTF-8 character.
+
+=cut
+
+This uses an adaptation of the table and algorithm given in
+http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
+documentation of the original version.  A copyright notice for the original
+version is given at the beginning of this file.  The Perl adapation is
+documented at the definition of PL_extended_utf8_dfa_tab[].
+
+*/
+
+PERL_STATIC_INLINE Size_t
+S_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
+{
+    const U8 * s = s0;
+    UV state = 0;
+
+    PERL_ARGS_ASSERT_ISUTF8_CHAR;
+
+    /* This dfa is fast.  If it accepts the input, it was for a well-formed,
+     * code point, which can be returned immediately.  Otherwise, it is either
+     * malformed, or for the start byte FF which the dfa doesn't handle (except
+     * on 32-bit ASCII platforms where it trivially is an error).  Call a
+     * helper function for the other platforms. */
+
+    while (s < e && LIKELY(state != 1)) {
+        state = PL_extended_utf8_dfa_tab[256
+                                         + state
+                                         + PL_extended_utf8_dfa_tab[*s]];
+        if (state != 0) {
+            s++;
+            continue;
+        }
+
+        return s - s0 + 1;
+    }
+
+#if defined(UV_IS_QUAD) || defined(EBCDIC)
+
+    if (NATIVE_UTF8_TO_I8(*s0) == 0xFF && e - s0 >= UTF8_MAXBYTES) {
+       return _is_utf8_char_helper(s0, e, 0);
+    }
+
+#endif
+
+    return 0;
+}
+
+/*
+
+=for apidoc isSTRICT_UTF8_CHAR
+
+Evaluates to non-zero if the first few bytes of the string starting at C<s> and
+looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
+Unicode code point completely acceptable for open interchange between all
+applications; otherwise it evaluates to 0.  If non-zero, the value gives how
+many bytes starting at C<s> comprise the code point's representation.  Any
+bytes remaining before C<e>, but beyond the ones needed to form the first code
+point in C<s>, are not examined.
+
+The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
+be a surrogate nor a non-character code point.  Thus this excludes any code
+point from Perl's extended UTF-8.
+
+This is used to efficiently decide if the next few bytes in C<s> is
+legal Unicode-acceptable UTF-8 for a single character.
+
+Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
+#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
+code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
+and C<L</isUTF8_CHAR_flags>> for a more customized definition.
+
+Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
+C<L</is_strict_utf8_string_loclen>> to check entire strings.
+
+=cut
+
+This uses an adaptation of the tables and algorithm given in
+http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
+documentation of the original version.  A copyright notice for the original
+version is given at the beginning of this file.  The Perl adapation is
+documented at the definition of strict_extended_utf8_dfa_tab[].
+
+*/
+
+PERL_STATIC_INLINE Size_t
+S_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
+{
+    const U8 * s = s0;
+    UV state = 0;
+
+    PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
+
+    while (s < e && LIKELY(state != 1)) {
+        state = PL_strict_utf8_dfa_tab[256 + state + PL_strict_utf8_dfa_tab[*s]];
+
+        if (state != 0) {
+            s++;
+            continue;
+        }
+
+        return s - s0 + 1;
+    }
+
+#ifndef EBCDIC
+
+    /* The dfa above drops out for certain Hanguls; handle them specially */
+    if (is_HANGUL_ED_utf8_safe(s0, e)) {
+        return 3;
+    }
+
+#endif
+
+    return 0;
+}
+
+/*
+
+=for apidoc Am|STRLEN|isC9_STRICT_UTF8_CHAR|const U8 *s|const U8 *e
+
+Evaluates to non-zero if the first few bytes of the string starting at C<s> and
+looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
+Unicode non-surrogate code point; otherwise it evaluates to 0.  If non-zero,
+the value gives how many bytes starting at C<s> comprise the code point's
+representation.  Any bytes remaining before C<e>, but beyond the ones needed to
+form the first code point in C<s>, are not examined.
+
+The largest acceptable code point is the Unicode maximum 0x10FFFF.  This
+differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
+code points.  This corresponds to
+L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
+which said that non-character code points are merely discouraged rather than
+completely forbidden in open interchange.  See
+L<perlunicode/Noncharacter code points>.
+
+Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
+C<L</isUTF8_CHAR_flags>> for a more customized definition.
+
+Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
+C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
+
+=cut
+
+This uses an adaptation of the tables and algorithm given in
+http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
+documentation of the original version.  A copyright notice for the original
+version is given at the beginning of this file.  The Perl adapation is
+documented at the definition of PL_c9_utf8_dfa_tab[].
+
+*/
+
+PERL_STATIC_INLINE Size_t
+S_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
+{
+    const U8 * s = s0;
+    UV state = 0;
+
+    PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
+
+    while (s < e && LIKELY(state != 1)) {
+        state = PL_c9_utf8_dfa_tab[256 + state + PL_c9_utf8_dfa_tab[*s]];
+
+        if (state != 0) {
+            s++;
+            continue;
+        }
+
+        return s - s0 + 1;
+    }
+
+    return 0;
+}
+
+/*
+
+=for apidoc is_strict_utf8_string_loc
+
+Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
+case of "utf8ness failure") or the location C<s>+C<len> (in the case of
+"utf8ness success") in the C<ep> pointer.
+
+See also C<L</is_strict_utf8_string_loclen>>.
+
+=cut
+*/
+
+#define is_strict_utf8_string_loc(s, len, ep)                               \
+                                is_strict_utf8_string_loclen(s, len, ep, 0)
+
+/*
+
+=for apidoc is_strict_utf8_string_loclen
+
+Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
+case of "utf8ness failure") or the location C<s>+C<len> (in the case of
+"utf8ness success") in the C<ep> pointer, and the number of UTF-8
+encoded characters in the C<el> pointer.
+
+See also C<L</is_strict_utf8_string_loc>>.
+
+=cut
+*/
+
+PERL_STATIC_INLINE bool
+S_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
+{
+    const U8 * first_variant;
+
+    PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
+
+    if (len == 0) {
+        len = strlen((const char *) s);
+    }
+
+    if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
+        if (el)
+            *el = len;
+
+        if (ep) {
+            *ep = s + len;
+        }
+
+        return TRUE;
+    }
+
+    {
+        const U8* const send = s + len;
+        const U8* x = first_variant;
+        STRLEN outlen = first_variant - s;
+
+        while (x < send) {
+            const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
+            if (UNLIKELY(! cur_len)) {
+                break;
+            }
+            x += cur_len;
+            outlen++;
+        }
+
+        if (el)
+            *el = outlen;
+
+        if (ep) {
+            *ep = x;
+        }
+
+        return (x == send);
+    }
+}
+
+/*
+
+=for apidoc is_c9strict_utf8_string_loc
+
+Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
+the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
+"utf8ness success") in the C<ep> pointer.
+
+See also C<L</is_c9strict_utf8_string_loclen>>.
+
+=cut
+*/
+
+#define is_c9strict_utf8_string_loc(s, len, ep)                                    \
+                            is_c9strict_utf8_string_loclen(s, len, ep, 0)
+
+/*
+
+=for apidoc is_c9strict_utf8_string_loclen
+
+Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
+the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
+"utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
+characters in the C<el> pointer.
+
+See also C<L</is_c9strict_utf8_string_loc>>.
+
+=cut
+*/
+
+PERL_STATIC_INLINE bool
+S_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
+{
+    const U8 * first_variant;
+
+    PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
+
+    if (len == 0) {
+        len = strlen((const char *) s);
+    }
+
+    if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
+        if (el)
+            *el = len;
+
+        if (ep) {
+            *ep = s + len;
+        }
+
+        return TRUE;
+    }
+
+    {
+        const U8* const send = s + len;
+        const U8* x = first_variant;
+        STRLEN outlen = first_variant - s;
+
+        while (x < send) {
+            const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
+            if (UNLIKELY(! cur_len)) {
+                break;
+            }
+            x += cur_len;
+            outlen++;
+        }
+
+        if (el)
+            *el = outlen;
+
+        if (ep) {
+            *ep = x;
+        }
+
+        return (x == send);
+    }
+}
+
+/*
+
+=for apidoc is_utf8_string_loc_flags
+
+Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
+case of "utf8ness failure") or the location C<s>+C<len> (in the case of
+"utf8ness success") in the C<ep> pointer.
+
+See also C<L</is_utf8_string_loclen_flags>>.
+
+=cut
+*/
+
+#define is_utf8_string_loc_flags(s, len, ep, flags)                         \
+                        is_utf8_string_loclen_flags(s, len, ep, 0, flags)
+
+
+/* The above 3 actual functions could have been moved into the more general one
+ * just below, and made #defines that call it with the right 'flags'.  They are
+ * currently kept separate to increase their chances of getting inlined */
+
+/*
+
+=for apidoc is_utf8_string_loclen_flags
+
+Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
+case of "utf8ness failure") or the location C<s>+C<len> (in the case of
+"utf8ness success") in the C<ep> pointer, and the number of UTF-8
+encoded characters in the C<el> pointer.
+
+See also C<L</is_utf8_string_loc_flags>>.
+
+=cut
+*/
+
+PERL_STATIC_INLINE bool
+S_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
+{
+    const U8 * first_variant;
+
+    PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
+    assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+                          |UTF8_DISALLOW_PERL_EXTENDED)));
+
+    if (len == 0) {
+        len = strlen((const char *) s);
+    }
+
+    if (flags == 0) {
+        return is_utf8_string_loclen(s, len, ep, el);
+    }
+
+    if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
+                                        == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
+    {
+        return is_strict_utf8_string_loclen(s, len, ep, el);
+    }
+
+    if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
+                                    == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
+    {
+        return is_c9strict_utf8_string_loclen(s, len, ep, el);
+    }
+
+    if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
+        if (el)
+            *el = len;
+
+        if (ep) {
+            *ep = s + len;
+        }
+
+        return TRUE;
+    }
+
+    {
+        const U8* send = s + len;
+        const U8* x = first_variant;
+        STRLEN outlen = first_variant - s;
+
+        while (x < send) {
+            const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
+            if (UNLIKELY(! cur_len)) {
+                break;
+            }
+            x += cur_len;
+            outlen++;
+        }
+
+        if (el)
+            *el = outlen;
+
+        if (ep) {
+            *ep = x;
+        }
+
+        return (x == send);
+    }
+}
+
+/*
+=for apidoc utf8_distance
+
+Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
+and C<b>.
+
+WARNING: use only if you *know* that the pointers point inside the
+same UTF-8 buffer.
+
+=cut
+*/
+
+PERL_STATIC_INLINE IV
+Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
+{
+    PERL_ARGS_ASSERT_UTF8_DISTANCE;
+
+    return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
+}
  
-/* ------------------------------- handy.h ------------------------------- */
+/*
+=for apidoc utf8_hop
  
-/* saves machine code for a common noreturn idiom typically used in Newx*() */
-#ifdef GCC_DIAG_PRAGMA
-GCC_DIAG_IGNORE(-Wunused-function) /* Intentionally left semicolonless. */
-#endif
-static void
-S_croak_memory_wrap(void)
-{
-    Perl_croak_nocontext("%s",PL_memory_wrap);
-}
-#ifdef GCC_DIAG_PRAGMA
-GCC_DIAG_RESTORE /* Intentionally left semicolonless. */
-#endif
+Return the UTF-8 pointer C<s> displaced by C<off> characters, either
+forward or backward.
  
-/* ------------------------------- utf8.h ------------------------------- */
+WARNING: do not use the following unless you *know* C<off> is within
+the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
+on the first byte of character or just after the last byte of a character.
  
-/*
-=head1 Unicode Support
+=cut
  */
  
-PERL_STATIC_INLINE void
-S_append_utf8_from_native_byte(const U8 byte, U8** dest)
+PERL_STATIC_INLINE U8 *
+Perl_utf8_hop(const U8 *s, SSize_t off)
  {
-    /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
-     * encoded string at '*dest', updating '*dest' to include it */
+    PERL_ARGS_ASSERT_UTF8_HOP;
  
-    PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
+    /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
+     * the bitops (especially ~) can create illegal UTF-8.
+     * In other words: in Perl UTF-8 is not just for Unicode. */
  
-    if (NATIVE_BYTE_IS_INVARIANT(byte))
-        *(*dest)++ = byte;
+    if (off >= 0) {
+       while (off--)
+           s += UTF8SKIP(s);
+    }
      else {
-        *(*dest)++ = UTF8_EIGHT_BIT_HI(byte);
-        *(*dest)++ = UTF8_EIGHT_BIT_LO(byte);
+       while (off++) {
+           s--;
+           while (UTF8_IS_CONTINUATION(*s))
+               s--;
+       }
      }
+    GCC_DIAG_IGNORE(-Wcast-qual)
+    return (U8 *)s;
+    GCC_DIAG_RESTORE
  }
  
  /*
+=for apidoc utf8_hop_forward
  
-A helper function for the macro isUTF8_CHAR(), which should be used instead of
-this function.  The macro will handle smaller code points directly saving time,
-using this function as a fall-back for higher code points.
+Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
+forward.
  
-Tests if the first bytes of string C<s> form a valid UTF-8 character.  0 is
-returned if the bytes starting at C<s> up to but not including C<e> do not form a
-complete well-formed UTF-8 character; otherwise the number of bytes in the
-character is returned.
+C<off> must be non-negative.
  
-Note that an INVARIANT (i.e. ASCII on non-EBCDIC) character is a valid UTF-8
-character.
+C<s> must be before or equal to C<end>.
  
-=cut */
-PERL_STATIC_INLINE STRLEN
-S__is_utf8_char_slow(const U8 *s, const U8 *e)
+When moving forward it will not move beyond C<end>.
+
+Will not exceed this limit even if the string is not valid "UTF-8".
+
+=cut
+*/
+
+PERL_STATIC_INLINE U8 *
+Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
  {
-    dTHX;   /* The function called below requires thread context */
+    PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
  
-    STRLEN actual_len;
+    /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
+     * the bitops (especially ~) can create illegal UTF-8.
+     * In other words: in Perl UTF-8 is not just for Unicode. */
  
-    PERL_ARGS_ASSERT__IS_UTF8_CHAR_SLOW;
+    assert(s <= end);
+    assert(off >= 0);
  
-    assert(e >= s);
-    utf8n_to_uvchr(s, e - s, &actual_len, UTF8_CHECK_ONLY);
+    while (off--) {
+        STRLEN skip = UTF8SKIP(s);
+        if ((STRLEN)(end - s) <= skip) {
+            GCC_DIAG_IGNORE(-Wcast-qual)
+            return (U8 *)end;
+            GCC_DIAG_RESTORE
+        }
+        s += skip;
+    }
  
-    return (actual_len == (STRLEN) -1) ? 0 : actual_len;
+    GCC_DIAG_IGNORE(-Wcast-qual)
+    return (U8 *)s;
+    GCC_DIAG_RESTORE
  }
  
  /*
-=for apidoc valid_utf8_to_uvchr
-Like L</utf8_to_uvchr_buf>(), but should only be called when it is known that
-the next character in the input UTF-8 string C<s> is well-formed (I<e.g.>,
-it passes C<L</isUTF8_CHAR>>.  Surrogates, non-character code points, and
-non-Unicode code points are allowed.
+=for apidoc utf8_hop_back
  
-=cut
+Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
+backward.
  
- */
+C<off> must be non-positive.
  
-PERL_STATIC_INLINE UV
-Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
-{
-    UV expectlen = UTF8SKIP(s);
-    const U8* send = s + expectlen;
-    UV uv = *s;
+C<s> must be after or equal to C<start>.
  
-    PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
+When moving backward it will not move before C<start>.
  
-    if (retlen) {
-        *retlen = expectlen;
-    }
+Will not exceed this limit even if the string is not valid "UTF-8".
  
-    /* An invariant is trivially returned */
-    if (expectlen == 1) {
-       return uv;
-    }
+=cut
+*/
  
-    /* Remove the leading bits that indicate the number of bytes, leaving just
-     * the bits that are part of the value */
-    uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
+PERL_STATIC_INLINE U8 *
+Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
+{
+    PERL_ARGS_ASSERT_UTF8_HOP_BACK;
  
-    /* Now, loop through the remaining bytes, accumulating each into the
-     * working total as we go.  (I khw tried unrolling the loop for up to 4
-     * bytes, but there was no performance improvement) */
-    for (++s; s < send; s++) {
-        uv = UTF8_ACCUMULATE(uv, *s);
-    }
+    /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
+     * the bitops (especially ~) can create illegal UTF-8.
+     * In other words: in Perl UTF-8 is not just for Unicode. */
  
-    return UNI_TO_NATIVE(uv);
+    assert(start <= s);
+    assert(off <= 0);
  
+    while (off++ && s > start) {
+        do {
+            s--;
+        } while (UTF8_IS_CONTINUATION(*s) && s > start);
+    }
+    
+    GCC_DIAG_IGNORE(-Wcast-qual)
+    return (U8 *)s;
+    GCC_DIAG_RESTORE
  }
  
  /*
-=for apidoc is_utf8_invariant_string
+=for apidoc utf8_hop_safe
  
-Returns true iff the first C<len> bytes of the string C<s> are the same
-regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
-EBCDIC machines).  That is, if they are UTF-8 invariant.  On ASCII-ish
-machines, all the ASCII characters and only the ASCII characters fit this
-definition.  On EBCDIC machines, the ASCII-range characters are invariant, but
-so also are the C1 controls and C<\c?> (which isn't in the ASCII range on
-EBCDIC).
+Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
+either forward or backward.
  
-If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
-use this option, that C<s> can't have embedded C<NUL> characters and has to
-have a terminating C<NUL> byte).
+When moving backward it will not move before C<start>.
+
+When moving forward it will not move beyond C<end>.
  
-See also L</is_utf8_string>(), L</is_utf8_string_loclen>(), and
-L</is_utf8_string_loc>().
+Will not exceed those limits even if the string is not valid "UTF-8".
  
  =cut
  */
  
-PERL_STATIC_INLINE bool
-S_is_utf8_invariant_string(const U8* const s, const STRLEN len)
+PERL_STATIC_INLINE U8 *
+Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
  {
-    const U8* const send = s + (len ? len : strlen((const char *)s));
-    const U8* x = s;
+    PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
+
+    /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
+     * the bitops (especially ~) can create illegal UTF-8.
+     * In other words: in Perl UTF-8 is not just for Unicode. */
  
-    PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING;
+    assert(start <= s && s <= end);
  
-    for (; x < send; ++x) {
-       if (!UTF8_IS_INVARIANT(*x))
-           return FALSE;
+    if (off >= 0) {
+        return utf8_hop_forward(s, off, end);
+    }
+    else {
+        return utf8_hop_back(s, off, start);
      }
-
-    return TRUE;
  }
  
  /*
-=for apidoc is_utf8_string
  
-Returns true if the first C<len> bytes of string C<s> form a valid
-UTF-8 string, false otherwise.  If C<len> is 0, it will be calculated
-using C<strlen(s)> (which means if you use this option, that C<s> can't have
-embedded C<NUL> characters and has to have a terminating C<NUL> byte).  Note
-that all characters being ASCII constitute 'a valid UTF-8 string'.
+=for apidoc is_utf8_valid_partial_char
+
+Returns 0 if the sequence of bytes starting at C<s> and looking no further than
+S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
+points.  Otherwise, it returns 1 if there exists at least one non-empty
+sequence of bytes that when appended to sequence C<s>, starting at position
+C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
+otherwise returns 0.
+
+In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
+point.
  
-See also L</is_utf8_invariant_string>(), L</is_utf8_string_loclen>(), and
-L</is_utf8_string_loc>().
+This is useful when a fixed-length buffer is being tested for being well-formed
+UTF-8, but the final few bytes in it don't comprise a full character; that is,
+it is split somewhere in the middle of the final code point's UTF-8
+representation.  (Presumably when the buffer is refreshed with the next chunk
+of data, the new first bytes will complete the partial code point.)   This
+function is used to verify that the final bytes in the current buffer are in
+fact the legal beginning of some code point, so that if they aren't, the
+failure can be signalled without having to wait for the next read.
  
  =cut
  */
+#define is_utf8_valid_partial_char(s, e)                                    \
+                                is_utf8_valid_partial_char_flags(s, e, 0)
+
+/*
+
+=for apidoc is_utf8_valid_partial_char_flags
+
+Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
+or not the input is a valid UTF-8 encoded partial character, but it takes an
+extra parameter, C<flags>, which can further restrict which code points are
+considered valid.
+
+If C<flags> is 0, this behaves identically to
+C<L</is_utf8_valid_partial_char>>.  Otherwise C<flags> can be any combination
+of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>.  If
+there is any sequence of bytes that can complete the input partial character in
+such a way that a non-prohibited character is formed, the function returns
+TRUE; otherwise FALSE.  Non character code points cannot be determined based on
+partial character input.  But many  of the other possible excluded types can be
+determined from just the first one or two bytes.
+
+=cut
+ */
  
-bool
-Perl_is_utf8_string(const U8 *s, STRLEN len)
+PERL_STATIC_INLINE bool
+S_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
  {
-    const U8* const send = s + (len ? len : strlen((const char *)s));
-    const U8* x = s;
+    PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
  
-    PERL_ARGS_ASSERT_IS_UTF8_STRING;
+    assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+                          |UTF8_DISALLOW_PERL_EXTENDED)));
  
-    while (x < send) {
-        STRLEN len = isUTF8_CHAR(x, send);
-        if (UNLIKELY(! len)) {
-            return FALSE;
-        }
-        x += len;
+    if (s >= e || s + UTF8SKIP(s) <= e) {
+        return FALSE;
      }
  
-    return TRUE;
+    return cBOOL(_is_utf8_char_helper(s, e, flags));
  }
  
  /*
-Implemented as a macro in utf8.h
-
-=for apidoc is_utf8_string_loc
  
-Like L</is_utf8_string> but stores the location of the failure (in the
-case of "utf8ness failure") or the location C<s>+C<len> (in the case of
-"utf8ness success") in the C<ep>.
+=for apidoc is_utf8_fixed_width_buf_flags
  
-See also L</is_utf8_string_loclen>() and L</is_utf8_string>().
+Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
+is entirely valid UTF-8, subject to the restrictions given by C<flags>;
+otherwise it returns FALSE.
  
-=for apidoc is_utf8_string_loclen
+If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
+without restriction.  If the final few bytes of the buffer do not form a
+complete code point, this will return TRUE anyway, provided that
+C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
  
-Like L</is_utf8_string>() but stores the location of the failure (in the
-case of "utf8ness failure") or the location C<s>+C<len> (in the case of
-"utf8ness success") in the C<ep>, and the number of UTF-8
-encoded characters in the C<el>.
+If C<flags> in non-zero, it can be any combination of the
+C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
+same meanings.
  
-See also L</is_utf8_string_loc>() and L</is_utf8_string>().
+This function differs from C<L</is_utf8_string_flags>> only in that the latter
+returns FALSE if the final few bytes of the string don't form a complete code
+point.
  
  =cut
-*/
+ */
+#define is_utf8_fixed_width_buf_flags(s, len, flags)                        \
+                is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
  
-bool
-Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
-{
-    const U8* const send = s + (len ? len : strlen((const char *)s));
-    const U8* x = s;
-    STRLEN outlen = 0;
+/*
  
-    PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
+=for apidoc is_utf8_fixed_width_buf_loc_flags
  
-    while (x < send) {
-        STRLEN len = isUTF8_CHAR(x, send);
-        if (UNLIKELY(! len)) {
-            break;
-        }
-        x += len;
-        outlen++;
-    }
+Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
+failure in the C<ep> pointer.  If the function returns TRUE, C<*ep> will point
+to the beginning of any partial character at the end of the buffer; if there is
+no partial character C<*ep> will contain C<s>+C<len>.
  
-    if (el)
-        *el = outlen;
+See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
  
-    if (ep) {
-        *ep = x;
-    }
+=cut
+*/
  
-    return (x == send);
-}
+#define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags)               \
+                is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
  
  /*
-=for apidoc utf8_distance
  
-Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
-and C<b>.
+=for apidoc is_utf8_fixed_width_buf_loclen_flags
  
-WARNING: use only if you *know* that the pointers point inside the
-same UTF-8 buffer.
+Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
+complete, valid characters found in the C<el> pointer.
  
  =cut
  */
  
-PERL_STATIC_INLINE IV
-Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
+PERL_STATIC_INLINE bool
+S_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
+                                       STRLEN len,
+                                       const U8 **ep,
+                                       STRLEN *el,
+                                       const U32 flags)
  {
-    PERL_ARGS_ASSERT_UTF8_DISTANCE;
+    const U8 * maybe_partial;
  
-    return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
+    PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
+
+    if (! ep) {
+        ep  = &maybe_partial;
+    }
+
+    /* If it's entirely valid, return that; otherwise see if the only error is
+     * that the final few bytes are for a partial character */
+    return    is_utf8_string_loclen_flags(s, len, ep, el, flags)
+           || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
  }
  
-/*
-=for apidoc utf8_hop
+PERL_STATIC_INLINE UV
+S_utf8n_to_uvchr_msgs(const U8 *s,
+                      STRLEN curlen,
+                      STRLEN *retlen,
+                      const U32 flags,
+                      U32 * errors,
+                      AV ** msgs)
+{
+    /* This is the inlined portion of utf8n_to_uvchr_msgs.  It handles the
+     * simple cases, and, if necessary calls a helper function to deal with the
+     * more complex ones.  Almost all well-formed non-problematic code points
+     * are considered simple, so that it's unlikely that the helper function
+     * will need to be called.
+     *
+     * This is an adaptation of the tables and algorithm given in
+     * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
+     * comprehensive documentation of the original version.  A copyright notice
+     * for the original version is given at the beginning of this file.  The
+     * Perl adapation is documented at the definition of PL_strict_utf8_dfa_tab[].
+     */
  
-Return the UTF-8 pointer C<s> displaced by C<off> characters, either
-forward or backward.
+    const U8 * const s0 = s;
+    const U8 * send = s0 + curlen;
+    UV uv = 0;      /* The 0 silences some stupid compilers */
+    UV state = 0;
  
-WARNING: do not use the following unless you *know* C<off> is within
-the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
-on the first byte of character or just after the last byte of a character.
+    PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
  
-=cut
-*/
+    /* This dfa is fast.  If it accepts the input, it was for a well-formed,
+     * non-problematic code point, which can be returned immediately.
+     * Otherwise we call a helper function to figure out the more complicated
+     * cases. */
  
-PERL_STATIC_INLINE U8 *
-Perl_utf8_hop(const U8 *s, SSize_t off)
-{
-    PERL_ARGS_ASSERT_UTF8_HOP;
+    while (s < send && LIKELY(state != 1)) {
+        UV type = PL_strict_utf8_dfa_tab[*s];
  
-    /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
-     * the bitops (especially ~) can create illegal UTF-8.
-     * In other words: in Perl UTF-8 is not just for Unicode. */
+        uv = (state == 0)
+             ?  ((0xff >> type) & NATIVE_UTF8_TO_I8(*s))
+             : UTF8_ACCUMULATE(uv, *s);
+        state = PL_strict_utf8_dfa_tab[256 + state + type];
  
-    if (off >= 0) {
-       while (off--)
-           s += UTF8SKIP(s);
-    }
-    else {
-       while (off++) {
-           s--;
-           while (UTF8_IS_CONTINUATION(*s))
-               s--;
-       }
+        if (state != 0) {
+            s++;
+            continue;
+        }
+
+        if (retlen) {
+            *retlen = s - s0 + 1;
+        }
+        if (errors) {
+            *errors = 0;
+        }
+        if (msgs) {
+            *msgs = NULL;
+        }
+
+        return uv;
      }
-    return (U8 *)s;
+
+    /* Here is potentially problematic.  Use the full mechanism */
+    return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags, errors, msgs);
  }
  
  /* ------------------------------- perl.h ----------------------------- */
@@ -850,9 +2164,9 @@ S_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
      cx->blk_eval.cv            = NULL; /* later set by doeval_compile() */
      cx->blk_eval.cur_top_env   = PL_top_env;
  
-    assert(!(PL_in_eval     & ~ 0x7F));
+    assert(!(PL_in_eval     & ~ 0x3F));
      assert(!(PL_op->op_type & ~0x1FF));
-    cx->blk_u16 = (PL_in_eval & 0x7F) | ((U16)PL_op->op_type << 7);
+    cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
  }
  
  
@@ -865,9 +2179,10 @@ S_cx_popeval(pTHX_ PERL_CONTEXT *cx)
      assert(CxTYPE(cx) == CXt_EVAL);
  
      PL_in_eval = CxOLD_IN_EVAL(cx);
+    assert(!(PL_in_eval & 0xc0));
      PL_eval_root = cx->blk_eval.old_eval_root;
      sv = cx->blk_eval.cur_text;
-    if (sv && SvSCREAM(sv)) {
+    if (sv && CxEVAL_TXT_REFCNTED(cx)) {
          cx->blk_eval.cur_text = NULL;
          SvREFCNT_dec_NN(sv);
      }
@@ -995,6 +2310,115 @@ S_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
      SvREFCNT_dec(sv);
  }
  
+/* ------------------ util.h ------------------------------------------- */
+
+/*
+=head1 Miscellaneous Functions
+
+=for apidoc foldEQ
+
+Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
+same
+case-insensitively; false otherwise.  Uppercase and lowercase ASCII range bytes
+match themselves and their opposite case counterparts.  Non-cased and non-ASCII
+range bytes match only themselves.
+
+=cut
+*/
+
+PERL_STATIC_INLINE I32
+Perl_foldEQ(const char *s1, const char *s2, I32 len)
+{
+    const U8 *a = (const U8 *)s1;
+    const U8 *b = (const U8 *)s2;
+
+    PERL_ARGS_ASSERT_FOLDEQ;
+
+    assert(len >= 0);
+
+    while (len--) {
+       if (*a != *b && *a != PL_fold[*b])
+           return 0;
+       a++,b++;
+    }
+    return 1;
+}
+
+PERL_STATIC_INLINE I32
+Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len)
+{
+    /* Compare non-utf8 using Unicode (Latin1) semantics.  Does not work on
+     * MICRO_SIGN, LATIN_SMALL_LETTER_SHARP_S, nor
+     * LATIN_SMALL_LETTER_Y_WITH_DIAERESIS, and does not check for these.  Nor
+     * does it check that the strings each have at least 'len' characters */
+
+    const U8 *a = (const U8 *)s1;
+    const U8 *b = (const U8 *)s2;
+
+    PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
+
+    assert(len >= 0);
+
+    while (len--) {
+       if (*a != *b && *a != PL_fold_latin1[*b]) {
+           return 0;
+       }
+       a++, b++;
+    }
+    return 1;
+}
+
+/*
+=for apidoc foldEQ_locale
+
+Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
+same case-insensitively in the current locale; false otherwise.
+
+=cut
+*/
+
+PERL_STATIC_INLINE I32
+Perl_foldEQ_locale(const char *s1, const char *s2, I32 len)
+{
+    dVAR;
+    const U8 *a = (const U8 *)s1;
+    const U8 *b = (const U8 *)s2;
+
+    PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
+
+    assert(len >= 0);
+
+    while (len--) {
+       if (*a != *b && *a != PL_fold_locale[*b])
+           return 0;
+       a++,b++;
+    }
+    return 1;
+}
+
+#if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
+
+PERL_STATIC_INLINE void *
+S_my_memrchr(const char * s, const char c, const STRLEN len)
+{
+    /* memrchr(), since many platforms lack it */
+
+    const char * t = s + len - 1;
+
+    PERL_ARGS_ASSERT_MY_MEMRCHR;
+
+    while (t >= s) {
+        if (*t == c) {
+            return (void *) t;
+        }
+        t--;
+    }
+
+    return NULL;
+}
+
+#endif
+
  /*
   * ex: set ts=8 sts=4 sw=4 et:
   */