t/op/local.t: tests for RT #7411

[perl5.git] / inline.h
diff --git a/inline.h b/inline.h

index c3acf67..66ba348 100644 (file)
--- a/inline.h
+++ b/inline.h
@@ -269,16 +269,16 @@ S_append_utf8_from_native_byte(const U8 byte, U8** dest)
      PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
  
      if (NATIVE_BYTE_IS_INVARIANT(byte))
-        *(*dest)++ = byte;
+        *((*dest)++) = byte;
      else {
-        *(*dest)++ = UTF8_EIGHT_BIT_HI(byte);
-        *(*dest)++ = UTF8_EIGHT_BIT_LO(byte);
+        *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
+        *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
      }
  }
  
  /*
  =for apidoc valid_utf8_to_uvchr
-Like L</utf8_to_uvchr_buf>(), but should only be called when it is known that
+Like C<L</utf8_to_uvchr_buf>>, but should only be called when it is known that
  the next character in the input UTF-8 string C<s> is well-formed (I<e.g.>,
  it passes C<L</isUTF8_CHAR>>.  Surrogates, non-character code points, and
  non-Unicode code points are allowed.
@@ -290,7 +290,7 @@ non-Unicode code points are allowed.
  PERL_STATIC_INLINE UV
  Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
  {
-    UV expectlen = UTF8SKIP(s);
+    const UV expectlen = UTF8SKIP(s);
      const U8* send = s + expectlen;
      UV uv = *s;
  
@@ -323,20 +323,34 @@ Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
  /*
  =for apidoc is_utf8_invariant_string
  
-Returns true iff the first C<len> bytes of the string C<s> are the same
+Returns TRUE if the first C<len> bytes of the string C<s> are the same
  regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
-EBCDIC machines).  That is, if they are UTF-8 invariant.  On ASCII-ish
-machines, all the ASCII characters and only the ASCII characters fit this
-definition.  On EBCDIC machines, the ASCII-range characters are invariant, but
-so also are the C1 controls and C<\c?> (which isn't in the ASCII range on
-EBCDIC).
+EBCDIC machines); otherwise it returns FALSE.  That is, it returns TRUE if they
+are UTF-8 invariant.  On ASCII-ish machines, all the ASCII characters and only
+the ASCII characters fit this definition.  On EBCDIC machines, the ASCII-range
+characters are invariant, but so also are the C1 controls.
  
  If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
  use this option, that C<s> can't have embedded C<NUL> characters and has to
  have a terminating C<NUL> byte).
  
-See also L</is_utf8_string>(), L</is_utf8_string_loclen>(), and
-L</is_utf8_string_loc>().
+See also
+C<L</is_utf8_string>>,
+C<L</is_utf8_string_flags>>,
+C<L</is_utf8_string_loc>>,
+C<L</is_utf8_string_loc_flags>>,
+C<L</is_utf8_string_loclen>>,
+C<L</is_utf8_string_loclen_flags>>,
+C<L</is_utf8_fixed_width_buf_flags>>,
+C<L</is_utf8_fixed_width_buf_loc_flags>>,
+C<L</is_utf8_fixed_width_buf_loclen_flags>>,
+C<L</is_strict_utf8_string>>,
+C<L</is_strict_utf8_string_loc>>,
+C<L</is_strict_utf8_string_loclen>>,
+C<L</is_c9strict_utf8_string>>,
+C<L</is_c9strict_utf8_string_loc>>,
+and
+C<L</is_c9strict_utf8_string_loclen>>.
  
  =cut
  */
@@ -360,20 +374,31 @@ S_is_utf8_invariant_string(const U8* const s, const STRLEN len)
  /*
  =for apidoc is_utf8_string
  
-Returns true if the first C<len> bytes of string C<s> form a valid
-UTF-8 string, false otherwise.  If C<len> is 0, it will be calculated
-using C<strlen(s)> (which means if you use this option, that C<s> can't have
-embedded C<NUL> characters and has to have a terminating C<NUL> byte).  Note
-that all characters being ASCII constitute 'a valid UTF-8 string'.
-
-See also L</is_utf8_invariant_string>(), L</is_utf8_string_loclen>(), and
-L</is_utf8_string_loc>().
+Returns TRUE if the first C<len> bytes of string C<s> form a valid
+Perl-extended-UTF-8 string; returns FALSE otherwise.  If C<len> is 0, it will
+be calculated using C<strlen(s)> (which means if you use this option, that C<s>
+can't have embedded C<NUL> characters and has to have a terminating C<NUL>
+byte).  Note that all characters being ASCII constitute 'a valid UTF-8 string'.
+
+This function considers Perl's extended UTF-8 to be valid.  That means that
+code points above Unicode, surrogates, and non-character code points are
+considered valid by this function.  Use C<L</is_strict_utf8_string>>,
+C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
+code points are considered valid.
+
+See also
+C<L</is_utf8_invariant_string>>,
+C<L</is_utf8_string_loc>>,
+C<L</is_utf8_string_loclen>>,
+C<L</is_utf8_fixed_width_buf_flags>>,
+C<L</is_utf8_fixed_width_buf_loc_flags>>,
+C<L</is_utf8_fixed_width_buf_loclen_flags>>,
  
  =cut
  */
  
-bool
-Perl_is_utf8_string(const U8 *s, STRLEN len)
+PERL_STATIC_INLINE bool
+Perl_is_utf8_string(const U8 *s, const STRLEN len)
  {
      /* This is now marked pure in embed.fnc, because isUTF8_CHAR now is pure.
       * Be aware of possible changes to that */
@@ -384,41 +409,237 @@ Perl_is_utf8_string(const U8 *s, STRLEN len)
      PERL_ARGS_ASSERT_IS_UTF8_STRING;
  
      while (x < send) {
-        STRLEN len = isUTF8_CHAR(x, send);
-        if (UNLIKELY(! len)) {
+        const STRLEN cur_len = isUTF8_CHAR(x, send);
+        if (UNLIKELY(! cur_len)) {
+            return FALSE;
+        }
+        x += cur_len;
+    }
+
+    return TRUE;
+}
+
+/*
+=for apidoc is_strict_utf8_string
+
+Returns TRUE if the first C<len> bytes of string C<s> form a valid
+UTF-8-encoded string that is fully interchangeable by any application using
+Unicode rules; otherwise it returns FALSE.  If C<len> is 0, it will be
+calculated using C<strlen(s)> (which means if you use this option, that C<s>
+can't have embedded C<NUL> characters and has to have a terminating C<NUL>
+byte).  Note that all characters being ASCII constitute 'a valid UTF-8 string'.
+
+This function returns FALSE for strings containing any
+code points above the Unicode max of 0x10FFFF, surrogate code points, or
+non-character code points.
+
+See also
+C<L</is_utf8_invariant_string>>,
+C<L</is_utf8_string>>,
+C<L</is_utf8_string_flags>>,
+C<L</is_utf8_string_loc>>,
+C<L</is_utf8_string_loc_flags>>,
+C<L</is_utf8_string_loclen>>,
+C<L</is_utf8_string_loclen_flags>>,
+C<L</is_utf8_fixed_width_buf_flags>>,
+C<L</is_utf8_fixed_width_buf_loc_flags>>,
+C<L</is_utf8_fixed_width_buf_loclen_flags>>,
+C<L</is_strict_utf8_string_loc>>,
+C<L</is_strict_utf8_string_loclen>>,
+C<L</is_c9strict_utf8_string>>,
+C<L</is_c9strict_utf8_string_loc>>,
+and
+C<L</is_c9strict_utf8_string_loclen>>.
+
+=cut
+*/
+
+PERL_STATIC_INLINE bool
+S_is_strict_utf8_string(const U8 *s, const STRLEN len)
+{
+    const U8* const send = s + (len ? len : strlen((const char *)s));
+    const U8* x = s;
+
+    PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING;
+
+    while (x < send) {
+        const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
+        if (UNLIKELY(! cur_len)) {
+            return FALSE;
+        }
+        x += cur_len;
+    }
+
+    return TRUE;
+}
+
+/*
+=for apidoc is_c9strict_utf8_string
+
+Returns TRUE if the first C<len> bytes of string C<s> form a valid
+UTF-8-encoded string that conforms to
+L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
+otherwise it returns FALSE.  If C<len> is 0, it will be calculated using
+C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
+C<NUL> characters and has to have a terminating C<NUL> byte).  Note that all
+characters being ASCII constitute 'a valid UTF-8 string'.
+
+This function returns FALSE for strings containing any code points above the
+Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
+code points per
+L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
+
+See also
+C<L</is_utf8_invariant_string>>,
+C<L</is_utf8_string>>,
+C<L</is_utf8_string_flags>>,
+C<L</is_utf8_string_loc>>,
+C<L</is_utf8_string_loc_flags>>,
+C<L</is_utf8_string_loclen>>,
+C<L</is_utf8_string_loclen_flags>>,
+C<L</is_utf8_fixed_width_buf_flags>>,
+C<L</is_utf8_fixed_width_buf_loc_flags>>,
+C<L</is_utf8_fixed_width_buf_loclen_flags>>,
+C<L</is_strict_utf8_string>>,
+C<L</is_strict_utf8_string_loc>>,
+C<L</is_strict_utf8_string_loclen>>,
+C<L</is_c9strict_utf8_string_loc>>,
+and
+C<L</is_c9strict_utf8_string_loclen>>.
+
+=cut
+*/
+
+PERL_STATIC_INLINE bool
+S_is_c9strict_utf8_string(const U8 *s, const STRLEN len)
+{
+    const U8* const send = s + (len ? len : strlen((const char *)s));
+    const U8* x = s;
+
+    PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING;
+
+    while (x < send) {
+        const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
+        if (UNLIKELY(! cur_len)) {
+            return FALSE;
+        }
+        x += cur_len;
+    }
+
+    return TRUE;
+}
+
+/* The above 3 functions could have been moved into the more general one just
+ * below, and made #defines that call it with the right 'flags'.  They are
+ * currently kept separate to increase their chances of getting inlined */
+
+/*
+=for apidoc is_utf8_string_flags
+
+Returns TRUE if the first C<len> bytes of string C<s> form a valid
+UTF-8 string, subject to the restrictions imposed by C<flags>;
+returns FALSE otherwise.  If C<len> is 0, it will be calculated
+using C<strlen(s)> (which means if you use this option, that C<s> can't have
+embedded C<NUL> characters and has to have a terminating C<NUL> byte).  Note
+that all characters being ASCII constitute 'a valid UTF-8 string'.
+
+If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
+C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
+as C<L</is_strict_utf8_string>>; and if C<flags> is
+C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
+C<L</is_c9strict_utf8_string>>.  Otherwise C<flags> may be any
+combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
+C<L</utf8n_to_uvchr>>, with the same meanings.
+
+See also
+C<L</is_utf8_invariant_string>>,
+C<L</is_utf8_string>>,
+C<L</is_utf8_string_loc>>,
+C<L</is_utf8_string_loc_flags>>,
+C<L</is_utf8_string_loclen>>,
+C<L</is_utf8_string_loclen_flags>>,
+C<L</is_utf8_fixed_width_buf_flags>>,
+C<L</is_utf8_fixed_width_buf_loc_flags>>,
+C<L</is_utf8_fixed_width_buf_loclen_flags>>,
+C<L</is_strict_utf8_string>>,
+C<L</is_strict_utf8_string_loc>>,
+C<L</is_strict_utf8_string_loclen>>,
+C<L</is_c9strict_utf8_string>>,
+C<L</is_c9strict_utf8_string_loc>>,
+and
+C<L</is_c9strict_utf8_string_loclen>>.
+
+=cut
+*/
+
+PERL_STATIC_INLINE bool
+S_is_utf8_string_flags(const U8 *s, const STRLEN len, const U32 flags)
+{
+    const U8* const send = s + (len ? len : strlen((const char *)s));
+    const U8* x = s;
+
+    PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
+    assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+                          |UTF8_DISALLOW_ABOVE_31_BIT)));
+
+    if (flags == 0) {
+        return is_utf8_string(s, len);
+    }
+
+    if ((flags & ~UTF8_DISALLOW_ABOVE_31_BIT)
+                                        == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
+    {
+        return is_strict_utf8_string(s, len);
+    }
+
+    if ((flags & ~UTF8_DISALLOW_ABOVE_31_BIT)
+                                       == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
+    {
+        return is_c9strict_utf8_string(s, len);
+    }
+
+    while (x < send) {
+        STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
+        if (UNLIKELY(! cur_len)) {
              return FALSE;
          }
-        x += len;
+        x += cur_len;
      }
  
      return TRUE;
  }
  
  /*
-Implemented as a macro in utf8.h
  
  =for apidoc is_utf8_string_loc
  
-Like L</is_utf8_string> but stores the location of the failure (in the
+Like C<L</is_utf8_string>> but stores the location of the failure (in the
  case of "utf8ness failure") or the location C<s>+C<len> (in the case of
-"utf8ness success") in the C<ep>.
+"utf8ness success") in the C<ep> pointer.
+
+See also C<L</is_utf8_string_loclen>>.
+
+=cut
+*/
  
-See also L</is_utf8_string_loclen>() and L</is_utf8_string>().
+#define is_utf8_string_loc(s, len, ep)  is_utf8_string_loclen(s, len, ep, 0)
+
+/*
  
  =for apidoc is_utf8_string_loclen
  
-Like L</is_utf8_string>() but stores the location of the failure (in the
+Like C<L</is_utf8_string>> but stores the location of the failure (in the
  case of "utf8ness failure") or the location C<s>+C<len> (in the case of
-"utf8ness success") in the C<ep>, and the number of UTF-8
-encoded characters in the C<el>.
+"utf8ness success") in the C<ep> pointer, and the number of UTF-8
+encoded characters in the C<el> pointer.
  
-See also L</is_utf8_string_loc>() and L</is_utf8_string>().
+See also C<L</is_utf8_string_loc>>.
  
  =cut
  */
  
-bool
-Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
+PERL_STATIC_INLINE bool
+Perl_is_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el)
  {
      const U8* const send = s + (len ? len : strlen((const char *)s));
      const U8* x = s;
@@ -427,11 +648,208 @@ Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
      PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
  
      while (x < send) {
-        STRLEN len = isUTF8_CHAR(x, send);
-        if (UNLIKELY(! len)) {
+        const STRLEN cur_len = isUTF8_CHAR(x, send);
+        if (UNLIKELY(! cur_len)) {
+            break;
+        }
+        x += cur_len;
+        outlen++;
+    }
+
+    if (el)
+        *el = outlen;
+
+    if (ep) {
+        *ep = x;
+    }
+
+    return (x == send);
+}
+
+/*
+
+=for apidoc is_strict_utf8_string_loc
+
+Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
+case of "utf8ness failure") or the location C<s>+C<len> (in the case of
+"utf8ness success") in the C<ep> pointer.
+
+See also C<L</is_strict_utf8_string_loclen>>.
+
+=cut
+*/
+
+#define is_strict_utf8_string_loc(s, len, ep)                               \
+                                is_strict_utf8_string_loclen(s, len, ep, 0)
+
+/*
+
+=for apidoc is_strict_utf8_string_loclen
+
+Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
+case of "utf8ness failure") or the location C<s>+C<len> (in the case of
+"utf8ness success") in the C<ep> pointer, and the number of UTF-8
+encoded characters in the C<el> pointer.
+
+See also C<L</is_strict_utf8_string_loc>>.
+
+=cut
+*/
+
+PERL_STATIC_INLINE bool
+S_is_strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el)
+{
+    const U8* const send = s + (len ? len : strlen((const char *)s));
+    const U8* x = s;
+    STRLEN outlen = 0;
+
+    PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
+
+    while (x < send) {
+        const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
+        if (UNLIKELY(! cur_len)) {
+            break;
+        }
+        x += cur_len;
+        outlen++;
+    }
+
+    if (el)
+        *el = outlen;
+
+    if (ep) {
+        *ep = x;
+    }
+
+    return (x == send);
+}
+
+/*
+
+=for apidoc is_c9strict_utf8_string_loc
+
+Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
+the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
+"utf8ness success") in the C<ep> pointer.
+
+See also C<L</is_c9strict_utf8_string_loclen>>.
+
+=cut
+*/
+
+#define is_c9strict_utf8_string_loc(s, len, ep)                                    \
+                            is_c9strict_utf8_string_loclen(s, len, ep, 0)
+
+/*
+
+=for apidoc is_c9strict_utf8_string_loclen
+
+Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
+the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
+"utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
+characters in the C<el> pointer.
+
+See also C<L</is_c9strict_utf8_string_loc>>.
+
+=cut
+*/
+
+PERL_STATIC_INLINE bool
+S_is_c9strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el)
+{
+    const U8* const send = s + (len ? len : strlen((const char *)s));
+    const U8* x = s;
+    STRLEN outlen = 0;
+
+    PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
+
+    while (x < send) {
+        const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
+        if (UNLIKELY(! cur_len)) {
+            break;
+        }
+        x += cur_len;
+        outlen++;
+    }
+
+    if (el)
+        *el = outlen;
+
+    if (ep) {
+        *ep = x;
+    }
+
+    return (x == send);
+}
+
+/*
+
+=for apidoc is_utf8_string_loc_flags
+
+Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
+case of "utf8ness failure") or the location C<s>+C<len> (in the case of
+"utf8ness success") in the C<ep> pointer.
+
+See also C<L</is_utf8_string_loclen_flags>>.
+
+=cut
+*/
+
+#define is_utf8_string_loc_flags(s, len, ep, flags)                         \
+                        is_utf8_string_loclen_flags(s, len, ep, 0, flags)
+
+
+/* The above 3 actual functions could have been moved into the more general one
+ * just below, and made #defines that call it with the right 'flags'.  They are
+ * currently kept separate to increase their chances of getting inlined */
+
+/*
+
+=for apidoc is_utf8_string_loclen_flags
+
+Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
+case of "utf8ness failure") or the location C<s>+C<len> (in the case of
+"utf8ness success") in the C<ep> pointer, and the number of UTF-8
+encoded characters in the C<el> pointer.
+
+See also C<L</is_utf8_string_loc_flags>>.
+
+=cut
+*/
+
+PERL_STATIC_INLINE bool
+S_is_utf8_string_loclen_flags(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
+{
+    const U8* const send = s + (len ? len : strlen((const char *)s));
+    const U8* x = s;
+    STRLEN outlen = 0;
+
+    PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
+    assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+                          |UTF8_DISALLOW_ABOVE_31_BIT)));
+
+    if (flags == 0) {
+        return is_utf8_string_loclen(s, len, ep, el);
+    }
+
+    if ((flags & ~UTF8_DISALLOW_ABOVE_31_BIT)
+                                        == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
+    {
+        return is_strict_utf8_string_loclen(s, len, ep, el);
+    }
+
+    if ((flags & ~UTF8_DISALLOW_ABOVE_31_BIT)
+                                    == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
+    {
+        return is_c9strict_utf8_string_loclen(s, len, ep, el);
+    }
+
+    while (x < send) {
+        const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
+        if (UNLIKELY(! cur_len)) {
              break;
          }
-        x += len;
+        x += cur_len;
          outlen++;
      }
  
@@ -505,39 +923,138 @@ Perl_utf8_hop(const U8 *s, SSize_t off)
  
  =for apidoc is_utf8_valid_partial_char
  
-Returns 1 if there exists some sequence of bytes, call it C<s'>, that when
-appended to the sequence from C<s> through S<C<e - 1>> causes the entire
-sequence starting at C<s> (including C<s'>) to be the well-formed UTF-8 of
-some code point; otherwise returns 0.
+Returns 0 if the sequence of bytes starting at C<s> and looking no further than
+S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
+points.  Otherwise, it returns 1 if there exists at least one non-empty
+sequence of bytes that when appended to sequence C<s>, starting at position
+C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
+otherwise returns 0.
+
+In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
+point.
+
+This is useful when a fixed-length buffer is being tested for being well-formed
+UTF-8, but the final few bytes in it don't comprise a full character; that is,
+it is split somewhere in the middle of the final code point's UTF-8
+representation.  (Presumably when the buffer is refreshed with the next chunk
+of data, the new first bytes will complete the partial code point.)   This
+function is used to verify that the final bytes in the current buffer are in
+fact the legal beginning of some code point, so that if they aren't, the
+failure can be signalled without having to wait for the next read.
+
+=cut
+*/
+#define is_utf8_valid_partial_char(s, e)                                    \
+                                is_utf8_valid_partial_char_flags(s, e, 0)
+
+/*
  
-In other words this returns TRUE if C<s> points to the beginning, but partial,
-sequence of the UTF-8 for some code point.
+=for apidoc is_utf8_valid_partial_char_flags
  
-This is useful when some fixed-length buffer is being tested for being
-well-formed UTF-8, but the final few bytes in it don't comprise a full
-character: it is split somewhere in the middle of its UTF-8 representation.
-(Presumably when the buffer is refreshed with the next chunk of data, the new
-first bytes will complete the partial code point.)   This function is used to
-verify that the final bytes in the current buffer are in fact the legal
-beginning of some code point, so that if they aren't, the failure can be
-signalled without having to wait for the next read.
+Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
+or not the input is a valid UTF-8 encoded partial character, but it takes an
+extra parameter, C<flags>, which can further restrict which code points are
+considered valid.
  
-If the bytes terminated at S<C<e - 1>> are a full character (or more), 0 is
-returned.
+If C<flags> is 0, this behaves identically to
+C<L</is_utf8_valid_partial_char>>.  Otherwise C<flags> can be any combination
+of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>.  If
+there is any sequence of bytes that can complete the input partial character in
+such a way that a non-prohibited character is formed, the function returns
+TRUE; otherwise FALSE.  Non character code points cannot be determined based on
+partial character input.  But many  of the other possible excluded types can be
+determined from just the first one or two bytes.
  
  =cut
-*/
-bool
-S_is_utf8_valid_partial_char(const U8 * const s, const U8 * const e)
+ */
+
+PERL_STATIC_INLINE bool
+S_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
  {
+    PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
  
-    PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR;
+    assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+                          |UTF8_DISALLOW_ABOVE_31_BIT)));
  
-    if (s >= e || s + UTF8SKIP(s) < e) {
+    if (s >= e || s + UTF8SKIP(s) <= e) {
          return FALSE;
      }
  
-    return cBOOL(_is_utf8_char_slow(s, e - s));
+    return cBOOL(_is_utf8_char_helper(s, e, flags));
+}
+
+/*
+
+=for apidoc is_utf8_fixed_width_buf_flags
+
+Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
+is entirely valid UTF-8, subject to the restrictions given by C<flags>;
+otherwise it returns FALSE.
+
+If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
+without restriction.  If the final few bytes of the buffer do not form a
+complete code point, this will return TRUE anyway, provided that
+C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
+
+If C<flags> in non-zero, it can be any combination of the
+C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
+same meanings.
+
+This function differs from C<L</is_utf8_string_flags>> only in that the latter
+returns FALSE if the final few bytes of the string don't form a complete code
+point.
+
+=cut
+ */
+#define is_utf8_fixed_width_buf_flags(s, len, flags)                        \
+                is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
+
+/*
+
+=for apidoc is_utf8_fixed_width_buf_loc_flags
+
+Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
+failure in the C<ep> pointer.  If the function returns TRUE, C<*ep> will point
+to the beginning of any partial character at the end of the buffer; if there is
+no partial character C<*ep> will contain C<s>+C<len>.
+
+See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
+
+=cut
+*/
+
+#define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags)               \
+                is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
+
+/*
+
+=for apidoc is_utf8_fixed_width_buf_loclen_flags
+
+Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
+complete, valid characters found in the C<el> pointer.
+
+=cut
+*/
+
+PERL_STATIC_INLINE bool
+S_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
+                                       const STRLEN len,
+                                       const U8 **ep,
+                                       STRLEN *el,
+                                       const U32 flags)
+{
+    const U8 * maybe_partial;
+
+    PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
+
+    if (! ep) {
+        ep  = &maybe_partial;
+    }
+
+    /* If it's entirely valid, return that; otherwise see if the only error is
+     * that the final few bytes are for a partial character */
+    return    is_utf8_string_loclen_flags(s, len, ep, el, flags)
+           || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
  }
  
  /* ------------------------------- perl.h ----------------------------- */