next if $use_flags eq "" && $restriction eq "fits_in_31_bits";
# Start building up the name of the function we will test.
- my $name = "is_";
+ my $base_name = "is_";
if (! $use_flags && $restriction ne "") {
- $name .= $restriction . "_";
+ $base_name .= $restriction . "_";
}
- $name .= "utf8_string";
+
+ # We test both "is_utf8_string_foo" and "is_fixed_width_buf" functions
+ foreach my $operand ('string', 'fixed_width_buf') {
+
+ # Currently, the only fixed_width_buf functions have the '_flags'
+ # suffix.
+ next if $operand eq 'fixed_width_buf' && $use_flags eq "";
+
+ my $name = "${base_name}utf8_$operand";
# We test each version of the function
for my $function ("_loclen", "_loc", "") {
# a) valid input
# b) invalid input created by appending an out-of-place
# continuation character to the valid string
- # c) invalid input created by appending a partial character
+ # c) input created by appending a partial character. This
+ # is valid in the 'fixed_width' functions, but invalid in
+ # the 'string' ones
# d) invalid input created by calling a function that is
# expecting a restricted form of the input using the string
# that's valid when unrestricted
if ($this_error_type) {
# Appending a bare continuation byte or a partial
- # character makes it invalid, but the character count
- # and offset remain the same. But in the other cases,
- # we have saved where the failures should occur, so
- # use those.
+ # character doesn't change the character count or
+ # offset. But in the other cases, we have saved where
+ # the failures should occur, so use those. Appending
+ # a continuation byte makes it invalid; appending a
+ # partial character makes the 'string' form invalid,
+ # but not the 'fixed_width_buf' form.
if ($this_error_type eq $cont_byte || $this_error_type eq $p) {
$bytes .= $this_error_type;
if ($this_error_type eq $cont_byte) {
else {
$test_name_suffix
= " if ends with a partial character";
+ $this_error_type
+ = 0 if $operand eq "fixed_width_buf";
}
}
else {
}
}
}
+ }
}
}
C<L</is_utf8_string_loc_flags>>,
C<L</is_utf8_string_loclen>>,
C<L</is_utf8_string_loclen_flags>>,
+C<L</is_utf8_fixed_width_buf_flags>>,
+C<L</is_utf8_fixed_width_buf_loc_flags>>,
+C<L</is_utf8_fixed_width_buf_loclen_flags>>,
C<L</is_strict_utf8_string>>,
C<L</is_strict_utf8_string_loc>>,
C<L</is_strict_utf8_string_loclen>>,
C<L</is_utf8_invariant_string>>,
C<L</is_utf8_string_loc>>,
C<L</is_utf8_string_loclen>>,
+C<L</is_utf8_fixed_width_buf_flags>>,
+C<L</is_utf8_fixed_width_buf_loc_flags>>,
+C<L</is_utf8_fixed_width_buf_loclen_flags>>,
=cut
*/
C<L</is_utf8_string_loc_flags>>,
C<L</is_utf8_string_loclen>>,
C<L</is_utf8_string_loclen_flags>>,
+C<L</is_utf8_fixed_width_buf_flags>>,
+C<L</is_utf8_fixed_width_buf_loc_flags>>,
+C<L</is_utf8_fixed_width_buf_loclen_flags>>,
C<L</is_strict_utf8_string_loc>>,
C<L</is_strict_utf8_string_loclen>>,
C<L</is_c9strict_utf8_string>>,
C<L</is_utf8_string_loc_flags>>,
C<L</is_utf8_string_loclen>>,
C<L</is_utf8_string_loclen_flags>>,
+C<L</is_utf8_fixed_width_buf_flags>>,
+C<L</is_utf8_fixed_width_buf_loc_flags>>,
+C<L</is_utf8_fixed_width_buf_loclen_flags>>,
C<L</is_strict_utf8_string>>,
C<L</is_strict_utf8_string_loc>>,
C<L</is_strict_utf8_string_loclen>>,
C<L</is_utf8_string_loc_flags>>,
C<L</is_utf8_string_loclen>>,
C<L</is_utf8_string_loclen_flags>>,
+C<L</is_utf8_fixed_width_buf_flags>>,
+C<L</is_utf8_fixed_width_buf_loc_flags>>,
+C<L</is_utf8_fixed_width_buf_loclen_flags>>,
C<L</is_strict_utf8_string>>,
C<L</is_strict_utf8_string_loc>>,
C<L</is_strict_utf8_string_loclen>>,
return cBOOL(_is_utf8_char_helper(s, e, flags));
}
+/*
+
+=for apidoc is_utf8_fixed_width_buf_flags
+
+Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
+is entirely valid UTF-8, subject to the restrictions given by C<flags>;
+otherwise it returns FALSE.
+
+If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
+without restriction. If the final few bytes of the buffer do not form a
+complete code point, this will return TRUE anyway, provided that
+C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
+
+If C<flags> in non-zero, it can be any combination of the
+C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
+same meanings.
+
+This function differs from C<L</is_utf8_string_flags>> only in that the latter
+returns FALSE if the final few bytes of the string don't form a complete code
+point.
+
+=cut
+ */
+#define is_utf8_fixed_width_buf_flags(s, len, flags) \
+ is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
+
+/*
+
+=for apidoc is_utf8_fixed_width_buf_loc_flags
+
+Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
+failure in the C<ep> pointer. If the function returns TRUE, C<*ep> will point
+to the beginning of any partial character at the end of the buffer; if there is
+no partial character C<*ep> will contain C<s>+C<len>.
+
+See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
+
+=cut
+*/
+
+#define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags) \
+ is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
+
+/*
+
+=for apidoc is_utf8_fixed_width_buf_loclen_flags
+
+Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
+complete, valid characters found in the C<el> pointer.
+
+=cut
+*/
+
+PERL_STATIC_INLINE bool
+S_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
+ const STRLEN len,
+ const U8 **ep,
+ STRLEN *el,
+ const U32 flags)
+{
+ const U8 * maybe_partial;
+
+ PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
+
+ if (! ep) {
+ ep = &maybe_partial;
+ }
+
+ /* If it's entirely valid, return that; otherwise see if the only error is
+ * that the final few bytes are for a partial character */
+ return is_utf8_string_loclen_flags(s, len, ep, el, flags)
+ || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
+}
+
/* ------------------------------- perl.h ----------------------------- */
/*