AnpdD |STRLEN |is_utf8_char |NN const U8 *s
Abmnpd |STRLEN |is_utf8_char_buf|NN const U8 *buf|NN const U8 *buf_end
AnipdP |bool |is_utf8_string |NN const U8 *s|const STRLEN len
-Anpdmb |bool |is_utf8_string_loc|NN const U8 *s|const STRLEN len|NN const U8 **ep
-Anipd |bool |is_utf8_string_loclen|NN const U8 *s|const STRLEN len|NULLOK const U8 **ep|NULLOK STRLEN *el
+AnidP |bool |is_utf8_string_flags \
+ |NN const U8 *s|const STRLEN len|const U32 flags
+AnidP |bool |is_strict_utf8_string|NN const U8 *s|const STRLEN len
+AnidP |bool |is_c9strict_utf8_string|NN const U8 *s|const STRLEN len
+Anpdmb |bool |is_utf8_string_loc \
+ |NN const U8 *s|const STRLEN len|NN const U8 **ep
+Andm |bool |is_utf8_string_loc_flags \
+ |NN const U8 *s|const STRLEN len|NN const U8 **ep \
+ |const U32 flags
+Andm |bool |is_strict_utf8_string_loc \
+ |NN const U8 *s|const STRLEN len|NN const U8 **ep
+Andm |bool |is_c9strict_utf8_string_loc \
+ |NN const U8 *s|const STRLEN len|NN const U8 **ep
+Anipd |bool |is_utf8_string_loclen \
+ |NN const U8 *s|const STRLEN len|NULLOK const U8 **ep \
+ |NULLOK STRLEN *el
+Anid |bool |is_utf8_string_loclen_flags \
+ |NN const U8 *s|const STRLEN len|NULLOK const U8 **ep \
+ |NULLOK STRLEN *el|const U32 flags
+Anid |bool |is_strict_utf8_string_loclen \
+ |NN const U8 *s|const STRLEN len|NULLOK const U8 **ep \
+ |NULLOK STRLEN *el
+Anid |bool |is_c9strict_utf8_string_loclen \
+ |NN const U8 *s|const STRLEN len|NULLOK const U8 **ep \
+ |NULLOK STRLEN *el
AmndP |bool |is_utf8_valid_partial_char \
|NN const U8 * const s|NN const U8 * const e
AnidP |bool |is_utf8_valid_partial_char_flags \
#define intro_my() Perl_intro_my(aTHX)
#define isALNUM_lazy(a) Perl_isALNUM_lazy(aTHX_ a)
#define isIDFIRST_lazy(a) Perl_isIDFIRST_lazy(aTHX_ a)
+#define is_c9strict_utf8_string S_is_c9strict_utf8_string
+#define is_c9strict_utf8_string_loclen S_is_c9strict_utf8_string_loclen
#define is_lvalue_sub() Perl_is_lvalue_sub(aTHX)
+#define is_strict_utf8_string S_is_strict_utf8_string
+#define is_strict_utf8_string_loclen S_is_strict_utf8_string_loclen
#define is_uni_alnum(a) Perl_is_uni_alnum(aTHX_ a)
#define is_uni_alnum_lc(a) Perl_is_uni_alnum_lc(aTHX_ a)
#define is_uni_alnumc(a) Perl_is_uni_alnumc(aTHX_ a)
#define is_utf8_punct(a) Perl_is_utf8_punct(aTHX_ a)
#define is_utf8_space(a) Perl_is_utf8_space(aTHX_ a)
#define is_utf8_string Perl_is_utf8_string
+#define is_utf8_string_flags S_is_utf8_string_flags
#define is_utf8_string_loclen Perl_is_utf8_string_loclen
+#define is_utf8_string_loclen_flags S_is_utf8_string_loclen_flags
#define is_utf8_upper(a) Perl_is_utf8_upper(aTHX_ a)
#define is_utf8_valid_partial_char_flags S_is_utf8_valid_partial_char_flags
#define is_utf8_xdigit(a) Perl_is_utf8_xdigit(aTHX_ a)
use warnings;
use Carp;
-our $VERSION = '0.85';
+our $VERSION = '0.86';
require XSLoader;
IV
test_is_utf8_valid_partial_char_flags(char *s, STRLEN len, U32 flags)
CODE:
- /* RETVAL should be bool, but making it IV allows us to test it
- * returning 0 or 1 */
+ /* RETVAL should be bool (here and in tests below), but making it IV
+ * allows us to test it returning 0 or 1 */
RETVAL = is_utf8_valid_partial_char_flags((U8 *) s, (U8 *) s + len, flags);
OUTPUT:
RETVAL
+IV
+test_is_utf8_string(char *s, STRLEN len)
+ CODE:
+ RETVAL = is_utf8_string((U8 *) s, len);
+ OUTPUT:
+ RETVAL
+
+AV *
+test_is_utf8_string_loc(char *s, STRLEN len)
+ PREINIT:
+ AV *av;
+ const U8 * ep;
+ CODE:
+ av = newAV();
+ av_push(av, newSViv(is_utf8_string_loc((U8 *) s, len, &ep)));
+ av_push(av, newSViv(ep - (U8 *) s));
+ RETVAL = av;
+ OUTPUT:
+ RETVAL
+
+AV *
+test_is_utf8_string_loclen(char *s, STRLEN len)
+ PREINIT:
+ AV *av;
+ STRLEN ret_len;
+ const U8 * ep;
+ CODE:
+ av = newAV();
+ av_push(av, newSViv(is_utf8_string_loclen((U8 *) s, len, &ep, &ret_len)));
+ av_push(av, newSViv(ep - (U8 *) s));
+ av_push(av, newSVuv(ret_len));
+ RETVAL = av;
+ OUTPUT:
+ RETVAL
+
+IV
+test_is_utf8_string_flags(char *s, STRLEN len, U32 flags)
+ CODE:
+ RETVAL = is_utf8_string_flags((U8 *) s, len, flags);
+ OUTPUT:
+ RETVAL
+
+AV *
+test_is_utf8_string_loc_flags(char *s, STRLEN len, U32 flags)
+ PREINIT:
+ AV *av;
+ const U8 * ep;
+ CODE:
+ av = newAV();
+ av_push(av, newSViv(is_utf8_string_loc_flags((U8 *) s, len, &ep, flags)));
+ av_push(av, newSViv(ep - (U8 *) s));
+ RETVAL = av;
+ OUTPUT:
+ RETVAL
+
+AV *
+test_is_utf8_string_loclen_flags(char *s, STRLEN len, U32 flags)
+ PREINIT:
+ AV *av;
+ STRLEN ret_len;
+ const U8 * ep;
+ CODE:
+ av = newAV();
+ av_push(av, newSViv(is_utf8_string_loclen_flags((U8 *) s, len, &ep, &ret_len, flags)));
+ av_push(av, newSViv(ep - (U8 *) s));
+ av_push(av, newSVuv(ret_len));
+ RETVAL = av;
+ OUTPUT:
+ RETVAL
+
+IV
+test_is_strict_utf8_string(char *s, STRLEN len)
+ CODE:
+ RETVAL = is_strict_utf8_string((U8 *) s, len);
+ OUTPUT:
+ RETVAL
+
+AV *
+test_is_strict_utf8_string_loc(char *s, STRLEN len)
+ PREINIT:
+ AV *av;
+ const U8 * ep;
+ CODE:
+ av = newAV();
+ av_push(av, newSViv(is_strict_utf8_string_loc((U8 *) s, len, &ep)));
+ av_push(av, newSViv(ep - (U8 *) s));
+ RETVAL = av;
+ OUTPUT:
+ RETVAL
+
+AV *
+test_is_strict_utf8_string_loclen(char *s, STRLEN len)
+ PREINIT:
+ AV *av;
+ STRLEN ret_len;
+ const U8 * ep;
+ CODE:
+ av = newAV();
+ av_push(av, newSViv(is_strict_utf8_string_loclen((U8 *) s, len, &ep, &ret_len)));
+ av_push(av, newSViv(ep - (U8 *) s));
+ av_push(av, newSVuv(ret_len));
+ RETVAL = av;
+ OUTPUT:
+ RETVAL
+
+IV
+test_is_c9strict_utf8_string(char *s, STRLEN len)
+ CODE:
+ RETVAL = is_c9strict_utf8_string((U8 *) s, len);
+ OUTPUT:
+ RETVAL
+
+AV *
+test_is_c9strict_utf8_string_loc(char *s, STRLEN len)
+ PREINIT:
+ AV *av;
+ const U8 * ep;
+ CODE:
+ av = newAV();
+ av_push(av, newSViv(is_c9strict_utf8_string_loc((U8 *) s, len, &ep)));
+ av_push(av, newSViv(ep - (U8 *) s));
+ RETVAL = av;
+ OUTPUT:
+ RETVAL
+
+AV *
+test_is_c9strict_utf8_string_loclen(char *s, STRLEN len)
+ PREINIT:
+ AV *av;
+ STRLEN ret_len;
+ const U8 * ep;
+ CODE:
+ av = newAV();
+ av_push(av, newSViv(is_c9strict_utf8_string_loclen((U8 *) s, len, &ep, &ret_len)));
+ av_push(av, newSViv(ep - (U8 *) s));
+ av_push(av, newSVuv(ret_len));
+ RETVAL = av;
+ OUTPUT:
+ RETVAL
+
UV
test_toLOWER(UV ord)
CODE:
use warnings 'utf8';
local $SIG{__WARN__} = sub { push @warnings, @_ };
-# This set of tests looks for basic sanity, and lastly tests the bottom level
-# decode routine for the given code point. If the earlier tests for that code
-# point fail, that one probably will too. Malformations are tested in later
+my %restriction_types;
+
+$restriction_types{""}{'valid_strings'} = "";
+$restriction_types{"c9strict"}{'valid_strings'} = "";
+$restriction_types{"strict"}{'valid_strings'} = "";
+$restriction_types{"fits_in_31_bits"}{'valid_strings'} = "";
+
+# This set of tests looks for basic sanity, and lastly tests various routines
+# for the given code point. If the earlier tests for that code point fail,
+# the later ones probably will too. Malformations are tested in later
# segments of code.
for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) }
keys %code_points)
my $valid_under_strict = 1;
my $valid_under_c9strict = 1;
+ my $valid_for_fits_in_31_bits = 1;
if ($n > 0x10FFFF) {
$this_utf8_flags &= ~($UTF8_DISALLOW_SUPER|$UTF8_WARN_SUPER);
$valid_under_strict = 0;
if ($n > 2 ** 31 - 1) {
$this_utf8_flags &=
~($UTF8_DISALLOW_ABOVE_31_BIT|$UTF8_WARN_ABOVE_31_BIT);
+ $valid_for_fits_in_31_bits = 0;
}
}
elsif (($n >= 0xFDD0 && $n <= 0xFDEF) || ($n & 0xFFFE) == 0xFFFE) {
{
diag "The warnings were: " . join(", ", @warnings);
}
+
+ # Now append this code point to a string that we will test various
+ # versions of is_foo_utf8_string_bar on, and keep a count of how many code
+ # points are in it. All the code points in this loop are valid in Perl's
+ # extended UTF-8, but some are not valid under various restrictions. A
+ # string and count is kept separately that is entirely valid for each
+ # restriction. And, for each restriction, we note the first occurrence in
+ # the unrestricted string where we find something not in the restricted
+ # string.
+ $restriction_types{""}{'valid_strings'} .= $bytes;
+ $restriction_types{""}{'valid_counts'}++;
+
+ if ($valid_under_c9strict) {
+ $restriction_types{"c9strict"}{'valid_strings'} .= $bytes;
+ $restriction_types{"c9strict"}{'valid_counts'}++;
+ }
+ elsif (! exists $restriction_types{"c9strict"}{'first_invalid_offset'}) {
+ $restriction_types{"c9strict"}{'first_invalid_offset'}
+ = length $restriction_types{"c9strict"}{'valid_strings'};
+ $restriction_types{"c9strict"}{'first_invalid_count'}
+ = $restriction_types{"c9strict"}{'valid_counts'};
+ }
+
+ if ($valid_under_strict) {
+ $restriction_types{"strict"}{'valid_strings'} .= $bytes;
+ $restriction_types{"strict"}{'valid_counts'}++;
+ }
+ elsif (! exists $restriction_types{"strict"}{'first_invalid_offset'}) {
+ $restriction_types{"strict"}{'first_invalid_offset'}
+ = length $restriction_types{"strict"}{'valid_strings'};
+ $restriction_types{"strict"}{'first_invalid_count'}
+ = $restriction_types{"strict"}{'valid_counts'};
+ }
+
+ if ($valid_for_fits_in_31_bits) {
+ $restriction_types{"fits_in_31_bits"}{'valid_strings'} .= $bytes;
+ $restriction_types{"fits_in_31_bits"}{'valid_counts'}++;
+ }
+ elsif (! exists
+ $restriction_types{"fits_in_31_bits"}{'first_invalid_offset'})
+ {
+ $restriction_types{"fits_in_31_bits"}{'first_invalid_offset'}
+ = length $restriction_types{"fits_in_31_bits"}{'valid_strings'};
+ $restriction_types{"fits_in_31_bits"}{'first_invalid_count'}
+ = $restriction_types{"fits_in_31_bits"}{'valid_counts'};
+ }
+}
+
+my $I8c = (isASCII) ? "\x80" : "\xa0"; # A continuation byte
+my $cont_byte = I8_to_native($I8c);
+my $p = (isASCII) ? "\xe1\x80" : I8_to_native("\xE4\xA0"); # partial
+
+# The loop above tested the single or partial character functions/macros,
+# while building up strings to test the string functions, which we do now.
+
+for my $restriction (sort keys %restriction_types) {
+ use bytes;
+
+ for my $use_flags ("", "_flags") {
+
+ # For each restriction, we test it in both the is_foo_flags functions
+ # and the specially named foo function. But not if there isn't such a
+ # specially named function. Currently, this is the only tested
+ # restriction that doesn't have a specially named function
+ next if $use_flags eq "" && $restriction eq "fits_in_31_bits";
+
+ # Start building up the name of the function we will test.
+ my $name = "is_";
+
+ if (! $use_flags && $restriction ne "") {
+ $name .= $restriction . "_";
+ }
+ $name .= "utf8_string";
+
+ # We test each version of the function
+ for my $function ("_loclen", "_loc", "") {
+
+ # We test each function against
+ # a) valid input
+ # b) invalid input created by appending an out-of-place
+ # continuation character to the valid string
+ # c) invalid input created by appending a partial character
+ # d) invalid input created by calling a function that is
+ # expecting a restricted form of the input using the string
+ # that's valid when unrestricted
+ for my $error_type (0, $cont_byte, $p, $restriction) {
+ #diag "restriction=$restriction, use_flags=$use_flags, function=$function, error_type=" . display_bytes($error_type);
+
+ # If there is no restriction, the error type will be "",
+ # which is redundant with 0.
+ next if $error_type eq "";
+
+ my $this_name = "$name$function$use_flags";
+ my $bytes
+ = $restriction_types{$restriction}{'valid_strings'};
+ my $expected_offset = length $bytes;
+ my $expected_count
+ = $restriction_types{$restriction}{'valid_counts'};
+ my $test_name_suffix = "";
+
+ my $this_error_type = $error_type;
+ if ($this_error_type) {
+
+ # Appending a bare continuation byte or a partial
+ # character makes it invalid, but the character count
+ # and offset remain the same. But in the other cases,
+ # we have saved where the failures should occur, so
+ # use those.
+ if ($this_error_type eq $cont_byte || $this_error_type eq $p) {
+ $bytes .= $this_error_type;
+ if ($this_error_type eq $cont_byte) {
+ $test_name_suffix
+ = " for an unexpected continuation";
+ }
+ else {
+ $test_name_suffix
+ = " if ends with a partial character";
+ }
+ }
+ else {
+ $test_name_suffix
+ = " if contains forbidden code points";
+ if ($this_error_type eq "c9strict") {
+ $bytes = $restriction_types{""}{'valid_strings'};
+ $expected_offset
+ = $restriction_types{"c9strict"}
+ {'first_invalid_offset'};
+ $expected_count
+ = $restriction_types{"c9strict"}
+ {'first_invalid_count'};
+ }
+ elsif ($this_error_type eq "strict") {
+ $bytes = $restriction_types{""}{'valid_strings'};
+ $expected_offset
+ = $restriction_types{"strict"}
+ {'first_invalid_offset'};
+ $expected_count
+ = $restriction_types{"strict"}
+ {'first_invalid_count'};
+
+ }
+ elsif ($this_error_type eq "fits_in_31_bits") {
+ $bytes = $restriction_types{""}{'valid_strings'};
+ $expected_offset
+ = $restriction_types{"fits_in_31_bits"}
+ {'first_invalid_offset'};
+ $expected_count
+ = $restriction_types{"fits_in_31_bits"}
+ {'first_invalid_count'};
+ }
+ else {
+ fail("Internal test error: Unknown error type "
+ . "'$this_error_type'");
+ next;
+ }
+ }
+ }
+
+ my $length = length $bytes;
+ my $ret_ref;
+
+ my $test = "\$ret_ref = test_$this_name(\$bytes, $length";
+
+ # If using the _flags functions, we have to figure out what
+ # flags to pass. This is done to match the restriction.
+ if ($use_flags eq "_flags") {
+ if (! $restriction) {
+ $test .= ", 0"; # The flag
+
+ # Indicate the kind of flag in the test name.
+ $this_name .= "(0)";
+ }
+ else {
+ $this_name .= "($restriction)";
+ if ($restriction eq "c9strict") {
+ $test
+ .= ", $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE";
+ }
+ elsif ($restriction eq "strict") {
+ $test .= ", $UTF8_DISALLOW_ILLEGAL_INTERCHANGE";
+ }
+ elsif ($restriction eq "fits_in_31_bits") {
+ $test .= ", $UTF8_DISALLOW_ABOVE_31_BIT";
+ }
+ else {
+ fail("Internal test error: Unknown restriction "
+ . "'$restriction'");
+ next;
+ }
+ }
+ }
+ $test .= ")";
+
+ # Actually run the test
+ eval $test;
+ if ($@) {
+ fail($test);
+ diag $@;
+ next;
+ }
+
+ my $ret;
+ my $error_offset;
+ my $cp_count;
+
+ if ($function eq "") {
+ $ret = $ret_ref; # For plain function, there's only a
+ # single return value
+ }
+ else { # Otherwise, the multiple values come in an array.
+ $ret = shift @$ret_ref ;
+ $error_offset = shift @$ret_ref;
+ $cp_count = shift@$ret_ref if $function eq "_loclen";
+ }
+
+ if ($this_error_type) {
+ is($ret, 0,
+ "Verify $this_name is FALSE$test_name_suffix");
+ }
+ else {
+ unless(is($ret, 1,
+ "Verify $this_name is TRUE for valid input"
+ . "$test_name_suffix"))
+ {
+ diag("The bytes starting at offset"
+ . " $error_offset are"
+ . display_bytes(substr(
+ $restriction_types{$restriction}
+ {'valid_strings'},
+ $error_offset)));
+ next;
+ }
+ }
+
+ if ($function ne "") {
+ unless (is($error_offset, $expected_offset,
+ "\tAnd returns the correct offset"))
+ {
+ my $min = ($error_offset < $expected_offset)
+ ? $error_offset
+ : $expected_offset;
+ diag display_bytes(substr($bytes, $min));
+ }
+
+ if ($function eq '_loclen') {
+ is($cp_count, $expected_count,
+ "\tAnd returns the correct character count");
+ }
+ }
+ }
+ }
+ }
}
my $REPLACEMENT = 0xFFFD;
# Now test the malformations. All these raise category utf8 warnings.
-my $I8c = (isASCII) ? "\x80" : "\xa0"; # A continuation byte
my @malformations = (
[ "zero length string malformation", "", 0,
$UTF8_ALLOW_EMPTY, 0, 0,
use this option, that C<s> can't have embedded C<NUL> characters and has to
have a terminating C<NUL> byte).
-See also L</is_utf8_string>(), L</is_utf8_string_loclen>(), and
-L</is_utf8_string_loc>().
+See also
+C<L</is_utf8_string>>,
+C<L</is_utf8_string_flags>>,
+C<L</is_utf8_string_loc>>,
+C<L</is_utf8_string_loc_flags>>,
+C<L</is_utf8_string_loclen>>,
+C<L</is_utf8_string_loclen_flags>>,
+C<L</is_strict_utf8_string>>,
+C<L</is_strict_utf8_string_loc>>,
+C<L</is_strict_utf8_string_loclen>>,
+C<L</is_c9strict_utf8_string>>,
+C<L</is_c9strict_utf8_string_loc>>,
+and
+C<L</is_c9strict_utf8_string_loclen>>.
=cut
*/
This function considers Perl's extended UTF-8 to be valid. That means that
code points above Unicode, surrogates, and non-character code points are
-considered valid by this function.
+considered valid by this function. Use C<L</is_strict_utf8_string>>,
+C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
+code points are considered valid.
-See also L</is_utf8_invariant_string>(), L</is_utf8_string_loclen>(), and
-L</is_utf8_string_loc>().
+See also
+C<L</is_utf8_invariant_string>>,
+C<L</is_utf8_string_loc>>,
+C<L</is_utf8_string_loclen>>,
=cut
*/
}
/*
+=for apidoc is_strict_utf8_string
+
+Returns TRUE if the first C<len> bytes of string C<s> form a valid
+UTF-8-encoded string that is fully interchangeable by any application using
+Unicode rules; otherwise it returns FALSE. If C<len> is 0, it will be
+calculated using C<strlen(s)> (which means if you use this option, that C<s>
+can't have embedded C<NUL> characters and has to have a terminating C<NUL>
+byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
+
+This function returns FALSE for strings containing any
+code points above the Unicode max of 0x10FFFF, surrogate code points, or
+non-character code points.
+
+See also
+C<L</is_utf8_invariant_string>>,
+C<L</is_utf8_string>>,
+C<L</is_utf8_string_flags>>,
+C<L</is_utf8_string_loc>>,
+C<L</is_utf8_string_loc_flags>>,
+C<L</is_utf8_string_loclen>>,
+C<L</is_utf8_string_loclen_flags>>,
+C<L</is_strict_utf8_string_loc>>,
+C<L</is_strict_utf8_string_loclen>>,
+C<L</is_c9strict_utf8_string>>,
+C<L</is_c9strict_utf8_string_loc>>,
+and
+C<L</is_c9strict_utf8_string_loclen>>.
+
+=cut
+*/
+
+PERL_STATIC_INLINE bool
+S_is_strict_utf8_string(const U8 *s, const STRLEN len)
+{
+ const U8* const send = s + (len ? len : strlen((const char *)s));
+ const U8* x = s;
+
+ PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING;
+
+ while (x < send) {
+ const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
+ if (UNLIKELY(! cur_len)) {
+ return FALSE;
+ }
+ x += cur_len;
+ }
+
+ return TRUE;
+}
+
+/*
+=for apidoc is_c9strict_utf8_string
+
+Returns TRUE if the first C<len> bytes of string C<s> form a valid
+UTF-8-encoded string that conforms to
+L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
+otherwise it returns FALSE. If C<len> is 0, it will be calculated using
+C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
+C<NUL> characters and has to have a terminating C<NUL> byte). Note that all
+characters being ASCII constitute 'a valid UTF-8 string'.
+
+This function returns FALSE for strings containing any code points above the
+Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
+code points per
+L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
+
+See also
+C<L</is_utf8_invariant_string>>,
+C<L</is_utf8_string>>,
+C<L</is_utf8_string_flags>>,
+C<L</is_utf8_string_loc>>,
+C<L</is_utf8_string_loc_flags>>,
+C<L</is_utf8_string_loclen>>,
+C<L</is_utf8_string_loclen_flags>>,
+C<L</is_strict_utf8_string>>,
+C<L</is_strict_utf8_string_loc>>,
+C<L</is_strict_utf8_string_loclen>>,
+C<L</is_c9strict_utf8_string_loc>>,
+and
+C<L</is_c9strict_utf8_string_loclen>>.
+
+=cut
+*/
+
+PERL_STATIC_INLINE bool
+S_is_c9strict_utf8_string(const U8 *s, const STRLEN len)
+{
+ const U8* const send = s + (len ? len : strlen((const char *)s));
+ const U8* x = s;
+
+ PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING;
+
+ while (x < send) {
+ const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
+ if (UNLIKELY(! cur_len)) {
+ return FALSE;
+ }
+ x += cur_len;
+ }
+
+ return TRUE;
+}
+
+/* The above 3 functions could have been moved into the more general one just
+ * below, and made #defines that call it with the right 'flags'. They are
+ * currently kept separate to increase their chances of getting inlined */
+
+/*
+=for apidoc is_utf8_string_flags
+
+Returns TRUE if the first C<len> bytes of string C<s> form a valid
+UTF-8 string, subject to the restrictions imposed by C<flags>;
+returns FALSE otherwise. If C<len> is 0, it will be calculated
+using C<strlen(s)> (which means if you use this option, that C<s> can't have
+embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note
+that all characters being ASCII constitute 'a valid UTF-8 string'.
+
+If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
+C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
+as C<L</is_strict_utf8_string>>; and if C<flags> is
+C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
+C<L</is_c9strict_utf8_string>>. Otherwise C<flags> may be any
+combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
+C<L</utf8n_to_uvchr>>, with the same meanings.
+
+See also
+C<L</is_utf8_invariant_string>>,
+C<L</is_utf8_string>>,
+C<L</is_utf8_string_loc>>,
+C<L</is_utf8_string_loc_flags>>,
+C<L</is_utf8_string_loclen>>,
+C<L</is_utf8_string_loclen_flags>>,
+C<L</is_strict_utf8_string>>,
+C<L</is_strict_utf8_string_loc>>,
+C<L</is_strict_utf8_string_loclen>>,
+C<L</is_c9strict_utf8_string>>,
+C<L</is_c9strict_utf8_string_loc>>,
+and
+C<L</is_c9strict_utf8_string_loclen>>.
+
+=cut
+*/
+
+PERL_STATIC_INLINE bool
+S_is_utf8_string_flags(const U8 *s, const STRLEN len, const U32 flags)
+{
+ const U8* const send = s + (len ? len : strlen((const char *)s));
+ const U8* x = s;
+
+ PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
+ assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+ |UTF8_DISALLOW_ABOVE_31_BIT)));
+
+ if (flags == 0) {
+ return is_utf8_string(s, len);
+ }
+
+ if ((flags & ~UTF8_DISALLOW_ABOVE_31_BIT)
+ == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
+ {
+ return is_strict_utf8_string(s, len);
+ }
+
+ if ((flags & ~UTF8_DISALLOW_ABOVE_31_BIT)
+ == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
+ {
+ return is_c9strict_utf8_string(s, len);
+ }
+
+ while (x < send) {
+ STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
+ if (UNLIKELY(! cur_len)) {
+ return FALSE;
+ }
+ x += cur_len;
+ }
+
+ return TRUE;
+}
+
+/*
=for apidoc is_utf8_string_loc
Like C<L</is_utf8_string>> but stores the location of the failure (in the
case of "utf8ness failure") or the location C<s>+C<len> (in the case of
-"utf8ness success") in the C<ep>, and the number of UTF-8
+"utf8ness success") in the C<ep> pointer, and the number of UTF-8
encoded characters in the C<el> pointer.
See also C<L</is_utf8_string_loc>>.
}
/*
+
+=for apidoc is_strict_utf8_string_loc
+
+Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
+case of "utf8ness failure") or the location C<s>+C<len> (in the case of
+"utf8ness success") in the C<ep> pointer.
+
+See also C<L</is_strict_utf8_string_loclen>>.
+
+=cut
+*/
+
+#define is_strict_utf8_string_loc(s, len, ep) \
+ is_strict_utf8_string_loclen(s, len, ep, 0)
+
+/*
+
+=for apidoc is_strict_utf8_string_loclen
+
+Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
+case of "utf8ness failure") or the location C<s>+C<len> (in the case of
+"utf8ness success") in the C<ep> pointer, and the number of UTF-8
+encoded characters in the C<el> pointer.
+
+See also C<L</is_strict_utf8_string_loc>>.
+
+=cut
+*/
+
+PERL_STATIC_INLINE bool
+S_is_strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el)
+{
+ const U8* const send = s + (len ? len : strlen((const char *)s));
+ const U8* x = s;
+ STRLEN outlen = 0;
+
+ PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
+
+ while (x < send) {
+ const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
+ if (UNLIKELY(! cur_len)) {
+ break;
+ }
+ x += cur_len;
+ outlen++;
+ }
+
+ if (el)
+ *el = outlen;
+
+ if (ep) {
+ *ep = x;
+ }
+
+ return (x == send);
+}
+
+/*
+
+=for apidoc is_c9strict_utf8_string_loc
+
+Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
+the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
+"utf8ness success") in the C<ep> pointer.
+
+See also C<L</is_c9strict_utf8_string_loclen>>.
+
+=cut
+*/
+
+#define is_c9strict_utf8_string_loc(s, len, ep) \
+ is_c9strict_utf8_string_loclen(s, len, ep, 0)
+
+/*
+
+=for apidoc is_c9strict_utf8_string_loclen
+
+Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
+the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
+"utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
+characters in the C<el> pointer.
+
+See also C<L</is_c9strict_utf8_string_loc>>.
+
+=cut
+*/
+
+PERL_STATIC_INLINE bool
+S_is_c9strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el)
+{
+ const U8* const send = s + (len ? len : strlen((const char *)s));
+ const U8* x = s;
+ STRLEN outlen = 0;
+
+ PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
+
+ while (x < send) {
+ const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
+ if (UNLIKELY(! cur_len)) {
+ break;
+ }
+ x += cur_len;
+ outlen++;
+ }
+
+ if (el)
+ *el = outlen;
+
+ if (ep) {
+ *ep = x;
+ }
+
+ return (x == send);
+}
+
+/*
+
+=for apidoc is_utf8_string_loc_flags
+
+Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
+case of "utf8ness failure") or the location C<s>+C<len> (in the case of
+"utf8ness success") in the C<ep> pointer.
+
+See also C<L</is_utf8_string_loclen_flags>>.
+
+=cut
+*/
+
+#define is_utf8_string_loc_flags(s, len, ep, flags) \
+ is_utf8_string_loclen_flags(s, len, ep, 0, flags)
+
+
+/* The above 3 actual functions could have been moved into the more general one
+ * just below, and made #defines that call it with the right 'flags'. They are
+ * currently kept separate to increase their chances of getting inlined */
+
+/*
+
+=for apidoc is_utf8_string_loclen_flags
+
+Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
+case of "utf8ness failure") or the location C<s>+C<len> (in the case of
+"utf8ness success") in the C<ep> pointer, and the number of UTF-8
+encoded characters in the C<el> pointer.
+
+See also C<L</is_utf8_string_loc_flags>>.
+
+=cut
+*/
+
+PERL_STATIC_INLINE bool
+S_is_utf8_string_loclen_flags(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
+{
+ const U8* const send = s + (len ? len : strlen((const char *)s));
+ const U8* x = s;
+ STRLEN outlen = 0;
+
+ PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
+ assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+ |UTF8_DISALLOW_ABOVE_31_BIT)));
+
+ if (flags == 0) {
+ return is_utf8_string_loclen(s, len, ep, el);
+ }
+
+ if ((flags & ~UTF8_DISALLOW_ABOVE_31_BIT)
+ == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
+ {
+ return is_strict_utf8_string_loclen(s, len, ep, el);
+ }
+
+ if ((flags & ~UTF8_DISALLOW_ABOVE_31_BIT)
+ == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
+ {
+ return is_c9strict_utf8_string_loclen(s, len, ep, el);
+ }
+
+ while (x < send) {
+ const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
+ if (UNLIKELY(! cur_len)) {
+ break;
+ }
+ x += cur_len;
+ outlen++;
+ }
+
+ if (el)
+ *el = outlen;
+
+ if (ep) {
+ *ep = x;
+ }
+
+ return (x == send);
+}
+
+/*
=for apidoc utf8_distance
Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
__attribute__warn_unused_result__
__attribute__pure__; */
+PERL_STATIC_INLINE bool S_is_c9strict_utf8_string(const U8 *s, const STRLEN len)
+ __attribute__pure__;
+#define PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING \
+ assert(s)
+
+/* PERL_CALLCONV bool is_c9strict_utf8_string_loc(const U8 *s, const STRLEN len, const U8 **ep); */
+PERL_STATIC_INLINE bool S_is_c9strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el);
+#define PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN \
+ assert(s)
/* PERL_CALLCONV bool Perl_is_invariant_string(const U8* const s, const STRLEN len)
__attribute__warn_unused_result__
__attribute__pure__; */
PERL_CALLCONV I32 Perl_is_lvalue_sub(pTHX)
__attribute__warn_unused_result__;
+PERL_STATIC_INLINE bool S_is_strict_utf8_string(const U8 *s, const STRLEN len)
+ __attribute__pure__;
+#define PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING \
+ assert(s)
+
+/* PERL_CALLCONV bool is_strict_utf8_string_loc(const U8 *s, const STRLEN len, const U8 **ep); */
+PERL_STATIC_INLINE bool S_is_strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el);
+#define PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN \
+ assert(s)
PERL_CALLCONV bool Perl_is_uni_alnum(pTHX_ UV c)
__attribute__deprecated__
__attribute__warn_unused_result__
#define PERL_ARGS_ASSERT_IS_UTF8_STRING \
assert(s)
+PERL_STATIC_INLINE bool S_is_utf8_string_flags(const U8 *s, const STRLEN len, const U32 flags)
+ __attribute__pure__;
+#define PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS \
+ assert(s)
+
#ifndef NO_MATHOMS
PERL_CALLCONV bool Perl_is_utf8_string_loc(const U8 *s, const STRLEN len, const U8 **ep);
#define PERL_ARGS_ASSERT_IS_UTF8_STRING_LOC \
assert(s); assert(ep)
#endif
+/* PERL_CALLCONV bool is_utf8_string_loc_flags(const U8 *s, const STRLEN len, const U8 **ep, const U32 flags); */
PERL_STATIC_INLINE bool Perl_is_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el);
#define PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN \
assert(s)
+PERL_STATIC_INLINE bool S_is_utf8_string_loclen_flags(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el, const U32 flags);
+#define PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS \
+ assert(s)
PERL_CALLCONV bool Perl_is_utf8_upper(pTHX_ const U8 *p)
__attribute__deprecated__
__attribute__warn_unused_result__;
code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
and C<L</isUTF8_CHAR_flags>> for a more customized definition.
+Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
+C<L</is_strict_utf8_string_loclen>> to check entire strings.
+
=cut
*/
Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
C<L</isUTF8_CHAR_flags>> for a more customized definition.
+Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
+C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
+
=cut
*/
are likely to run somewhat faster than this more general one, as they can be
inlined into your code.
+Use L</is_utf8_string_flags>, L</is_utf8_string_loc_flags>, and
+L</is_utf8_string_loclen_flags> to check entire strings.
+
=cut
*/