This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add API Unicode handling functions
authorKarl Williamson <khw@cpan.org>
Mon, 19 Sep 2016 15:59:32 +0000 (09:59 -0600)
committerKarl Williamson <khw@cpan.org>
Mon, 26 Sep 2016 04:24:20 +0000 (22:24 -0600)
These functions are all extensions of the is_utf8_string_foo()
functions, that restrict the UTF-8 recognized as valid in various ways.
There are named ones for the two definitions that Unicode makes, and
foo_flags ones for more custom restrictions.

The named ones are implemented as tries, while the flags ones provide
complete generality

embed.fnc
embed.h
ext/XS-APItest/APItest.pm
ext/XS-APItest/APItest.xs
ext/XS-APItest/t/utf8.t
inline.h
proto.h
utf8.h

index 2954eda..1ea0c76 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -742,8 +742,31 @@ AmnpdRP    |bool   |is_invariant_string|NN const U8* const s|const STRLEN len
 AnpdD  |STRLEN |is_utf8_char   |NN const U8 *s
 Abmnpd |STRLEN |is_utf8_char_buf|NN const U8 *buf|NN const U8 *buf_end
 AnipdP |bool   |is_utf8_string |NN const U8 *s|const STRLEN len
-Anpdmb |bool   |is_utf8_string_loc|NN const U8 *s|const STRLEN len|NN const U8 **ep
-Anipd  |bool   |is_utf8_string_loclen|NN const U8 *s|const STRLEN len|NULLOK const U8 **ep|NULLOK STRLEN *el
+AnidP  |bool   |is_utf8_string_flags                                       \
+               |NN const U8 *s|const STRLEN len|const U32 flags
+AnidP  |bool   |is_strict_utf8_string|NN const U8 *s|const STRLEN len
+AnidP  |bool   |is_c9strict_utf8_string|NN const U8 *s|const STRLEN len
+Anpdmb |bool   |is_utf8_string_loc                                         \
+               |NN const U8 *s|const STRLEN len|NN const U8 **ep
+Andm   |bool   |is_utf8_string_loc_flags                                   \
+               |NN const U8 *s|const STRLEN len|NN const U8 **ep           \
+               |const U32 flags
+Andm   |bool   |is_strict_utf8_string_loc                                  \
+               |NN const U8 *s|const STRLEN len|NN const U8 **ep
+Andm   |bool   |is_c9strict_utf8_string_loc                                \
+               |NN const U8 *s|const STRLEN len|NN const U8 **ep
+Anipd  |bool   |is_utf8_string_loclen                                      \
+               |NN const U8 *s|const STRLEN len|NULLOK const U8 **ep       \
+               |NULLOK STRLEN *el
+Anid   |bool   |is_utf8_string_loclen_flags                                \
+               |NN const U8 *s|const STRLEN len|NULLOK const U8 **ep       \
+               |NULLOK STRLEN *el|const U32 flags
+Anid   |bool   |is_strict_utf8_string_loclen                               \
+               |NN const U8 *s|const STRLEN len|NULLOK const U8 **ep       \
+               |NULLOK STRLEN *el
+Anid   |bool   |is_c9strict_utf8_string_loclen                             \
+               |NN const U8 *s|const STRLEN len|NULLOK const U8 **ep       \
+               |NULLOK STRLEN *el
 AmndP  |bool   |is_utf8_valid_partial_char                                 \
                |NN const U8 * const s|NN const U8 * const e
 AnidP  |bool   |is_utf8_valid_partial_char_flags                           \
diff --git a/embed.h b/embed.h
index 50a19a4..6485397 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define intro_my()             Perl_intro_my(aTHX)
 #define isALNUM_lazy(a)                Perl_isALNUM_lazy(aTHX_ a)
 #define isIDFIRST_lazy(a)      Perl_isIDFIRST_lazy(aTHX_ a)
+#define is_c9strict_utf8_string        S_is_c9strict_utf8_string
+#define is_c9strict_utf8_string_loclen S_is_c9strict_utf8_string_loclen
 #define is_lvalue_sub()                Perl_is_lvalue_sub(aTHX)
+#define is_strict_utf8_string  S_is_strict_utf8_string
+#define is_strict_utf8_string_loclen   S_is_strict_utf8_string_loclen
 #define is_uni_alnum(a)                Perl_is_uni_alnum(aTHX_ a)
 #define is_uni_alnum_lc(a)     Perl_is_uni_alnum_lc(aTHX_ a)
 #define is_uni_alnumc(a)       Perl_is_uni_alnumc(aTHX_ a)
 #define is_utf8_punct(a)       Perl_is_utf8_punct(aTHX_ a)
 #define is_utf8_space(a)       Perl_is_utf8_space(aTHX_ a)
 #define is_utf8_string         Perl_is_utf8_string
+#define is_utf8_string_flags   S_is_utf8_string_flags
 #define is_utf8_string_loclen  Perl_is_utf8_string_loclen
+#define is_utf8_string_loclen_flags    S_is_utf8_string_loclen_flags
 #define is_utf8_upper(a)       Perl_is_utf8_upper(aTHX_ a)
 #define is_utf8_valid_partial_char_flags       S_is_utf8_valid_partial_char_flags
 #define is_utf8_xdigit(a)      Perl_is_utf8_xdigit(aTHX_ a)
index 30658c2..64a25f1 100644 (file)
@@ -5,7 +5,7 @@ use strict;
 use warnings;
 use Carp;
 
-our $VERSION = '0.85';
+our $VERSION = '0.86';
 
 require XSLoader;
 
index 954bb60..5078ce9 100644 (file)
@@ -5351,12 +5351,152 @@ test_isC9_STRICT_UTF8_CHAR(char *s, STRLEN len)
 IV
 test_is_utf8_valid_partial_char_flags(char *s, STRLEN len, U32 flags)
     CODE:
-        /* RETVAL should be bool, but making it IV allows us to test it
-         * returning 0 or 1 */
+        /* RETVAL should be bool (here and in tests below), but making it IV
+         * allows us to test it returning 0 or 1 */
         RETVAL = is_utf8_valid_partial_char_flags((U8 *) s, (U8 *) s + len, flags);
     OUTPUT:
         RETVAL
 
+IV
+test_is_utf8_string(char *s, STRLEN len)
+    CODE:
+        RETVAL = is_utf8_string((U8 *) s, len);
+    OUTPUT:
+        RETVAL
+
+AV *
+test_is_utf8_string_loc(char *s, STRLEN len)
+    PREINIT:
+        AV *av;
+        const U8 * ep;
+    CODE:
+        av = newAV();
+        av_push(av, newSViv(is_utf8_string_loc((U8 *) s, len, &ep)));
+        av_push(av, newSViv(ep - (U8 *) s));
+        RETVAL = av;
+    OUTPUT:
+        RETVAL
+
+AV *
+test_is_utf8_string_loclen(char *s, STRLEN len)
+    PREINIT:
+        AV *av;
+        STRLEN ret_len;
+        const U8 * ep;
+    CODE:
+        av = newAV();
+        av_push(av, newSViv(is_utf8_string_loclen((U8 *) s, len, &ep, &ret_len)));
+        av_push(av, newSViv(ep - (U8 *) s));
+        av_push(av, newSVuv(ret_len));
+        RETVAL = av;
+    OUTPUT:
+        RETVAL
+
+IV
+test_is_utf8_string_flags(char *s, STRLEN len, U32 flags)
+    CODE:
+        RETVAL = is_utf8_string_flags((U8 *) s, len, flags);
+    OUTPUT:
+        RETVAL
+
+AV *
+test_is_utf8_string_loc_flags(char *s, STRLEN len, U32 flags)
+    PREINIT:
+        AV *av;
+        const U8 * ep;
+    CODE:
+        av = newAV();
+        av_push(av, newSViv(is_utf8_string_loc_flags((U8 *) s, len, &ep, flags)));
+        av_push(av, newSViv(ep - (U8 *) s));
+        RETVAL = av;
+    OUTPUT:
+        RETVAL
+
+AV *
+test_is_utf8_string_loclen_flags(char *s, STRLEN len, U32 flags)
+    PREINIT:
+        AV *av;
+        STRLEN ret_len;
+        const U8 * ep;
+    CODE:
+        av = newAV();
+        av_push(av, newSViv(is_utf8_string_loclen_flags((U8 *) s, len, &ep, &ret_len, flags)));
+        av_push(av, newSViv(ep - (U8 *) s));
+        av_push(av, newSVuv(ret_len));
+        RETVAL = av;
+    OUTPUT:
+        RETVAL
+
+IV
+test_is_strict_utf8_string(char *s, STRLEN len)
+    CODE:
+        RETVAL = is_strict_utf8_string((U8 *) s, len);
+    OUTPUT:
+        RETVAL
+
+AV *
+test_is_strict_utf8_string_loc(char *s, STRLEN len)
+    PREINIT:
+        AV *av;
+        const U8 * ep;
+    CODE:
+        av = newAV();
+        av_push(av, newSViv(is_strict_utf8_string_loc((U8 *) s, len, &ep)));
+        av_push(av, newSViv(ep - (U8 *) s));
+        RETVAL = av;
+    OUTPUT:
+        RETVAL
+
+AV *
+test_is_strict_utf8_string_loclen(char *s, STRLEN len)
+    PREINIT:
+        AV *av;
+        STRLEN ret_len;
+        const U8 * ep;
+    CODE:
+        av = newAV();
+        av_push(av, newSViv(is_strict_utf8_string_loclen((U8 *) s, len, &ep, &ret_len)));
+        av_push(av, newSViv(ep - (U8 *) s));
+        av_push(av, newSVuv(ret_len));
+        RETVAL = av;
+    OUTPUT:
+        RETVAL
+
+IV
+test_is_c9strict_utf8_string(char *s, STRLEN len)
+    CODE:
+        RETVAL = is_c9strict_utf8_string((U8 *) s, len);
+    OUTPUT:
+        RETVAL
+
+AV *
+test_is_c9strict_utf8_string_loc(char *s, STRLEN len)
+    PREINIT:
+        AV *av;
+        const U8 * ep;
+    CODE:
+        av = newAV();
+        av_push(av, newSViv(is_c9strict_utf8_string_loc((U8 *) s, len, &ep)));
+        av_push(av, newSViv(ep - (U8 *) s));
+        RETVAL = av;
+    OUTPUT:
+        RETVAL
+
+AV *
+test_is_c9strict_utf8_string_loclen(char *s, STRLEN len)
+    PREINIT:
+        AV *av;
+        STRLEN ret_len;
+        const U8 * ep;
+    CODE:
+        av = newAV();
+        av_push(av, newSViv(is_c9strict_utf8_string_loclen((U8 *) s, len, &ep, &ret_len)));
+        av_push(av, newSViv(ep - (U8 *) s));
+        av_push(av, newSVuv(ret_len));
+        RETVAL = av;
+    OUTPUT:
+        RETVAL
+
 UV
 test_toLOWER(UV ord)
     CODE:
index 973188e..51d5ab4 100644 (file)
@@ -353,9 +353,16 @@ my @warnings;
 use warnings 'utf8';
 local $SIG{__WARN__} = sub { push @warnings, @_ };
 
-# This set of tests looks for basic sanity, and lastly tests the bottom level
-# decode routine for the given code point.  If the earlier tests for that code
-# point fail, that one probably will too.  Malformations are tested in later
+my %restriction_types;
+
+$restriction_types{""}{'valid_strings'} = "";
+$restriction_types{"c9strict"}{'valid_strings'} = "";
+$restriction_types{"strict"}{'valid_strings'} = "";
+$restriction_types{"fits_in_31_bits"}{'valid_strings'} = "";
+
+# This set of tests looks for basic sanity, and lastly tests various routines
+# for the given code point.  If the earlier tests for that code point fail,
+# the later ones probably will too.  Malformations are tested in later
 # segments of code.
 for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) }
           keys %code_points)
@@ -490,6 +497,7 @@ for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) }
 
     my $valid_under_strict = 1;
     my $valid_under_c9strict = 1;
+    my $valid_for_fits_in_31_bits = 1;
     if ($n > 0x10FFFF) {
         $this_utf8_flags &= ~($UTF8_DISALLOW_SUPER|$UTF8_WARN_SUPER);
         $valid_under_strict = 0;
@@ -497,6 +505,7 @@ for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) }
         if ($n > 2 ** 31 - 1) {
             $this_utf8_flags &=
                             ~($UTF8_DISALLOW_ABOVE_31_BIT|$UTF8_WARN_ABOVE_31_BIT);
+            $valid_for_fits_in_31_bits = 0;
         }
     }
     elsif (($n >= 0xFDD0 && $n <= 0xFDEF) || ($n & 0xFFFE) == 0xFFFE) {
@@ -675,12 +684,263 @@ for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) }
     {
         diag "The warnings were: " . join(", ", @warnings);
     }
+
+    # Now append this code point to a string that we will test various
+    # versions of is_foo_utf8_string_bar on, and keep a count of how many code
+    # points are in it.  All the code points in this loop are valid in Perl's
+    # extended UTF-8, but some are not valid under various restrictions.  A
+    # string and count is kept separately that is entirely valid for each
+    # restriction.  And, for each restriction, we note the first occurrence in
+    # the unrestricted string where we find something not in the restricted
+    # string.
+    $restriction_types{""}{'valid_strings'} .= $bytes;
+    $restriction_types{""}{'valid_counts'}++;
+
+    if ($valid_under_c9strict) {
+        $restriction_types{"c9strict"}{'valid_strings'} .= $bytes;
+        $restriction_types{"c9strict"}{'valid_counts'}++;
+    }
+    elsif (! exists $restriction_types{"c9strict"}{'first_invalid_offset'}) {
+        $restriction_types{"c9strict"}{'first_invalid_offset'}
+                    = length $restriction_types{"c9strict"}{'valid_strings'};
+        $restriction_types{"c9strict"}{'first_invalid_count'}
+                            = $restriction_types{"c9strict"}{'valid_counts'};
+    }
+
+    if ($valid_under_strict) {
+        $restriction_types{"strict"}{'valid_strings'} .= $bytes;
+        $restriction_types{"strict"}{'valid_counts'}++;
+    }
+    elsif (! exists $restriction_types{"strict"}{'first_invalid_offset'}) {
+        $restriction_types{"strict"}{'first_invalid_offset'}
+                        = length $restriction_types{"strict"}{'valid_strings'};
+        $restriction_types{"strict"}{'first_invalid_count'}
+                                = $restriction_types{"strict"}{'valid_counts'};
+    }
+
+    if ($valid_for_fits_in_31_bits) {
+        $restriction_types{"fits_in_31_bits"}{'valid_strings'} .= $bytes;
+        $restriction_types{"fits_in_31_bits"}{'valid_counts'}++;
+    }
+    elsif (! exists
+                $restriction_types{"fits_in_31_bits"}{'first_invalid_offset'})
+    {
+        $restriction_types{"fits_in_31_bits"}{'first_invalid_offset'}
+                = length $restriction_types{"fits_in_31_bits"}{'valid_strings'};
+        $restriction_types{"fits_in_31_bits"}{'first_invalid_count'}
+                        = $restriction_types{"fits_in_31_bits"}{'valid_counts'};
+    }
+}
+
+my $I8c = (isASCII) ? "\x80" : "\xa0";    # A continuation byte
+my $cont_byte = I8_to_native($I8c);
+my $p = (isASCII) ? "\xe1\x80" : I8_to_native("\xE4\xA0");  # partial
+
+# The loop above tested the single or partial character functions/macros,
+# while building up strings to test the string functions, which we do now.
+
+for my $restriction (sort keys %restriction_types) {
+    use bytes;
+
+    for my $use_flags ("", "_flags") {
+
+        # For each restriction, we test it in both the is_foo_flags functions
+        # and the specially named foo function.  But not if there isn't such a
+        # specially named function.  Currently, this is the only tested
+        # restriction that doesn't have a specially named function
+        next if $use_flags eq "" && $restriction eq "fits_in_31_bits";
+
+        # Start building up the name of the function we will test.
+        my $name = "is_";
+
+        if (! $use_flags  && $restriction ne "") {
+            $name .= $restriction . "_";
+        }
+        $name .= "utf8_string";
+
+            # We test each version of the function
+            for my $function ("_loclen", "_loc", "") {
+
+                # We test each function against
+                #   a) valid input
+                #   b) invalid input created by appending an out-of-place
+                #      continuation character to the valid string
+                #   c) invalid input created by appending a partial character
+                #   d) invalid input created by calling a function that is
+                #      expecting a restricted form of the input using the string
+                #      that's valid when unrestricted
+                for my $error_type (0, $cont_byte, $p, $restriction) {
+                    #diag "restriction=$restriction, use_flags=$use_flags, function=$function, error_type=" . display_bytes($error_type);
+
+                    # If there is no restriction, the error type will be "",
+                    # which is redundant with 0.
+                    next if $error_type eq "";
+
+                    my $this_name = "$name$function$use_flags";
+                    my $bytes
+                            = $restriction_types{$restriction}{'valid_strings'};
+                    my $expected_offset = length $bytes;
+                    my $expected_count
+                            = $restriction_types{$restriction}{'valid_counts'};
+                    my $test_name_suffix = "";
+
+                    my $this_error_type = $error_type;
+                    if ($this_error_type) {
+
+                        # Appending a bare continuation byte or a partial
+                        # character makes it invalid, but the character count
+                        # and offset remain the same.  But in the other cases,
+                        # we have saved where the failures should occur, so
+                        # use those.
+                        if ($this_error_type eq $cont_byte || $this_error_type eq $p) {
+                            $bytes .= $this_error_type;
+                            if ($this_error_type eq $cont_byte) {
+                                $test_name_suffix
+                                            = " for an unexpected continuation";
+                            }
+                            else {
+                                $test_name_suffix
+                                        = " if ends with a partial character";
+                            }
+                        }
+                        else {
+                            $test_name_suffix
+                                        = " if contains forbidden code points";
+                            if ($this_error_type eq "c9strict") {
+                                $bytes = $restriction_types{""}{'valid_strings'};
+                                $expected_offset
+                                 = $restriction_types{"c9strict"}
+                                                     {'first_invalid_offset'};
+                                $expected_count
+                                  = $restriction_types{"c9strict"}
+                                                      {'first_invalid_count'};
+                            }
+                            elsif ($this_error_type eq "strict") {
+                                $bytes = $restriction_types{""}{'valid_strings'};
+                                $expected_offset
+                                  = $restriction_types{"strict"}
+                                                      {'first_invalid_offset'};
+                                $expected_count
+                                  = $restriction_types{"strict"}
+                                                      {'first_invalid_count'};
+
+                            }
+                            elsif ($this_error_type eq "fits_in_31_bits") {
+                                $bytes = $restriction_types{""}{'valid_strings'};
+                                $expected_offset
+                                  = $restriction_types{"fits_in_31_bits"}
+                                                      {'first_invalid_offset'};
+                                $expected_count
+                                    = $restriction_types{"fits_in_31_bits"}
+                                                        {'first_invalid_count'};
+                            }
+                            else {
+                                fail("Internal test error: Unknown error type "
+                                . "'$this_error_type'");
+                                next;
+                            }
+                        }
+                    }
+
+                    my $length = length $bytes;
+                    my $ret_ref;
+
+                    my $test = "\$ret_ref = test_$this_name(\$bytes, $length";
+
+                    # If using the _flags functions, we have to figure out what
+                    # flags to pass.  This is done to match the restriction.
+                    if ($use_flags eq "_flags") {
+                        if (! $restriction) {
+                            $test .= ", 0";     # The flag
+
+                            # Indicate the kind of flag in the test name.
+                            $this_name .= "(0)";
+                        }
+                        else {
+                            $this_name .= "($restriction)";
+                            if ($restriction eq "c9strict") {
+                                $test
+                                  .= ", $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE";
+                            }
+                            elsif ($restriction eq "strict") {
+                                $test .= ", $UTF8_DISALLOW_ILLEGAL_INTERCHANGE";
+                            }
+                            elsif ($restriction eq "fits_in_31_bits") {
+                                $test .= ", $UTF8_DISALLOW_ABOVE_31_BIT";
+                            }
+                            else {
+                                fail("Internal test error: Unknown restriction "
+                                . "'$restriction'");
+                                next;
+                            }
+                        }
+                    }
+                    $test .= ")";
+
+                    # Actually run the test
+                    eval $test;
+                    if ($@) {
+                        fail($test);
+                        diag $@;
+                        next;
+                    }
+
+                    my $ret;
+                    my $error_offset;
+                    my $cp_count;
+
+                    if ($function eq "") {
+                        $ret = $ret_ref;    # For plain function, there's only a
+                                            # single return value
+                    }
+                    else {  # Otherwise, the multiple values come in an array.
+                        $ret = shift @$ret_ref ;
+                        $error_offset = shift @$ret_ref;
+                        $cp_count = shift@$ret_ref if $function eq "_loclen";
+                    }
+
+                    if ($this_error_type) {
+                        is($ret, 0,
+                           "Verify $this_name is FALSE$test_name_suffix");
+                    }
+                    else {
+                        unless(is($ret, 1,
+                                  "Verify $this_name is TRUE for valid input"
+                                . "$test_name_suffix"))
+                        {
+                            diag("The bytes starting at offset"
+                               . " $error_offset are"
+                               . display_bytes(substr(
+                                          $restriction_types{$restriction}
+                                                            {'valid_strings'},
+                                          $error_offset)));
+                            next;
+                        }
+                    }
+
+                    if ($function ne "") {
+                        unless (is($error_offset, $expected_offset,
+                                   "\tAnd returns the correct offset"))
+                        {
+                            my $min = ($error_offset < $expected_offset)
+                                    ? $error_offset
+                                    : $expected_offset;
+                            diag display_bytes(substr($bytes, $min));
+                        }
+
+                        if ($function eq '_loclen') {
+                            is($cp_count, $expected_count,
+                               "\tAnd returns the correct character count");
+                        }
+                    }
+                }
+            }
+    }
 }
 
 my $REPLACEMENT = 0xFFFD;
 
 # Now test the malformations.  All these raise category utf8 warnings.
-my $I8c = (isASCII) ? "\x80" : "\xa0";    # A continuation byte
 my @malformations = (
     [ "zero length string malformation", "", 0,
         $UTF8_ALLOW_EMPTY, 0, 0,
index 388daba..1fc9065 100644 (file)
--- a/inline.h
+++ b/inline.h
@@ -334,8 +334,20 @@ If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
 use this option, that C<s> can't have embedded C<NUL> characters and has to
 have a terminating C<NUL> byte).
 
-See also L</is_utf8_string>(), L</is_utf8_string_loclen>(), and
-L</is_utf8_string_loc>().
+See also
+C<L</is_utf8_string>>,
+C<L</is_utf8_string_flags>>,
+C<L</is_utf8_string_loc>>,
+C<L</is_utf8_string_loc_flags>>,
+C<L</is_utf8_string_loclen>>,
+C<L</is_utf8_string_loclen_flags>>,
+C<L</is_strict_utf8_string>>,
+C<L</is_strict_utf8_string_loc>>,
+C<L</is_strict_utf8_string_loclen>>,
+C<L</is_c9strict_utf8_string>>,
+C<L</is_c9strict_utf8_string_loc>>,
+and
+C<L</is_c9strict_utf8_string_loclen>>.
 
 =cut
 */
@@ -367,10 +379,14 @@ byte).  Note that all characters being ASCII constitute 'a valid UTF-8 string'.
 
 This function considers Perl's extended UTF-8 to be valid.  That means that
 code points above Unicode, surrogates, and non-character code points are
-considered valid by this function.
+considered valid by this function.  Use C<L</is_strict_utf8_string>>,
+C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
+code points are considered valid.
 
-See also L</is_utf8_invariant_string>(), L</is_utf8_string_loclen>(), and
-L</is_utf8_string_loc>().
+See also
+C<L</is_utf8_invariant_string>>,
+C<L</is_utf8_string_loc>>,
+C<L</is_utf8_string_loclen>>,
 
 =cut
 */
@@ -398,6 +414,187 @@ Perl_is_utf8_string(const U8 *s, const STRLEN len)
 }
 
 /*
+=for apidoc is_strict_utf8_string
+
+Returns TRUE if the first C<len> bytes of string C<s> form a valid
+UTF-8-encoded string that is fully interchangeable by any application using
+Unicode rules; otherwise it returns FALSE.  If C<len> is 0, it will be
+calculated using C<strlen(s)> (which means if you use this option, that C<s>
+can't have embedded C<NUL> characters and has to have a terminating C<NUL>
+byte).  Note that all characters being ASCII constitute 'a valid UTF-8 string'.
+
+This function returns FALSE for strings containing any
+code points above the Unicode max of 0x10FFFF, surrogate code points, or
+non-character code points.
+
+See also
+C<L</is_utf8_invariant_string>>,
+C<L</is_utf8_string>>,
+C<L</is_utf8_string_flags>>,
+C<L</is_utf8_string_loc>>,
+C<L</is_utf8_string_loc_flags>>,
+C<L</is_utf8_string_loclen>>,
+C<L</is_utf8_string_loclen_flags>>,
+C<L</is_strict_utf8_string_loc>>,
+C<L</is_strict_utf8_string_loclen>>,
+C<L</is_c9strict_utf8_string>>,
+C<L</is_c9strict_utf8_string_loc>>,
+and
+C<L</is_c9strict_utf8_string_loclen>>.
+
+=cut
+*/
+
+PERL_STATIC_INLINE bool
+S_is_strict_utf8_string(const U8 *s, const STRLEN len)
+{
+    const U8* const send = s + (len ? len : strlen((const char *)s));
+    const U8* x = s;
+
+    PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING;
+
+    while (x < send) {
+        const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
+        if (UNLIKELY(! cur_len)) {
+            return FALSE;
+        }
+        x += cur_len;
+    }
+
+    return TRUE;
+}
+
+/*
+=for apidoc is_c9strict_utf8_string
+
+Returns TRUE if the first C<len> bytes of string C<s> form a valid
+UTF-8-encoded string that conforms to
+L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
+otherwise it returns FALSE.  If C<len> is 0, it will be calculated using
+C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
+C<NUL> characters and has to have a terminating C<NUL> byte).  Note that all
+characters being ASCII constitute 'a valid UTF-8 string'.
+
+This function returns FALSE for strings containing any code points above the
+Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
+code points per
+L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
+
+See also
+C<L</is_utf8_invariant_string>>,
+C<L</is_utf8_string>>,
+C<L</is_utf8_string_flags>>,
+C<L</is_utf8_string_loc>>,
+C<L</is_utf8_string_loc_flags>>,
+C<L</is_utf8_string_loclen>>,
+C<L</is_utf8_string_loclen_flags>>,
+C<L</is_strict_utf8_string>>,
+C<L</is_strict_utf8_string_loc>>,
+C<L</is_strict_utf8_string_loclen>>,
+C<L</is_c9strict_utf8_string_loc>>,
+and
+C<L</is_c9strict_utf8_string_loclen>>.
+
+=cut
+*/
+
+PERL_STATIC_INLINE bool
+S_is_c9strict_utf8_string(const U8 *s, const STRLEN len)
+{
+    const U8* const send = s + (len ? len : strlen((const char *)s));
+    const U8* x = s;
+
+    PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING;
+
+    while (x < send) {
+        const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
+        if (UNLIKELY(! cur_len)) {
+            return FALSE;
+        }
+        x += cur_len;
+    }
+
+    return TRUE;
+}
+
+/* The above 3 functions could have been moved into the more general one just
+ * below, and made #defines that call it with the right 'flags'.  They are
+ * currently kept separate to increase their chances of getting inlined */
+
+/*
+=for apidoc is_utf8_string_flags
+
+Returns TRUE if the first C<len> bytes of string C<s> form a valid
+UTF-8 string, subject to the restrictions imposed by C<flags>;
+returns FALSE otherwise.  If C<len> is 0, it will be calculated
+using C<strlen(s)> (which means if you use this option, that C<s> can't have
+embedded C<NUL> characters and has to have a terminating C<NUL> byte).  Note
+that all characters being ASCII constitute 'a valid UTF-8 string'.
+
+If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
+C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
+as C<L</is_strict_utf8_string>>; and if C<flags> is
+C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
+C<L</is_c9strict_utf8_string>>.  Otherwise C<flags> may be any
+combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
+C<L</utf8n_to_uvchr>>, with the same meanings.
+
+See also
+C<L</is_utf8_invariant_string>>,
+C<L</is_utf8_string>>,
+C<L</is_utf8_string_loc>>,
+C<L</is_utf8_string_loc_flags>>,
+C<L</is_utf8_string_loclen>>,
+C<L</is_utf8_string_loclen_flags>>,
+C<L</is_strict_utf8_string>>,
+C<L</is_strict_utf8_string_loc>>,
+C<L</is_strict_utf8_string_loclen>>,
+C<L</is_c9strict_utf8_string>>,
+C<L</is_c9strict_utf8_string_loc>>,
+and
+C<L</is_c9strict_utf8_string_loclen>>.
+
+=cut
+*/
+
+PERL_STATIC_INLINE bool
+S_is_utf8_string_flags(const U8 *s, const STRLEN len, const U32 flags)
+{
+    const U8* const send = s + (len ? len : strlen((const char *)s));
+    const U8* x = s;
+
+    PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
+    assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+                          |UTF8_DISALLOW_ABOVE_31_BIT)));
+
+    if (flags == 0) {
+        return is_utf8_string(s, len);
+    }
+
+    if ((flags & ~UTF8_DISALLOW_ABOVE_31_BIT)
+                                        == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
+    {
+        return is_strict_utf8_string(s, len);
+    }
+
+    if ((flags & ~UTF8_DISALLOW_ABOVE_31_BIT)
+                                       == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
+    {
+        return is_c9strict_utf8_string(s, len);
+    }
+
+    while (x < send) {
+        STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
+        if (UNLIKELY(! cur_len)) {
+            return FALSE;
+        }
+        x += cur_len;
+    }
+
+    return TRUE;
+}
+
+/*
 
 =for apidoc is_utf8_string_loc
 
@@ -418,7 +615,7 @@ See also C<L</is_utf8_string_loclen>>.
 
 Like C<L</is_utf8_string>> but stores the location of the failure (in the
 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
-"utf8ness success") in the C<ep>, and the number of UTF-8
+"utf8ness success") in the C<ep> pointer, and the number of UTF-8
 encoded characters in the C<el> pointer.
 
 See also C<L</is_utf8_string_loc>>.
@@ -455,6 +652,203 @@ Perl_is_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN
 }
 
 /*
+
+=for apidoc is_strict_utf8_string_loc
+
+Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
+case of "utf8ness failure") or the location C<s>+C<len> (in the case of
+"utf8ness success") in the C<ep> pointer.
+
+See also C<L</is_strict_utf8_string_loclen>>.
+
+=cut
+*/
+
+#define is_strict_utf8_string_loc(s, len, ep)                               \
+                                is_strict_utf8_string_loclen(s, len, ep, 0)
+
+/*
+
+=for apidoc is_strict_utf8_string_loclen
+
+Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
+case of "utf8ness failure") or the location C<s>+C<len> (in the case of
+"utf8ness success") in the C<ep> pointer, and the number of UTF-8
+encoded characters in the C<el> pointer.
+
+See also C<L</is_strict_utf8_string_loc>>.
+
+=cut
+*/
+
+PERL_STATIC_INLINE bool
+S_is_strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el)
+{
+    const U8* const send = s + (len ? len : strlen((const char *)s));
+    const U8* x = s;
+    STRLEN outlen = 0;
+
+    PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
+
+    while (x < send) {
+        const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
+        if (UNLIKELY(! cur_len)) {
+            break;
+        }
+        x += cur_len;
+        outlen++;
+    }
+
+    if (el)
+        *el = outlen;
+
+    if (ep) {
+        *ep = x;
+    }
+
+    return (x == send);
+}
+
+/*
+
+=for apidoc is_c9strict_utf8_string_loc
+
+Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
+the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
+"utf8ness success") in the C<ep> pointer.
+
+See also C<L</is_c9strict_utf8_string_loclen>>.
+
+=cut
+*/
+
+#define is_c9strict_utf8_string_loc(s, len, ep)                                    \
+                            is_c9strict_utf8_string_loclen(s, len, ep, 0)
+
+/*
+
+=for apidoc is_c9strict_utf8_string_loclen
+
+Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
+the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
+"utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
+characters in the C<el> pointer.
+
+See also C<L</is_c9strict_utf8_string_loc>>.
+
+=cut
+*/
+
+PERL_STATIC_INLINE bool
+S_is_c9strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el)
+{
+    const U8* const send = s + (len ? len : strlen((const char *)s));
+    const U8* x = s;
+    STRLEN outlen = 0;
+
+    PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
+
+    while (x < send) {
+        const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
+        if (UNLIKELY(! cur_len)) {
+            break;
+        }
+        x += cur_len;
+        outlen++;
+    }
+
+    if (el)
+        *el = outlen;
+
+    if (ep) {
+        *ep = x;
+    }
+
+    return (x == send);
+}
+
+/*
+
+=for apidoc is_utf8_string_loc_flags
+
+Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
+case of "utf8ness failure") or the location C<s>+C<len> (in the case of
+"utf8ness success") in the C<ep> pointer.
+
+See also C<L</is_utf8_string_loclen_flags>>.
+
+=cut
+*/
+
+#define is_utf8_string_loc_flags(s, len, ep, flags)                         \
+                        is_utf8_string_loclen_flags(s, len, ep, 0, flags)
+
+
+/* The above 3 actual functions could have been moved into the more general one
+ * just below, and made #defines that call it with the right 'flags'.  They are
+ * currently kept separate to increase their chances of getting inlined */
+
+/*
+
+=for apidoc is_utf8_string_loclen_flags
+
+Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
+case of "utf8ness failure") or the location C<s>+C<len> (in the case of
+"utf8ness success") in the C<ep> pointer, and the number of UTF-8
+encoded characters in the C<el> pointer.
+
+See also C<L</is_utf8_string_loc_flags>>.
+
+=cut
+*/
+
+PERL_STATIC_INLINE bool
+S_is_utf8_string_loclen_flags(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
+{
+    const U8* const send = s + (len ? len : strlen((const char *)s));
+    const U8* x = s;
+    STRLEN outlen = 0;
+
+    PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
+    assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+                          |UTF8_DISALLOW_ABOVE_31_BIT)));
+
+    if (flags == 0) {
+        return is_utf8_string_loclen(s, len, ep, el);
+    }
+
+    if ((flags & ~UTF8_DISALLOW_ABOVE_31_BIT)
+                                        == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
+    {
+        return is_strict_utf8_string_loclen(s, len, ep, el);
+    }
+
+    if ((flags & ~UTF8_DISALLOW_ABOVE_31_BIT)
+                                    == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
+    {
+        return is_c9strict_utf8_string_loclen(s, len, ep, el);
+    }
+
+    while (x < send) {
+        const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
+        if (UNLIKELY(! cur_len)) {
+            break;
+        }
+        x += cur_len;
+        outlen++;
+    }
+
+    if (el)
+        *el = outlen;
+
+    if (ep) {
+        *ep = x;
+    }
+
+    return (x == send);
+}
+
+/*
 =for apidoc utf8_distance
 
 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
diff --git a/proto.h b/proto.h
index 7c2a821..6e5461d 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -1328,6 +1328,15 @@ PERL_CALLCONV bool       Perl_isIDFIRST_lazy(pTHX_ const char* p)
                        __attribute__warn_unused_result__
                        __attribute__pure__; */
 
+PERL_STATIC_INLINE bool        S_is_c9strict_utf8_string(const U8 *s, const STRLEN len)
+                       __attribute__pure__;
+#define PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING       \
+       assert(s)
+
+/* PERL_CALLCONV bool  is_c9strict_utf8_string_loc(const U8 *s, const STRLEN len, const U8 **ep); */
+PERL_STATIC_INLINE bool        S_is_c9strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el);
+#define PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN        \
+       assert(s)
 /* PERL_CALLCONV bool  Perl_is_invariant_string(const U8* const s, const STRLEN len)
                        __attribute__warn_unused_result__
                        __attribute__pure__; */
@@ -1335,6 +1344,15 @@ PERL_CALLCONV bool       Perl_isIDFIRST_lazy(pTHX_ const char* p)
 PERL_CALLCONV I32      Perl_is_lvalue_sub(pTHX)
                        __attribute__warn_unused_result__;
 
+PERL_STATIC_INLINE bool        S_is_strict_utf8_string(const U8 *s, const STRLEN len)
+                       __attribute__pure__;
+#define PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING \
+       assert(s)
+
+/* PERL_CALLCONV bool  is_strict_utf8_string_loc(const U8 *s, const STRLEN len, const U8 **ep); */
+PERL_STATIC_INLINE bool        S_is_strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el);
+#define PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN  \
+       assert(s)
 PERL_CALLCONV bool     Perl_is_uni_alnum(pTHX_ UV c)
                        __attribute__deprecated__
                        __attribute__warn_unused_result__
@@ -1614,14 +1632,23 @@ PERL_STATIC_INLINE bool Perl_is_utf8_string(const U8 *s, const STRLEN len)
 #define PERL_ARGS_ASSERT_IS_UTF8_STRING        \
        assert(s)
 
+PERL_STATIC_INLINE bool        S_is_utf8_string_flags(const U8 *s, const STRLEN len, const U32 flags)
+                       __attribute__pure__;
+#define PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS  \
+       assert(s)
+
 #ifndef NO_MATHOMS
 PERL_CALLCONV bool     Perl_is_utf8_string_loc(const U8 *s, const STRLEN len, const U8 **ep);
 #define PERL_ARGS_ASSERT_IS_UTF8_STRING_LOC    \
        assert(s); assert(ep)
 #endif
+/* PERL_CALLCONV bool  is_utf8_string_loc_flags(const U8 *s, const STRLEN len, const U8 **ep, const U32 flags); */
 PERL_STATIC_INLINE bool        Perl_is_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el);
 #define PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN \
        assert(s)
+PERL_STATIC_INLINE bool        S_is_utf8_string_loclen_flags(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el, const U32 flags);
+#define PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS   \
+       assert(s)
 PERL_CALLCONV bool     Perl_is_utf8_upper(pTHX_ const U8 *p)
                        __attribute__deprecated__
                        __attribute__warn_unused_result__;
diff --git a/utf8.h b/utf8.h
index 89024f8..77eb63d 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -1025,6 +1025,9 @@ Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
 code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
 and C<L</isUTF8_CHAR_flags>> for a more customized definition.
 
+Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
+C<L</is_strict_utf8_string_loclen>> to check entire strings.
+
 =cut
 */
 
@@ -1059,6 +1062,9 @@ L<perlunicode/Noncharacter code points>.
 Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
 C<L</isUTF8_CHAR_flags>> for a more customized definition.
 
+Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
+C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
+
 =cut
 */
 
@@ -1095,6 +1101,9 @@ The three alternative macros are for the most commonly needed validations; they
 are likely to run somewhat faster than this more general one, as they can be
 inlined into your code.
 
+Use L</is_utf8_string_flags>, L</is_utf8_string_loc_flags>, and
+L</is_utf8_string_loclen_flags> to check entire strings.
+
 =cut
 */