X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/e6a4ffc3f7aa69cbf3e5e83518e40e529a34b75b..421d395278133929e92518281fbc264c377cb281:/inline.h?ds=sidebyside

diff --git a/inline.h b/inline.h
index 0087389..0d43656 100644
--- a/inline.h
+++ b/inline.h
@@ -1045,7 +1045,7 @@ This uses an adaptation of the table and algorithm given in
 http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
 documentation of the original version.  A copyright notice for the original
 version is given at the beginning of this file.  The Perl adapation is
-documented at the definition of perl_extended_utf8_dfa_tab[].
+documented at the definition of PL_extended_utf8_dfa_tab[].
 
 */
 
@@ -1064,9 +1064,9 @@ S_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
      * helper function for the other platforms. */
 
     while (s < e && LIKELY(state != 1)) {
-        state = perl_extended_utf8_dfa_tab[256
+        state = PL_extended_utf8_dfa_tab[256
                                          + state
-                                         + perl_extended_utf8_dfa_tab[*s]];
+                                         + PL_extended_utf8_dfa_tab[*s]];
         if (state != 0) {
             s++;
             continue;
@@ -1088,6 +1088,131 @@ S_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
 
 /*
 
+=for apidoc isSTRICT_UTF8_CHAR
+
+Evaluates to non-zero if the first few bytes of the string starting at C<s> and
+looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
+Unicode code point completely acceptable for open interchange between all
+applications; otherwise it evaluates to 0.  If non-zero, the value gives how
+many bytes starting at C<s> comprise the code point's representation.  Any
+bytes remaining before C<e>, but beyond the ones needed to form the first code
+point in C<s>, are not examined.
+
+The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
+be a surrogate nor a non-character code point.  Thus this excludes any code
+point from Perl's extended UTF-8.
+
+This is used to efficiently decide if the next few bytes in C<s> is
+legal Unicode-acceptable UTF-8 for a single character.
+
+Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
+#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
+code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
+and C<L</isUTF8_CHAR_flags>> for a more customized definition.
+
+Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
+C<L</is_strict_utf8_string_loclen>> to check entire strings.
+
+=cut
+
+This uses an adaptation of the tables and algorithm given in
+http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
+documentation of the original version.  A copyright notice for the original
+version is given at the beginning of this file.  The Perl adapation is
+documented at the definition of strict_extended_utf8_dfa_tab[].
+
+*/
+
+PERL_STATIC_INLINE Size_t
+S_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
+{
+    const U8 * s = s0;
+    UV state = 0;
+
+    PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
+
+    while (s < e && LIKELY(state != 1)) {
+        state = PL_strict_utf8_dfa_tab[256 + state + PL_strict_utf8_dfa_tab[*s]];
+
+        if (state != 0) {
+            s++;
+            continue;
+        }
+
+        return s - s0 + 1;
+    }
+
+#ifndef EBCDIC
+
+    /* The dfa above drops out for certain Hanguls; handle them specially */
+    if (is_HANGUL_ED_utf8_safe(s0, e)) {
+        return 3;
+    }
+
+#endif
+
+    return 0;
+}
+
+/*
+
+=for apidoc Am|STRLEN|isC9_STRICT_UTF8_CHAR|const U8 *s|const U8 *e
+
+Evaluates to non-zero if the first few bytes of the string starting at C<s> and
+looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
+Unicode non-surrogate code point; otherwise it evaluates to 0.  If non-zero,
+the value gives how many bytes starting at C<s> comprise the code point's
+representation.  Any bytes remaining before C<e>, but beyond the ones needed to
+form the first code point in C<s>, are not examined.
+
+The largest acceptable code point is the Unicode maximum 0x10FFFF.  This
+differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
+code points.  This corresponds to
+L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
+which said that non-character code points are merely discouraged rather than
+completely forbidden in open interchange.  See
+L<perlunicode/Noncharacter code points>.
+
+Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
+C<L</isUTF8_CHAR_flags>> for a more customized definition.
+
+Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
+C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
+
+=cut
+
+This uses an adaptation of the tables and algorithm given in
+http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
+documentation of the original version.  A copyright notice for the original
+version is given at the beginning of this file.  The Perl adapation is
+documented at the definition of PL_c9_utf8_dfa_tab[].
+
+*/
+
+PERL_STATIC_INLINE Size_t
+S_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
+{
+    const U8 * s = s0;
+    UV state = 0;
+
+    PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
+
+    while (s < e && LIKELY(state != 1)) {
+        state = PL_c9_utf8_dfa_tab[256 + state + PL_c9_utf8_dfa_tab[*s]];
+
+        if (state != 0) {
+            s++;
+            continue;
+        }
+
+        return s - s0 + 1;
+    }
+
+    return 0;
+}
+
+/*
+
 =for apidoc is_strict_utf8_string_loc
 
 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
@@ -1472,9 +1597,9 @@ Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
     assert(off <= 0);
 
     while (off++ && s > start) {
-        s--;
-        while (UTF8_IS_CONTINUATION(*s) && s > start)
+        do {
             s--;
+        } while (UTF8_IS_CONTINUATION(*s) && s > start);
     }
     
     GCC_DIAG_IGNORE(-Wcast-qual)
@@ -1672,12 +1797,12 @@ S_utf8n_to_uvchr_msgs(const U8 *s,
      * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
      * comprehensive documentation of the original version.  A copyright notice
      * for the original version is given at the beginning of this file.  The
-     * Perl adapation is documented at the definition of strict_utf8_dfa_tab[].
+     * Perl adapation is documented at the definition of PL_strict_utf8_dfa_tab[].
      */
 
     const U8 * const s0 = s;
     const U8 * send = s0 + curlen;
-    UV uv;
+    UV uv = 0;      /* The 0 silences some stupid compilers */
     UV state = 0;
 
     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
@@ -1688,12 +1813,12 @@ S_utf8n_to_uvchr_msgs(const U8 *s,
      * cases. */
 
     while (s < send && LIKELY(state != 1)) {
-        UV type = strict_utf8_dfa_tab[*s];
+        UV type = PL_strict_utf8_dfa_tab[*s];
 
         uv = (state == 0)
              ?  ((0xff >> type) & NATIVE_UTF8_TO_I8(*s))
              : UTF8_ACCUMULATE(uv, *s);
-        state = strict_utf8_dfa_tab[256 + state + type];
+        state = PL_strict_utf8_dfa_tab[256 + state + type];
 
         if (state != 0) {
             s++;