X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/e6a4ffc3f7aa69cbf3e5e83518e40e529a34b75b..421d395278133929e92518281fbc264c377cb281:/inline.h?ds=sidebyside diff --git a/inline.h b/inline.h index 0087389..0d43656 100644 --- a/inline.h +++ b/inline.h @@ -1045,7 +1045,7 @@ This uses an adaptation of the table and algorithm given in http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive documentation of the original version. A copyright notice for the original version is given at the beginning of this file. The Perl adapation is -documented at the definition of perl_extended_utf8_dfa_tab[]. +documented at the definition of PL_extended_utf8_dfa_tab[]. */ @@ -1064,9 +1064,9 @@ S_isUTF8_CHAR(const U8 * const s0, const U8 * const e) * helper function for the other platforms. */ while (s < e && LIKELY(state != 1)) { - state = perl_extended_utf8_dfa_tab[256 + state = PL_extended_utf8_dfa_tab[256 + state - + perl_extended_utf8_dfa_tab[*s]]; + + PL_extended_utf8_dfa_tab[*s]]; if (state != 0) { s++; continue; @@ -1088,6 +1088,131 @@ S_isUTF8_CHAR(const U8 * const s0, const U8 * const e) /* +=for apidoc isSTRICT_UTF8_CHAR + +Evaluates to non-zero if the first few bytes of the string starting at C<s> and +looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some +Unicode code point completely acceptable for open interchange between all +applications; otherwise it evaluates to 0. If non-zero, the value gives how +many bytes starting at C<s> comprise the code point's representation. Any +bytes remaining before C<e>, but beyond the ones needed to form the first code +point in C<s>, are not examined. + +The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not +be a surrogate nor a non-character code point. Thus this excludes any code +point from Perl's extended UTF-8. + +This is used to efficiently decide if the next few bytes in C<s> are +legal Unicode-acceptable UTF-8 for a single character. 
+ +Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable +code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; +and C<L</isUTF8_CHAR_flags>> for a more customized definition. + +Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and +C<L</is_strict_utf8_string_loclen>> to check entire strings. + +=cut + +This uses an adaptation of the tables and algorithm given in +http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive +documentation of the original version. A copyright notice for the original +version is given at the beginning of this file. The Perl adaptation is +documented at the definition of PL_strict_utf8_dfa_tab[]. + +*/ + +PERL_STATIC_INLINE Size_t +S_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e) +{ + const U8 * s = s0; + UV state = 0; + + PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR; + + while (s < e && LIKELY(state != 1)) { + state = PL_strict_utf8_dfa_tab[256 + state + PL_strict_utf8_dfa_tab[*s]]; + + if (state != 0) { + s++; + continue; + } + + return s - s0 + 1; + } + +#ifndef EBCDIC + + /* The dfa above drops out for certain Hanguls; handle them specially */ + if (is_HANGUL_ED_utf8_safe(s0, e)) { + return 3; + } + +#endif + + return 0; +} + +/* + +=for apidoc Am|STRLEN|isC9_STRICT_UTF8_CHAR|const U8 *s|const U8 *e + +Evaluates to non-zero if the first few bytes of the string starting at C<s> and +looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some +Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero, +the value gives how many bytes starting at C<s> comprise the code point's +representation. Any bytes remaining before C<e>, but beyond the ones needed to +form the first code point in C<s>, are not examined. + +The largest acceptable code point is the Unicode maximum 0x10FFFF. This +differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character +code points. This corresponds to +L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>, +which said that non-character code points are merely discouraged rather than +completely forbidden in open interchange. See +L<perlunicode/Noncharacter code points>. + +Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and +C<L</isUTF8_CHAR_flags>> for a more customized definition. 
+ +Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and +C<L</is_c9strict_utf8_string_loclen>> to check entire strings. + +=cut + +This uses an adaptation of the tables and algorithm given in +http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive +documentation of the original version. A copyright notice for the original +version is given at the beginning of this file. The Perl adaptation is +documented at the definition of PL_c9_utf8_dfa_tab[]. + +*/ + +PERL_STATIC_INLINE Size_t +S_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e) +{ + const U8 * s = s0; + UV state = 0; + + PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR; + + while (s < e && LIKELY(state != 1)) { + state = PL_c9_utf8_dfa_tab[256 + state + PL_c9_utf8_dfa_tab[*s]]; + + if (state != 0) { + s++; + continue; + } + + return s - s0 + 1; + } + + return 0; +} + +/* + =for apidoc is_strict_utf8_string_loc Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the @@ -1472,9 +1597,9 @@ Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start) assert(off <= 0); while (off++ && s > start) { - s--; - while (UTF8_IS_CONTINUATION(*s) && s > start) + do { s--; + } while (UTF8_IS_CONTINUATION(*s) && s > start); } GCC_DIAG_IGNORE(-Wcast-qual) @@ -1672,12 +1797,12 @@ S_utf8n_to_uvchr_msgs(const U8 *s, * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides * comprehensive documentation of the original version. A copyright notice * for the original version is given at the beginning of this file. The - * Perl adapation is documented at the definition of strict_utf8_dfa_tab[]. + * Perl adaptation is documented at the definition of PL_strict_utf8_dfa_tab[]. */ const U8 * const s0 = s; const U8 * send = s0 + curlen; - UV uv; + UV uv = 0; /* The 0 silences some stupid compilers */ UV state = 0; PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS; @@ -1688,12 +1813,12 @@ S_utf8n_to_uvchr_msgs(const U8 *s, * cases. */ while (s < send && LIKELY(state != 1)) { - UV type = PL_strict_utf8_dfa_tab[*s]; + UV type = PL_strict_utf8_dfa_tab[*s]; uv = (state == 0) ? 
((0xff >> type) & NATIVE_UTF8_TO_I8(*s)) : UTF8_ACCUMULATE(uv, *s); - state = strict_utf8_dfa_tab[256 + state + type]; + state = PL_strict_utf8_dfa_tab[256 + state + type]; if (state != 0) { s++;