X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/8ed185f9c23fbf6e4b07abbe602e675889d07d4a..82651abe60a8c1ca1f9eb6aae0202ecfd34bfecf:/inline.h diff --git a/inline.h b/inline.h index d0e60e4..7d81da6 100644 --- a/inline.h +++ b/inline.h @@ -549,8 +549,8 @@ S__variant_byte_number(PERL_UINTMAX_T word) * x..xx01..1 Subtract 1, turns all the trailing zeros into 1's and * the 1 just to their left into a 0; the remainder is * untouched - * 0..0011..1 The xor with x..xx10..0 clears that remainder, sets - * bottom to all 1 + * 0..0011..1 The xor with the original, x..xx10..0, clears that + * remainder, sets the bottom to all 1 * 0..0100..0 Add 1 to clear the word except for the bit in 's' * * Another method is to do 'word &= -word'; but it generates a compiler @@ -1013,7 +1013,7 @@ Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) /* -=for apidoc Am|STRLEN|isUTF8_CHAR|const U8 *s|const U8 *e +=for apidoc isUTF8_CHAR Evaluates to non-zero if the first few bytes of the string starting at C and looking no further than S> are well-formed UTF-8, as extended by Perl, @@ -1022,7 +1022,7 @@ value gives how many bytes starting at C comprise the code point's representation. Any bytes remaining before C, but beyond the ones needed to form the first code point in C, are not examined. -The code point can be any that will fit in a UV on this machine, using Perl's +The code point can be any that will fit in an IV on this machine, using Perl's extension to official UTF-8 to represent those higher than the Unicode maximum of 0x10FFFF. That means that this macro is used to efficiently decide if the next few bytes in C is legal UTF-8 for a single character. @@ -1036,12 +1036,8 @@ code points; and C> for a more customized definition. Use C>, C>, and C> to check entire strings. -Note that it is deprecated to use code points higher than what will fit in an -IV. This macro does not raise any warnings for such code points, treating them -as valid. - -Note also that a UTF-8 INVARIANT character (i.e. ASCII on non-EBCDIC machines) -is a valid UTF-8 character. +Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC +machines) is a valid UTF-8 character. =cut @@ -1049,7 +1045,7 @@ This uses an adaptation of the table and algorithm given in http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive documentation of the original version. A copyright notice for the original version is given at the beginning of this file. The Perl adapation is -documented at the definition of perl_extended_utf8_dfa_tab[]. +documented at the definition of PL_extended_utf8_dfa_tab[]. */ @@ -1068,9 +1064,9 @@ S_isUTF8_CHAR(const U8 * const s0, const U8 * const e) * helper function for the other platforms. */ while (s < e && LIKELY(state != 1)) { - state = perl_extended_utf8_dfa_tab[256 + state = PL_extended_utf8_dfa_tab[256 + state - + perl_extended_utf8_dfa_tab[*s]]; + + PL_extended_utf8_dfa_tab[*s]]; if (state != 0) { s++; continue; @@ -1092,6 +1088,131 @@ S_isUTF8_CHAR(const U8 * const s0, const U8 * const e) /* +=for apidoc isSTRICT_UTF8_CHAR + +Evaluates to non-zero if the first few bytes of the string starting at C and +looking no further than S> are well-formed UTF-8 that represents some +Unicode code point completely acceptable for open interchange between all +applications; otherwise it evaluates to 0. If non-zero, the value gives how +many bytes starting at C comprise the code point's representation. Any +bytes remaining before C, but beyond the ones needed to form the first code +point in C, are not examined. + +The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not +be a surrogate nor a non-character code point. Thus this excludes any code +point from Perl's extended UTF-8. + +This is used to efficiently decide if the next few bytes in C is +legal Unicode-acceptable UTF-8 for a single character. + +Use C> to use the L definition of allowable +code points; C> to check for Perl's extended UTF-8; +and C> for a more customized definition. + +Use C>, C>, and +C> to check entire strings. + +=cut + +This uses an adaptation of the tables and algorithm given in +http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive +documentation of the original version. A copyright notice for the original +version is given at the beginning of this file. The Perl adapation is +documented at the definition of strict_extended_utf8_dfa_tab[]. + +*/ + +PERL_STATIC_INLINE Size_t +S_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e) +{ + const U8 * s = s0; + UV state = 0; + + PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR; + + while (s < e && LIKELY(state != 1)) { + state = PL_strict_utf8_dfa_tab[256 + state + PL_strict_utf8_dfa_tab[*s]]; + + if (state != 0) { + s++; + continue; + } + + return s - s0 + 1; + } + +#ifndef EBCDIC + + /* The dfa above drops out for certain Hanguls; handle them specially */ + if (is_HANGUL_ED_utf8_safe(s0, e)) { + return 3; + } + +#endif + + return 0; +} + +/* + +=for apidoc isC9_STRICT_UTF8_CHAR + +Evaluates to non-zero if the first few bytes of the string starting at C and +looking no further than S> are well-formed UTF-8 that represents some +Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero, +the value gives how many bytes starting at C comprise the code point's +representation. Any bytes remaining before C, but beyond the ones needed to +form the first code point in C, are not examined. + +The largest acceptable code point is the Unicode maximum 0x10FFFF. This +differs from C> only in that it accepts non-character +code points. This corresponds to +L. +which said that non-character code points are merely discouraged rather than +completely forbidden in open interchange. See +L. + +Use C> to check for Perl's extended UTF-8; and +C> for a more customized definition. + +Use C>, C>, and +C> to check entire strings. + +=cut + +This uses an adaptation of the tables and algorithm given in +http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive +documentation of the original version. A copyright notice for the original +version is given at the beginning of this file. The Perl adapation is +documented at the definition of PL_c9_utf8_dfa_tab[]. + +*/ + +PERL_STATIC_INLINE Size_t +S_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e) +{ + const U8 * s = s0; + UV state = 0; + + PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR; + + while (s < e && LIKELY(state != 1)) { + state = PL_c9_utf8_dfa_tab[256 + state + PL_c9_utf8_dfa_tab[*s]]; + + if (state != 0) { + s++; + continue; + } + + return s - s0 + 1; + } + + return 0; +} + +/* + =for apidoc is_strict_utf8_string_loc Like C> but stores the location of the failure (in the @@ -1476,9 +1597,9 @@ Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start) assert(off <= 0); while (off++ && s > start) { - s--; - while (UTF8_IS_CONTINUATION(*s) && s > start) + do { s--; + } while (UTF8_IS_CONTINUATION(*s) && s > start); } GCC_DIAG_IGNORE(-Wcast-qual) @@ -1658,12 +1779,96 @@ S_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s, || is_utf8_valid_partial_char_flags(*ep, s + len, flags); } +PERL_STATIC_INLINE UV +S_utf8n_to_uvchr_msgs(const U8 *s, + STRLEN curlen, + STRLEN *retlen, + const U32 flags, + U32 * errors, + AV ** msgs) +{ + /* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the + * simple cases, and, if necessary calls a helper function to deal with the + * more complex ones. Almost all well-formed non-problematic code points + * are considered simple, so that it's unlikely that the helper function + * will need to be called. + * + * This is an adaptation of the tables and algorithm given in + * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides + * comprehensive documentation of the original version. A copyright notice + * for the original version is given at the beginning of this file. The + * Perl adapation is documented at the definition of PL_strict_utf8_dfa_tab[]. + */ + + const U8 * const s0 = s; + const U8 * send = s0 + curlen; + UV uv = 0; /* The 0 silences some stupid compilers */ + UV state = 0; + + PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS; + + /* This dfa is fast. If it accepts the input, it was for a well-formed, + * non-problematic code point, which can be returned immediately. + * Otherwise we call a helper function to figure out the more complicated + * cases. */ + + while (s < send && LIKELY(state != 1)) { + UV type = PL_strict_utf8_dfa_tab[*s]; + + uv = (state == 0) + ? ((0xff >> type) & NATIVE_UTF8_TO_I8(*s)) + : UTF8_ACCUMULATE(uv, *s); + state = PL_strict_utf8_dfa_tab[256 + state + type]; + + if (state != 0) { + s++; + continue; + } + + if (retlen) { + *retlen = s - s0 + 1; + } + if (errors) { + *errors = 0; + } + if (msgs) { + *msgs = NULL; + } + + return uv; + } + + /* Here is potentially problematic. Use the full mechanism */ + return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags, errors, msgs); +} + +PERL_STATIC_INLINE UV +S__utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) +{ + PERL_ARGS_ASSERT__UTF8_TO_UVCHR_BUF; + + assert(s < send); + + if (! ckWARN_d(WARN_UTF8)) { + return utf8n_to_uvchr(s, send - s, retlen, + (UTF8_ALLOW_ANY & ~UTF8_ALLOW_EMPTY)); + } + else { + UV ret = utf8n_to_uvchr(s, send - s, retlen, 0); + if (retlen && ret == 0 && *s != '\0') { + *retlen = (STRLEN) -1; + } + + return ret; + } +} + /* ------------------------------- perl.h ----------------------------- */ /* =head1 Miscellaneous Functions -=for apidoc AiR|bool|is_safe_syscall|const char *pv|STRLEN len|const char *what|const char *op_name +=for apidoc is_safe_syscall Test that the given C doesn't contain any internal C characters. If it does, set C to C, optionally warn, and return FALSE. @@ -1729,6 +1934,40 @@ S_should_warn_nl(const char *pv) { #endif +#if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C) + +PERL_STATIC_INLINE bool +S_lossless_NV_to_IV(const NV nv, IV *ivp) +{ + /* This function determines if the input NV 'nv' may be converted without + * loss of data to an IV. If not, it returns FALSE taking no other action. + * But if it is possible, it does the conversion, returning TRUE, and + * storing the converted result in '*ivp' */ + + PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV; + +# if defined(Perl_isnan) + + if (UNLIKELY(Perl_isnan(nv))) { + return FALSE; + } + +# endif + + if (UNLIKELY(nv < IV_MIN) || UNLIKELY(nv > IV_MAX)) { + return FALSE; + } + + if ((IV) nv != nv) { + return FALSE; + } + + *ivp = (IV) nv; + return TRUE; +} + +#endif + /* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */ #define MAX_CHARSET_NAME_LENGTH 2 @@ -2163,10 +2402,10 @@ Perl_foldEQ(const char *s1, const char *s2, I32 len) PERL_STATIC_INLINE I32 Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len) { - /* Compare non-utf8 using Unicode (Latin1) semantics. Does not work on - * MICRO_SIGN, LATIN_SMALL_LETTER_SHARP_S, nor - * LATIN_SMALL_LETTER_Y_WITH_DIAERESIS, and does not check for these. Nor - * does it check that the strings each have at least 'len' characters */ + /* Compare non-UTF-8 using Unicode (Latin1) semantics. Works on all folds + * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and + * does not check for this. Nor does it check that the strings each have + * at least 'len' characters. */ const U8 *a = (const U8 *)s1; const U8 *b = (const U8 *)s2;