X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/c9cd936b8f828f1c71873b459ceaec2300fec6c7..5149e17891a64b8b39440a943d065e188e327008:/inline.h diff --git a/inline.h b/inline.h index 309d74f..1abee4f 100644 --- a/inline.h +++ b/inline.h @@ -244,17 +244,13 @@ S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp) /* ------------------------------- handy.h ------------------------------- */ /* saves machine code for a common noreturn idiom typically used in Newx*() */ -#ifdef GCC_DIAG_PRAGMA -GCC_DIAG_IGNORE(-Wunused-function) /* Intentionally left semicolonless. */ -#endif +GCC_DIAG_IGNORE_DECL(-Wunused-function); static void S_croak_memory_wrap(void) { Perl_croak_nocontext("%s",PL_memory_wrap); } -#ifdef GCC_DIAG_PRAGMA -GCC_DIAG_RESTORE /* Intentionally left semicolonless. */ -#endif +GCC_DIAG_RESTORE_DECL; /* ------------------------------- utf8.h ------------------------------- */ @@ -387,58 +383,68 @@ S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep) send = s + len; #ifndef EBCDIC - /* Try to get the widest word on this platform */ -# ifdef HAS_LONG_LONG -# define PERL_WORDCAST unsigned long long -# define PERL_WORDSIZE LONGLONGSIZE -# else -# define PERL_WORDCAST UV -# define PERL_WORDSIZE UVSIZE -# endif -# if PERL_WORDSIZE == 4 -# define PERL_VARIANTS_WORD_MASK 0x80808080 -# define PERL_WORD_BOUNDARY_MASK 0x3 -# elif PERL_WORDSIZE == 8 -# define PERL_VARIANTS_WORD_MASK 0x8080808080808080 -# define PERL_WORD_BOUNDARY_MASK 0x7 -# else -# error Unexpected word size -# endif +/* This looks like 0x010101... */ +#define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF) + +/* This looks like 0x808080... */ +#define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80) +#define PERL_WORDSIZE sizeof(PERL_COUNT_MULTIPLIER) +#define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1) + +/* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by + * or'ing together the lowest bits of 'x'. Hopefully the final term gets + * optimized out completely on a 32-bit system, and its mask gets optimized out + * on a 64-bit system */ +#define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \ + | ( PTR2nat(x) >> 1) \ + | ( ( (PTR2nat(x) \ + & PERL_WORD_BOUNDARY_MASK) >> 2)))) + + /* Do the word-at-a-time iff there is at least one usable full word. That + * means that after advancing to a word boundary, there still is at least a + * full word left. The number of bytes needed to advance is 'wordsize - + * offset' unless offset is 0. */ + if ((STRLEN) (send - x) >= PERL_WORDSIZE + + /* This term is wordsize if subword; 0 if not */ + + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x) + + /* 'offset' */ + - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK)) + { - /* Process per-byte until reach word boundary. XXX This loop could be - * eliminated if we knew that this platform had fast unaligned reads */ - while (x < send && (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK)) { - if (! UTF8_IS_INVARIANT(*x)) { - if (ep) { - *ep = x; - } + /* Process per-byte until reach word boundary. XXX This loop could be + * eliminated if we knew that this platform had fast unaligned reads */ + while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) { + if (! UTF8_IS_INVARIANT(*x)) { + if (ep) { + *ep = x; + } - return FALSE; + return FALSE; + } + x++; } - x++; - } - /* Process per-word as long as we have at least a full word left */ - while (x + PERL_WORDSIZE <= send) { - if ((* (PERL_WORDCAST *) x) & PERL_VARIANTS_WORD_MASK) { + /* Here, we know we have at least one full word to process. Process + * per-word as long as we have at least a full word left */ + do { + if ((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) { - /* Found a variant. Just return if caller doesn't want its exact - * position */ - if (! ep) { - return FALSE; - } + /* Found a variant. Just return if caller doesn't want its + * exact position */ + if (! ep) { + return FALSE; + } - /* Otherwise fall into final loop to find which byte it is */ - break; - } - x += PERL_WORDSIZE; + /* Otherwise fall into final loop to find which byte it is */ + break; + } + x += PERL_WORDSIZE; + } while (x + PERL_WORDSIZE <= send); } -# undef PERL_WORDCAST -# undef PERL_WORDSIZE -# undef PERL_WORD_BOUNDARY_MASK -# undef PERL_VARIANTS_WORD_MASK #endif /* Process per-byte */ @@ -457,6 +463,89 @@ S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep) return TRUE; } +#if defined(PERL_CORE) || defined(PERL_EXT) + +/* +=for apidoc variant_under_utf8_count + +This function looks at the sequence of bytes between C and C, which are +assumed to be encoded in ASCII/Latin1, and returns how many of them would +change should the string be translated into UTF-8. Due to the nature of UTF-8, +each of these would occupy two bytes instead of the single one in the input +string. Thus, this function returns the precise number of bytes the string +would expand by when translated to UTF-8. + +Unlike most of the other functions that have C in their name, the input +to this function is NOT a UTF-8-encoded string. The function name is slightly +I to emphasize this. + +This function is internal to Perl because khw thinks that any XS code that +would want this is probably operating too close to the internals. Presenting a +valid use case could change that. + +See also +C> +and +C>, + +=cut + +*/ + +PERL_STATIC_INLINE Size_t +S_variant_under_utf8_count(const U8* const s, const U8* const e) +{ + const U8* x = s; + Size_t count = 0; + + PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT; + +# ifndef EBCDIC + + if ((STRLEN) (e - x) >= PERL_WORDSIZE + + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x) + - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK)) + { + + /* Process per-byte until reach word boundary. XXX This loop could be + * eliminated if we knew that this platform had fast unaligned reads */ + while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) { + count += ! UTF8_IS_INVARIANT(*x++); + } + + /* Process per-word as long as we have at least a full word left */ + do { /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an + explanation of how this works */ + count += ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7) + * PERL_COUNT_MULTIPLIER) + >> ((PERL_WORDSIZE - 1) * CHARBITS); + x += PERL_WORDSIZE; + } while (x + PERL_WORDSIZE <= e); + } + +# endif + + /* Process per-byte */ + while (x < e) { + if (! UTF8_IS_INVARIANT(*x)) { + count++; + } + + x++; + } + + return count; +} + +#endif + +#ifndef PERL_IN_REGEXEC_C /* Keep these around for that file */ +# undef PERL_WORDSIZE +# undef PERL_COUNT_MULTIPLIER +# undef PERL_WORD_BOUNDARY_MASK +# undef PERL_VARIANTS_WORD_MASK +#endif + /* =for apidoc is_utf8_string @@ -652,8 +741,7 @@ C>. PERL_STATIC_INLINE bool S_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags) { - const U8* send; - const U8* x = s; + const U8 * first_variant; PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS; assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE @@ -679,13 +767,17 @@ S_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags) return is_c9strict_utf8_string(s, len); } - send = s + len; - while (x < send) { - STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags); - if (UNLIKELY(! cur_len)) { - return FALSE; + if (! is_utf8_invariant_string_loc(s, len, &first_variant)) { + const U8* const send = s + len; + const U8* x = first_variant; + + while (x < send) { + STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags); + if (UNLIKELY(! cur_len)) { + return FALSE; + } + x += cur_len; } - x += cur_len; } return TRUE; @@ -721,31 +813,50 @@ See also C>. */ PERL_STATIC_INLINE bool -Perl_is_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el) +Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) { - const U8* const send = s + (len ? len : strlen((const char *)s)); - const U8* x = s; - STRLEN outlen = 0; + const U8 * first_variant; PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN; - while (x < send) { - const STRLEN cur_len = isUTF8_CHAR(x, send); - if (UNLIKELY(! cur_len)) { - break; - } - x += cur_len; - outlen++; + if (len == 0) { + len = strlen((const char *) s); } - if (el) - *el = outlen; + if (is_utf8_invariant_string_loc(s, len, &first_variant)) { + if (el) + *el = len; + + if (ep) { + *ep = s + len; + } - if (ep) { - *ep = x; + return TRUE; } - return (x == send); + { + const U8* const send = s + len; + const U8* x = first_variant; + STRLEN outlen = first_variant - s; + + while (x < send) { + const STRLEN cur_len = isUTF8_CHAR(x, send); + if (UNLIKELY(! cur_len)) { + break; + } + x += cur_len; + outlen++; + } + + if (el) + *el = outlen; + + if (ep) { + *ep = x; + } + + return (x == send); + } } /* @@ -779,31 +890,50 @@ See also C>. */ PERL_STATIC_INLINE bool -S_is_strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el) +S_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) { - const U8* const send = s + (len ? len : strlen((const char *)s)); - const U8* x = s; - STRLEN outlen = 0; + const U8 * first_variant; PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN; - while (x < send) { - const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send); - if (UNLIKELY(! cur_len)) { - break; - } - x += cur_len; - outlen++; + if (len == 0) { + len = strlen((const char *) s); } - if (el) - *el = outlen; + if (is_utf8_invariant_string_loc(s, len, &first_variant)) { + if (el) + *el = len; - if (ep) { - *ep = x; + if (ep) { + *ep = s + len; + } + + return TRUE; } - return (x == send); + { + const U8* const send = s + len; + const U8* x = first_variant; + STRLEN outlen = first_variant - s; + + while (x < send) { + const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send); + if (UNLIKELY(! cur_len)) { + break; + } + x += cur_len; + outlen++; + } + + if (el) + *el = outlen; + + if (ep) { + *ep = x; + } + + return (x == send); + } } /* @@ -837,31 +967,50 @@ See also C>. */ PERL_STATIC_INLINE bool -S_is_c9strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el) +S_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) { - const U8* const send = s + (len ? len : strlen((const char *)s)); - const U8* x = s; - STRLEN outlen = 0; + const U8 * first_variant; PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN; - while (x < send) { - const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send); - if (UNLIKELY(! cur_len)) { - break; - } - x += cur_len; - outlen++; + if (len == 0) { + len = strlen((const char *) s); } - if (el) - *el = outlen; + if (is_utf8_invariant_string_loc(s, len, &first_variant)) { + if (el) + *el = len; - if (ep) { - *ep = x; + if (ep) { + *ep = s + len; + } + + return TRUE; } - return (x == send); + { + const U8* const send = s + len; + const U8* x = first_variant; + STRLEN outlen = first_variant - s; + + while (x < send) { + const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send); + if (UNLIKELY(! cur_len)) { + break; + } + x += cur_len; + outlen++; + } + + if (el) + *el = outlen; + + if (ep) { + *ep = x; + } + + return (x == send); + } } /* @@ -902,16 +1051,14 @@ See also C>. PERL_STATIC_INLINE bool S_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags) { - const U8* send; - const U8* x = s; - STRLEN outlen = 0; + const U8 * first_variant; PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS; assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE |UTF8_DISALLOW_PERL_EXTENDED))); if (len == 0) { - len = strlen((const char *)s); + len = strlen((const char *) s); } if (flags == 0) { @@ -930,24 +1077,40 @@ S_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el return is_c9strict_utf8_string_loclen(s, len, ep, el); } - send = s + len; - while (x < send) { - const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags); - if (UNLIKELY(! cur_len)) { - break; + if (is_utf8_invariant_string_loc(s, len, &first_variant)) { + if (el) + *el = len; + + if (ep) { + *ep = s + len; } - x += cur_len; - outlen++; + + return TRUE; } - if (el) - *el = outlen; + { + const U8* send = s + len; + const U8* x = first_variant; + STRLEN outlen = first_variant - s; + + while (x < send) { + const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags); + if (UNLIKELY(! cur_len)) { + break; + } + x += cur_len; + outlen++; + } + + if (el) + *el = outlen; - if (ep) { - *ep = x; - } + if (ep) { + *ep = x; + } - return (x == send); + return (x == send); + } } /* @@ -1003,9 +1166,9 @@ Perl_utf8_hop(const U8 *s, SSize_t off) s--; } } - GCC_DIAG_IGNORE(-Wcast-qual); + GCC_DIAG_IGNORE_STMT(-Wcast-qual); return (U8 *)s; - GCC_DIAG_RESTORE; + GCC_DIAG_RESTORE_STMT; } /* @@ -1040,16 +1203,16 @@ Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end) while (off--) { STRLEN skip = UTF8SKIP(s); if ((STRLEN)(end - s) <= skip) { - GCC_DIAG_IGNORE(-Wcast-qual); + GCC_DIAG_IGNORE_STMT(-Wcast-qual); return (U8 *)end; - GCC_DIAG_RESTORE; + GCC_DIAG_RESTORE_STMT; } s += skip; } - GCC_DIAG_IGNORE(-Wcast-qual); + GCC_DIAG_IGNORE_STMT(-Wcast-qual); return (U8 *)s; - GCC_DIAG_RESTORE; + GCC_DIAG_RESTORE_STMT; } /* @@ -1087,9 +1250,9 @@ Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start) s--; } - GCC_DIAG_IGNORE(-Wcast-qual); + GCC_DIAG_IGNORE_STMT(-Wcast-qual); return (U8 *)s; - GCC_DIAG_RESTORE; + GCC_DIAG_RESTORE_STMT; } /* @@ -1245,7 +1408,7 @@ complete, valid characters found in the C pointer. PERL_STATIC_INLINE bool S_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s, - const STRLEN len, + STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)