#define FOLD_FLAGS_NOMIX_ASCII 0x4
/*
-=head1 Unicode Support
-L<perlguts/Unicode Support> has an introduction to this API.
-
-See also L</Character classification>,
-and L</Character case changing>.
-Various functions outside this section also work specially with Unicode.
-Search for the string "utf8" in this document.
-
=for apidoc is_ascii_string
This is a misleadingly-named synonym for L</is_utf8_invariant_string>.
#define FOLDEQ_S1_FOLDS_SANE (1 << 4)
#define FOLDEQ_S2_FOLDS_SANE (1 << 5)
-#define ibcmp_utf8(s1, pe1, l1, u1, s2, pe2, l2, u2) \
- cBOOL(! foldEQ_utf8(s1, pe1, l1, u1, s2, pe2, l2, u2))
-
#ifdef EBCDIC
/* The equivalent of these macros but implementing UTF-EBCDIC
are in the following header file:
/* 2**UTF_ACCUMULATION_SHIFT - 1. This masks out all but the bits that carry
* real information in a continuation byte. This turns out to be 0x3F in
* UTF-8, 0x1F in UTF-EBCDIC. */
-#define UTF_CONTINUATION_MASK ((U8) ((1U << UTF_ACCUMULATION_SHIFT) - 1))
+#define UTF_CONTINUATION_MASK ((U8) (nBIT_MASK(UTF_ACCUMULATION_SHIFT)))
/* For use in UTF8_IS_CONTINUATION(). This turns out to be 0xC0 in UTF-8,
* E0 in UTF-EBCDIC */
* ASCII platforms, everything is representable by 7 bytes */
#if defined(UV_IS_QUAD) || defined(EBCDIC)
# define __BASE_UNI_SKIP(uv) (__COMMON_UNI_SKIP(uv) \
- (UV) (uv) < ((UV) 1U << (6 * UTF_ACCUMULATION_SHIFT)) ? 7 : UTF8_MAXBYTES)
+ LIKELY((UV) (uv) < ((UV) 1U << (6 * UTF_ACCUMULATION_SHIFT))) \
+ ? 7 \
+ : UTF8_MAXBYTES)
#else
# define __BASE_UNI_SKIP(uv) (__COMMON_UNI_SKIP(uv) 7)
#endif
/* The largest code point representable by two UTF-8 bytes on any platform that
* Perl runs on. This value is constrained by EBCDIC which has 5 bits per
* continuation byte */
-#define MAX_PORTABLE_UTF8_TWO_BYTE (32 * (1U << 5) - 1)
+#define MAX_PORTABLE_UTF8_TWO_BYTE (32 * nBIT_UMAX(5))
/*
* UTF-8 encoded character that mark it as a start byte and give the number of
* bytes that comprise the character. 'len' is the number of bytes in the
* multi-byte sequence. */
-#define UTF_START_MARK(len) (((len) > 7) ? 0xFF : ((U8) (0xFE << (7-(len)))))
+#define UTF_START_MARK(len) (UNLIKELY((len) > 7) \
+ ? 0xFF \
+ : ((U8) (0xFE << (7-(len)))))
/* Masks out the initial one bits in a start byte, leaving the real data ones.
* Doesn't work on an invariant byte. 'len' is the number of bytes in the
* multi-byte sequence that comprises the character. */
-#define UTF_START_MASK(len) (((len) >= 7) ? 0x00 : (0x1F >> ((len)-2)))
+#define UTF_START_MASK(len) (UNLIKELY((len) >= 7) ? 0x00 : (0x1F >> ((len)-2)))
/* Adds a UTF8 continuation byte 'new' of information to a running total code
* point 'old' of all the continuation bytes so far. This is designed to be
=over
-=item L</C<UTF8_SAFE_SKIP>> if you know the maximum ending pointer in the
+=item C<L</UTF8_SAFE_SKIP>> if you know the maximum ending pointer in the
buffer pointed to by C<s>; or
-=item L</C<UTF8_CHK_SKIP>> if you don't know it.
+=item C<L</UTF8_CHK_SKIP>> if you don't know it.
=back
It is better to restructure your code so the end pointer is passed down so that
you know what it actually is at the point of this call, but if that isn't
-possible, L</C<UTF8_CHK_SKIP>> can minimize the chance of accessing beyond the end
+possible, C<L</UTF8_CHK_SKIP>> can minimize the chance of accessing beyond the end
of the input buffer.
=cut
/*
=for apidoc Am|STRLEN|UTF8_SKIP|char* s
-This is a synonym for L</C<UTF8SKIP>>
+This is a synonym for C<L</UTF8SKIP>>
=cut
*/
/*
=for apidoc Am|STRLEN|UTF8_CHK_SKIP|char* s
-This is a safer version of L</C<UTF8SKIP>>, but still not as safe as
-L</C<UTF8_SAFE_SKIP>>. This version doesn't blindly assume that the input
+This is a safer version of C<L</UTF8SKIP>>, but still not as safe as
+C<L</UTF8_SAFE_SKIP>>. This version doesn't blindly assume that the input
string pointed to by C<s> is well-formed, but verifies that there isn't a NUL
terminating character before the expected end of the next character in C<s>.
The length C<UTF8_CHK_SKIP> returns stops just before any such NUL.
This macro is intended to be used by XS modules where the inputs could be
malformed, and it isn't feasible to restructure to use the safer
-L</C<UTF8_SAFE_SKIP>>, for example when interfacing with a C library.
+C<L</UTF8_SAFE_SKIP>>, for example when interfacing with a C library.
=cut
*/
#define UTF8_CHK_SKIP(s) \
- (s[0] == '\0' ? 1 : MIN(UTF8SKIP(s), \
+ (UNLIKELY(s[0] == '\0') ? 1 : MIN(UTF8SKIP(s), \
my_strnlen((char *) (s), UTF8SKIP(s))))
/*
=cut
*/
#define UTF8_SAFE_SKIP(s, e) (__ASSERT_((e) >= (s)) \
- ((e) - (s)) <= 0 \
+ UNLIKELY(((e) - (s)) <= 0) \
? 0 \
: MIN(((e) - (s)), UTF8_SKIP(s)))
*/
#ifdef EBCDIC
# define UTF8_IS_SUPER(s, e) \
- (( LIKELY((e) > (s) + 4) \
- && NATIVE_UTF8_TO_I8(*(s)) >= 0xF9 \
- && ( NATIVE_UTF8_TO_I8(*(s)) > 0xF9 \
- || (NATIVE_UTF8_TO_I8(*((s) + 1)) >= 0xA2)) \
- && LIKELY((s) + UTF8SKIP(s) <= (e))) \
- ? is_utf8_char_helper(s, s + UTF8SKIP(s), 0) : 0)
+ (( ((e) > (s) + 4) \
+ && (NATIVE_UTF8_TO_I8(*(s)) >= 0xF9) \
+ && UNLIKELY( NATIVE_UTF8_TO_I8(*(s)) > 0xF9 \
+ || (NATIVE_UTF8_TO_I8(*((s) + 1)) >= 0xA2)) \
+ && LIKELY((s) + UTF8SKIP(s) <= (e))) \
+ ? is_utf8_char_helper(s, s + UTF8SKIP(s), 0) : 0
#else
# define UTF8_IS_SUPER(s, e) \
- (( LIKELY((e) > (s) + 3) \
+ (( ((e) > (s) + 3) \
&& (*(U8*) (s)) >= 0xF4 \
- && ((*(U8*) (s)) > 0xF4 || (*((U8*) (s) + 1) >= 0x90))\
+ && (UNLIKELY( ((*(U8*) (s)) > 0xF4) \
+ || (*((U8*) (s) + 1) >= 0x90))) \
&& LIKELY((s) + UTF8SKIP(s) <= (e))) \
? is_utf8_char_helper(s, s + UTF8SKIP(s), 0) : 0)
#endif
/* This matches the 2048 code points between UNICODE_SURROGATE_FIRST (0xD800) and
* UNICODE_SURROGATE_LAST (0xDFFF) */
-#define UNICODE_IS_SURROGATE(uv) (((UV) (uv) & (~0xFFFF | 0xF800)) \
+#define UNICODE_IS_SURROGATE(uv) UNLIKELY(((UV) (uv) & (~0xFFFF | 0xF800)) \
== 0xD800)
-#define UNICODE_IS_REPLACEMENT(uv) ((UV) (uv) == UNICODE_REPLACEMENT)
-#define UNICODE_IS_BYTE_ORDER_MARK(uv) ((UV) (uv) == UNICODE_BYTE_ORDER_MARK)
+#define UNICODE_IS_REPLACEMENT(uv) UNLIKELY((UV) (uv) == UNICODE_REPLACEMENT)
+#define UNICODE_IS_BYTE_ORDER_MARK(uv) UNLIKELY((UV) (uv) \
+ == UNICODE_BYTE_ORDER_MARK)
/* Is 'uv' one of the 32 contiguous-range noncharacters? */
-#define UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv) ((UV) (uv) >= 0xFDD0 \
- && (UV) (uv) <= 0xFDEF)
+#define UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv) UNLIKELY((UV) (uv) >= 0xFDD0 \
+ && (UV) (uv) <= 0xFDEF)
/* Is 'uv' one of the 34 plane-ending noncharacters 0xFFFE, 0xFFFF, 0x1FFFE,
* 0x1FFFF, ... 0x10FFFE, 0x10FFFF, given that we know that 'uv' is not above
* the Unicode legal max */
#define UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv) \
- (((UV) (uv) & 0xFFFE) == 0xFFFE)
+ UNLIKELY(((UV) (uv) & 0xFFFE) == 0xFFFE)
#define UNICODE_IS_NONCHAR(uv) \
( UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv) \
|| ( LIKELY( ! UNICODE_IS_SUPER(uv)) \
&& UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
-#define UNICODE_IS_SUPER(uv) ((UV) (uv) > PERL_UNICODE_MAX)
+#define UNICODE_IS_SUPER(uv) UNLIKELY((UV) (uv) > PERL_UNICODE_MAX)
#define LATIN_SMALL_LETTER_SHARP_S LATIN_SMALL_LETTER_SHARP_S_NATIVE
#define LATIN_SMALL_LETTER_Y_WITH_DIAERESIS \
/* Character classes could also allow \b, but not patterns in general */
#define UNI_DISPLAY_REGEX (UNI_DISPLAY_ISPRINT|UNI_DISPLAY_BACKSLASH)
-#define ANYOF_FOLD_SHARP_S(node, input, end) \
- (ANYOF_BITMAP_TEST(node, LATIN_SMALL_LETTER_SHARP_S) && \
- (ANYOF_NONBITMAP(node)) && \
- (ANYOF_FLAGS(node) & ANYOF_LOC_NONBITMAP_FOLD) && \
- ((end) > (input) + 1) && \
- isALPHA_FOLD_EQ((input)[0], 's'))
-
+/* Should be removed; maybe deprecated, but not used in CPAN */
#define SHARP_S_SKIP 2
#define is_utf8_char_buf(buf, buf_end) isUTF8_CHAR(buf, buf_end)