#define FOLD_FLAGS_NOMIX_ASCII 0x4
/*
-=head1 Unicode Support
-L<perlguts/Unicode Support> has an introduction to this API.
-
-See also L</Character classification>,
-and L</Character case changing>.
-Various functions outside this section also work specially with Unicode.
-Search for the string "utf8" in this document.
-
=for apidoc is_ascii_string
This is a misleadingly-named synonym for L</is_utf8_invariant_string>.
#define FOLDEQ_S1_FOLDS_SANE (1 << 4)
#define FOLDEQ_S2_FOLDS_SANE (1 << 5)
-#define ibcmp_utf8(s1, pe1, l1, u1, s2, pe2, l2, u2) \
- cBOOL(! foldEQ_utf8(s1, pe1, l1, u1, s2, pe2, l2, u2))
-
#ifdef EBCDIC
/* The equivalent of these macros but implementing UTF-EBCDIC
are in the following header file:
/* Surrogates, non-character code points and above-Unicode code points are
* problematic in some contexts. This allows code that needs to check for
- * those to to quickly exclude the vast majority of code points it will
+ * those to quickly exclude the vast majority of code points it will
* encounter */
/* 'c' is parenthesized before the cast so that an expression argument is
 * converted to U8 as a whole, not just its first operand */
#define isUTF8_POSSIBLY_PROBLEMATIC(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
 (U8) (c) >= 0xED)
/* 2**UTF_ACCUMULATION_SHIFT - 1. This masks out all but the bits that carry
* real information in a continuation byte. This turns out to be 0x3F in
* UTF-8, 0x1F in UTF-EBCDIC. */
-#define UTF_CONTINUATION_MASK ((U8) ((1U << UTF_ACCUMULATION_SHIFT) - 1))
+#define UTF_CONTINUATION_MASK ((U8) (nBIT_MASK(UTF_ACCUMULATION_SHIFT)))
/* For use in UTF8_IS_CONTINUATION(). This turns out to be 0xC0 in UTF-8,
* E0 in UTF-EBCDIC */
* ASCII platforms, everything is representable by 7 bytes */
#if defined(UV_IS_QUAD) || defined(EBCDIC)
# define __BASE_UNI_SKIP(uv) (__COMMON_UNI_SKIP(uv) \
- (UV) (uv) < ((UV) 1U << (6 * UTF_ACCUMULATION_SHIFT)) ? 7 : UTF8_MAXBYTES)
+ LIKELY((UV) (uv) < ((UV) 1U << (6 * UTF_ACCUMULATION_SHIFT))) \
+ ? 7 \
+ : UTF8_MAXBYTES)
#else
# define __BASE_UNI_SKIP(uv) (__COMMON_UNI_SKIP(uv) 7)
#endif
((UTF_CONTINUATION_MARK >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2))
/* Is the byte 'c' the first byte of a multi-byte UTF-8 encoded sequence?
- * This doesn't catch invariants (they are single-byte). It also excludes the
+ * This excludes invariants (they are single-byte). It also excludes the
* illegal overlong sequences that begin with C0 and C1 on ASCII platforms, and
- * C0-C4 I8 start bytes on EBCDIC ones */
-#define UTF8_IS_START(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
+ * C0-C4 I8 start bytes on EBCDIC ones. On EBCDIC E0 can't start a
+ * non-overlong sequence, so we define a base macro and for those platforms,
+ * extend it to also exclude E0 */
+#define UTF8_IS_START_base(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
(NATIVE_UTF8_TO_I8(c) >= UTF_MIN_START_BYTE))
+#ifdef EBCDIC
+# define UTF8_IS_START(c) \
+ (UTF8_IS_START_base(c) && (c) != I8_TO_NATIVE_UTF8(0xE0))
+#else
+# define UTF8_IS_START(c) UTF8_IS_START_base(c)
+#endif
#define UTF_MIN_ABOVE_LATIN1_BYTE \
((0x100 >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2))
/* The largest code point representable by two UTF-8 bytes on any platform that
* Perl runs on. This value is constrained by EBCDIC which has 5 bits per
* continuation byte. 32 possible start-byte values times 32 continuation
* values cover 1024 code points, so the maximum is 0x3FF (a 10-bit value) */
-#define MAX_PORTABLE_UTF8_TWO_BYTE (32 * (1U << 5) - 1)
+#define MAX_PORTABLE_UTF8_TWO_BYTE (nBIT_UMAX(5 + 5))
/*
=cut
*/
#define UTF8_MAXBYTES_CASE \
- (UTF8_MAXBYTES >= (UTF8_MAX_FOLD_CHAR_EXPAND * OFFUNISKIP(0x10FFFF)) \
- ? UTF8_MAXBYTES \
- : (UTF8_MAX_FOLD_CHAR_EXPAND * OFFUNISKIP(0x10FFFF)))
+ MAX(UTF8_MAXBYTES, UTF8_MAX_FOLD_CHAR_EXPAND * OFFUNISKIP(0x10FFFF))
/* Rest of these are attributes of Unicode and perl's internals rather than the
* encoding, or happen to be the same in both ASCII and EBCDIC (at least at
* UTF-8 encoded character that mark it as a start byte and give the number of
* bytes that comprise the character. 'len' is the number of bytes in the
* multi-byte sequence. */
-#define UTF_START_MARK(len) (((len) > 7) ? 0xFF : (0xFF & (0xFE << (7-(len)))))
+#define UTF_START_MARK(len) (UNLIKELY((len) > 7) \
+ ? 0xFF \
+ : ((U8) (0xFE << (7-(len)))))
/* Masks out the initial one bits in a start byte, leaving the real data ones.
* Doesn't work on an invariant byte. 'len' is the number of bytes in the
* multi-byte sequence that comprises the character. */
-#define UTF_START_MASK(len) (((len) >= 7) ? 0x00 : (0x1F >> ((len)-2)))
+#define UTF_START_MASK(len) (UNLIKELY((len) >= 7) ? 0x00 : (0x1F >> ((len)-2)))
/* Adds a UTF8 continuation byte 'new' of information to a running total code
* point 'old' of all the continuation bytes so far. This is designed to be
=over
-=item L</C<UTF8_SAFE_SKIP>> if you know the maximum ending pointer in the
+=item C<L</UTF8_SAFE_SKIP>> if you know the maximum ending pointer in the
buffer pointed to by C<s>; or
-=item L</C<UTF8_CHK_SKIP>> if you don't know it.
+=item C<L</UTF8_CHK_SKIP>> if you don't know it.
=back
It is better to restructure your code so the end pointer is passed down so that
you know what it actually is at the point of this call, but if that isn't
-possible, L</C<UTF8_CHK_SKIP>> can minimize the chance of accessing beyond the end
+possible, C<L</UTF8_CHK_SKIP>> can minimize the chance of accessing beyond the end
of the input buffer.
=cut
/*
=for apidoc Am|STRLEN|UTF8_SKIP|char* s
-This is a synonym for L</C<UTF8SKIP>>
+This is a synonym for C<L</UTF8SKIP>>
=cut
*/
/*
=for apidoc Am|STRLEN|UTF8_CHK_SKIP|char* s
-This is a safer version of L</C<UTF8SKIP>>, but still not as safe as
-L</C<UTF8_SAFE_SKIP>>. This version doesn't blindly assume that the input
+This is a safer version of C<L</UTF8SKIP>>, but still not as safe as
+C<L</UTF8_SAFE_SKIP>>. This version doesn't blindly assume that the input
string pointed to by C<s> is well-formed, but verifies that there isn't a NUL
terminating character before the expected end of the next character in C<s>.
The length C<UTF8_CHK_SKIP> returns stops just before any such NUL.
This macro is intended to be used by XS modules where the inputs could be
malformed, and it isn't feasible to restructure to use the safer
-L</C<UTF8_SAFE_SKIP>>, for example when interfacing with a C library.
+C<L</UTF8_SAFE_SKIP>>, for example when interfacing with a C library.
=cut
*/
#define UTF8_CHK_SKIP(s) \
- (s[0] == '\0' ? 1 : MIN(UTF8SKIP(s), \
+ (UNLIKELY((s)[0] == '\0') ? 1 : MIN(UTF8SKIP(s), \
 my_strnlen((char *) (s), UTF8SKIP(s))))
/*
=cut
*/
#define UTF8_SAFE_SKIP(s, e) (__ASSERT_((e) >= (s)) \
- ((e) - (s)) <= 0 \
+ UNLIKELY(((e) - (s)) <= 0) \
? 0 \
: MIN(((e) - (s)), UTF8_SKIP(s)))
* beginning of a utf8 character. Now that foo_utf8() determines that itself,
* no need to do it again here
*/
-#define isIDFIRST_lazy_if(p,UTF) \
- _is_utf8_FOO(_CC_IDFIRST, (const U8 *) p, "isIDFIRST_lazy_if", \
- "isIDFIRST_lazy_if_safe", \
- cBOOL(UTF && ! IN_BYTES), 0, __FILE__,__LINE__)
-
#define isIDFIRST_lazy_if_safe(p, e, UTF) \
((IN_BYTES || !UTF) \
? isIDFIRST(*(p)) \
: isIDFIRST_utf8_safe(p, e))
-
-#define isWORDCHAR_lazy_if(p,UTF) \
- _is_utf8_FOO(_CC_IDFIRST, (const U8 *) p, "isWORDCHAR_lazy_if", \
- "isWORDCHAR_lazy_if_safe", \
- cBOOL(UTF && ! IN_BYTES), 0, __FILE__,__LINE__)
-
#define isWORDCHAR_lazy_if_safe(p, e, UTF) \
((IN_BYTES || !UTF) \
? isWORDCHAR(*(p)) \
: isWORDCHAR_utf8_safe((U8 *) p, (U8 *) e))
-
-#define isALNUM_lazy_if(p,UTF) \
- _is_utf8_FOO(_CC_IDFIRST, (const U8 *) p, "isALNUM_lazy_if", \
- "isWORDCHAR_lazy_if_safe", \
- cBOOL(UTF && ! IN_BYTES), 0, __FILE__,__LINE__)
+#define isALNUM_lazy_if_safe(p, e, UTF) isWORDCHAR_lazy_if_safe(p, e, UTF)
#define UTF8_MAXLEN UTF8_MAXBYTES
*/
#ifdef EBCDIC
# define UTF8_IS_SUPER(s, e) \
- (( LIKELY((e) > (s) + 4) \
- && NATIVE_UTF8_TO_I8(*(s)) >= 0xF9 \
- && ( NATIVE_UTF8_TO_I8(*(s)) > 0xF9 \
- || (NATIVE_UTF8_TO_I8(*((s) + 1)) >= 0xA2)) \
- && LIKELY((s) + UTF8SKIP(s) <= (e))) \
- ? is_utf8_char_helper(s, s + UTF8SKIP(s), 0) : 0)
+ (( ((e) > (s) + 4) \
+ && (NATIVE_UTF8_TO_I8(*(s)) >= 0xF9) \
+ && UNLIKELY( NATIVE_UTF8_TO_I8(*(s)) > 0xF9 \
+ || (NATIVE_UTF8_TO_I8(*((s) + 1)) >= 0xA2)) \
+ && LIKELY((s) + UTF8SKIP(s) <= (e))) \
+ ? is_utf8_char_helper(s, s + UTF8SKIP(s), 0) : 0)
#else
# define UTF8_IS_SUPER(s, e) \
- (( LIKELY((e) > (s) + 3) \
+ (( ((e) > (s) + 3) \
&& (*(U8*) (s)) >= 0xF4 \
- && ((*(U8*) (s)) > 0xF4 || (*((U8*) (s) + 1) >= 0x90))\
+ && (UNLIKELY( ((*(U8*) (s)) > 0xF4) \
+ || (*((U8*) (s) + 1) >= 0x90))) \
&& LIKELY((s) + UTF8SKIP(s) <= (e))) \
? is_utf8_char_helper(s, s + UTF8SKIP(s), 0) : 0)
#endif
#define UNICODE_DISALLOW_NONCHAR 0x0020
#define UNICODE_DISALLOW_SUPER 0x0040
#define UNICODE_DISALLOW_PERL_EXTENDED 0x0080
+
+#ifdef PERL_CORE
+# define UNICODE_ALLOW_ABOVE_IV_MAX 0x0100
+#endif
#define UNICODE_DISALLOW_ABOVE_31_BIT UNICODE_DISALLOW_PERL_EXTENDED
#define UNICODE_GOT_SURROGATE UNICODE_DISALLOW_SURROGATE
/* This matches the 2048 code points between UNICODE_SURROGATE_FIRST (0xD800) and
* UNICODE_SURROGATE_LAST (0xDFFF) */
-#define UNICODE_IS_SURROGATE(uv) (((UV) (uv) & (~0xFFFF | 0xF800)) \
+#define UNICODE_IS_SURROGATE(uv) UNLIKELY(((UV) (uv) & (~0xFFFF | 0xF800)) \
== 0xD800)
-#define UNICODE_IS_REPLACEMENT(uv) ((UV) (uv) == UNICODE_REPLACEMENT)
-#define UNICODE_IS_BYTE_ORDER_MARK(uv) ((UV) (uv) == UNICODE_BYTE_ORDER_MARK)
+#define UNICODE_IS_REPLACEMENT(uv) UNLIKELY((UV) (uv) == UNICODE_REPLACEMENT)
+#define UNICODE_IS_BYTE_ORDER_MARK(uv) UNLIKELY((UV) (uv) \
+ == UNICODE_BYTE_ORDER_MARK)
/* Is 'uv' one of the 32 contiguous-range noncharacters? */
-#define UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv) ((UV) (uv) >= 0xFDD0 \
- && (UV) (uv) <= 0xFDEF)
+#define UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv) UNLIKELY((UV) (uv) >= 0xFDD0 \
+ && (UV) (uv) <= 0xFDEF)
/* Is 'uv' one of the 34 plane-ending noncharacters 0xFFFE, 0xFFFF, 0x1FFFE,
* 0x1FFFF, ... 0x10FFFE, 0x10FFFF, given that we know that 'uv' is not above
* the Unicode legal max */
#define UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv) \
- (((UV) (uv) & 0xFFFE) == 0xFFFE)
+ UNLIKELY(((UV) (uv) & 0xFFFE) == 0xFFFE)
#define UNICODE_IS_NONCHAR(uv) \
( UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv) \
|| ( LIKELY( ! UNICODE_IS_SUPER(uv)) \
&& UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
-#define UNICODE_IS_SUPER(uv) ((UV) (uv) > PERL_UNICODE_MAX)
+#define UNICODE_IS_SUPER(uv) UNLIKELY((UV) (uv) > PERL_UNICODE_MAX)
#define LATIN_SMALL_LETTER_SHARP_S LATIN_SMALL_LETTER_SHARP_S_NATIVE
#define LATIN_SMALL_LETTER_Y_WITH_DIAERESIS \
#define UNI_DISPLAY_ISPRINT 0x0001
#define UNI_DISPLAY_BACKSLASH 0x0002
-#define UNI_DISPLAY_QQ (UNI_DISPLAY_ISPRINT|UNI_DISPLAY_BACKSLASH)
-#define UNI_DISPLAY_REGEX (UNI_DISPLAY_ISPRINT|UNI_DISPLAY_BACKSLASH)
+#define UNI_DISPLAY_BACKSPACE 0x0004 /* Allow \b when also
+ UNI_DISPLAY_BACKSLASH */
+#define UNI_DISPLAY_QQ (UNI_DISPLAY_ISPRINT \
+ |UNI_DISPLAY_BACKSLASH \
+ |UNI_DISPLAY_BACKSPACE)
-#define ANYOF_FOLD_SHARP_S(node, input, end) \
- (ANYOF_BITMAP_TEST(node, LATIN_SMALL_LETTER_SHARP_S) && \
- (ANYOF_NONBITMAP(node)) && \
- (ANYOF_FLAGS(node) & ANYOF_LOC_NONBITMAP_FOLD) && \
- ((end) > (input) + 1) && \
- isALPHA_FOLD_EQ((input)[0], 's'))
+/* Character classes could also allow \b, but not patterns in general */
+#define UNI_DISPLAY_REGEX (UNI_DISPLAY_ISPRINT|UNI_DISPLAY_BACKSLASH)
+/* Should be removed; maybe deprecated, but not used in CPAN */
#define SHARP_S_SKIP 2
#define is_utf8_char_buf(buf, buf_end) isUTF8_CHAR(buf, buf_end)