#define uvuni_to_utf8(d, uv) uvuni_to_utf8_flags(d, uv, 0)
#define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0)
+/*
+=for apidoc ibcmp_utf8
+
+This is a synonym for (! foldEQ_utf8())
+
+=cut
+*/
+#define ibcmp_utf8(s1, pe1, l1, u1, s2, pe2, l2, u2) \
+ cBOOL(! foldEQ_utf8(s1, pe1, l1, u1, s2, pe2, l2, u2))
+
#ifdef EBCDIC
/* The equivalent of these macros but implementing UTF-EBCDIC
are in the following header file:
Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte
U+0000..U+007F 00..7F
- U+0080..U+07FF C2..DF 80..BF
+ U+0080..U+07FF * C2..DF 80..BF
U+0800..U+0FFF E0 * A0..BF 80..BF
U+1000..U+CFFF E1..EC 80..BF 80..BF
- U+D000..U+D7FF ED * 80..9F 80..BF
+ U+D000..U+D7FF ED 80..9F 80..BF
U+D800..U+DFFF +++++++ utf16 surrogates, not legal utf8 +++++++
U+E000..U+FFFF EE..EF 80..BF 80..BF
U+10000..U+3FFFF F0 * 90..BF 80..BF 80..BF
U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
-Note the gaps before the 2nd Byte entries above marked by '*'. These are
+Note the gaps before several of the byte entries above marked by '*'. These are
caused by legal UTF-8 avoiding non-shortest encodings: it is technically
possible to UTF-8-encode a single code point in different ways, but that is
explicitly forbidden, and the shortest possible encoding should always be used
00000dddccccccbbbbbbaaaaaa 11110ddd 10cccccc 10bbbbbb 10aaaaaa
As you can see, the continuation bytes all begin with C<10>, and the
-leading bits of the start byte tell how many bytes the are in the
+leading bits of the start byte tell how many bytes there are in the
encoded character.
+Perl's extended UTF-8 means we can have start bytes up to FF.
+
*/
#define UNI_IS_INVARIANT(c) (((UV)c) < 0x80)
/* Note that C0 and C1 are invalid in legal UTF8, so the lower bound of the
* below might ought to be C2 */
-#define UTF8_IS_START(c) (((U8)c) >= 0xc0 && (((U8)c) <= 0xfd))
+#define UTF8_IS_START(c) (((U8)c) >= 0xc0)
#define UTF8_IS_CONTINUATION(c) (((U8)c) >= 0x80 && (((U8)c) <= 0xbf))
#define UTF8_IS_CONTINUED(c) (((U8)c) & 0x80)
#define UTF8_IS_DOWNGRADEABLE_START(c) (((U8)c & 0xfc) == 0xc0)
#define UTF8_ACCUMULATE(old, new) (((old) << UTF_ACCUMULATION_SHIFT) | (((U8)new) & UTF_CONTINUATION_MASK))
+/* Convert a two (not one) byte utf8 character to a unicode code point value.
+ * Needs just one iteration of accumulate. Should not be used unless it is
+ * known that the two bytes are legal: 1) two-byte start, and 2) continuation.
+ * Note that the result can be larger than 255 if the input character is not
+ * downgradable */
+#define TWO_BYTE_UTF8_TO_UNI(HI, LO) \
+ UTF8_ACCUMULATE((NATIVE_TO_UTF(HI) & UTF_START_MASK(2)), \
+ NATIVE_TO_UTF(LO))
+
#define UTF8SKIP(s) PL_utf8skip[*(const U8*)(s)]
#define UTF8_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_UTF(c))
#define IN_BYTES (CopHINTS_get(PL_curcop) & HINT_BYTES)
#define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTES)
-#define IN_UNI_8_BIT ( (! (CopHINTS_get(PL_curcop) & HINT_NOT_UNI_8_BIT)) \
+#define IN_UNI_8_BIT ( (CopHINTS_get(PL_curcop) & HINT_UNI_8_BIT) \
&& ! IN_LOCALE_RUNTIME && ! IN_BYTES)
#define UTF8_ALLOW_EMPTY 0x0001
# define UTF8_QUAD_MAX UINT64_C(0x1000000000)
#endif
-#define UTF8_IS_ASCII(c) UTF8_IS_INVARIANT(c)
-
#define UNICODE_GREEK_CAPITAL_LETTER_SIGMA 0x03A3
#define UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA 0x03C2
#define UNICODE_GREEK_SMALL_LETTER_SIGMA 0x03C3
+#define GREEK_SMALL_LETTER_MU 0x03BC
#define UNI_DISPLAY_ISPRINT 0x0001
#define UNI_DISPLAY_BACKSLASH 0x0002
#define ANYOF_FOLD_SHARP_S(node, input, end) \
(ANYOF_BITMAP_TEST(node, LATIN_SMALL_LETTER_SHARP_S) && \
- (ANYOF_FLAGS(node) & ANYOF_UNICODE) && \
+ (ANYOF_FLAGS(node) & ANYOF_NONBITMAP) && \
(ANYOF_FLAGS(node) & ANYOF_FOLD) && \
((end) > (input) + 1) && \
toLOWER((input)[0]) == 's' && \