/* Two distinct single-bit flag values (0x2 and 0x4), so they can be combined
 * in one flags word.  NOTE(review): the names suggest options for a
 * swash-initialization routine ("return if undef", "accept inversion list");
 * confirm against the function that consumes them. */
#define _CORE_SWASH_INIT_RETURN_IF_UNDEF 0x2
#define _CORE_SWASH_INIT_ACCEPT_INVLIST 0x4
+/*
+=head1 Unicode Support
+
+=for apidoc is_ascii_string
+
+This is a misleadingly-named synonym for L</is_invariant_string>.
+On ASCII-ish platforms, the name isn't misleading: the ASCII-range characters
+are exactly the UTF-8 invariants. But EBCDIC machines have more invariants
+than just the ASCII characters, so C<is_invariant_string> is preferred.
+
+The macro passes C<s> and C<len> through to C<is_invariant_string> unchanged.
+
+=cut
+*/
+#define is_ascii_string(s, len) is_invariant_string(s, len)
+
/* uvchr_to_utf8(a,b) is uvchr_to_utf8_flags(a,b,0), i.e. the same call with a
 * flags argument of 0.  uvchr_to_utf8_flags() maps its uv argument through
 * NATIVE_TO_UNI() and forwards the result, together with d and flags
 * unchanged, to uvoffuni_to_utf8_flags(). */
#define uvchr_to_utf8(a,b) uvchr_to_utf8_flags(a,b,0)
#define uvchr_to_utf8_flags(d,uv,flags) \
uvoffuni_to_utf8_flags(d,NATIVE_TO_UNI(uv),flags)
/* Each FOLDEQ_* value occupies its own bit (bits 1 through 5 here), so any
 * combination can be OR-ed into a single flags word.  NOTE(review): these are
 * presumably consumed by the foldEQ_utf8() family of functions; confirm
 * against the callee. */
#define FOLDEQ_LOCALE (1 << 1)
#define FOLDEQ_S1_ALREADY_FOLDED (1 << 2)
#define FOLDEQ_S2_ALREADY_FOLDED (1 << 3)
+#define FOLDEQ_S1_FOLDS_SANE (1 << 4)
+#define FOLDEQ_S2_FOLDS_SANE (1 << 5)
-/*
-=for apidoc ibcmp_utf8
-
-This is a synonym for (! foldEQ_utf8())
-
-=cut
-*/
/* ibcmp_utf8() is the logical negation of foldEQ_utf8() called with the same
 * eight arguments; the negated result is wrapped in cBOOL(). */
#define ibcmp_utf8(s1, pe1, l1, u1, s2, pe2, l2, u2) \
cBOOL(! foldEQ_utf8(s1, pe1, l1, u1, s2, pe2, l2, u2))
* code point whose UTF-8 is known to occupy 2 bytes; they are less efficient
* than the EIGHT_BIT versions on EBCDIC platforms. We use the bitwise '~'
* operator instead of "<=" to avoid getting compiler warnings.
- * MAX_PORTABLE_UTF8_TWO_BYTE should be exactly all one bits in the lower few
+ * MAX_UTF8_TWO_BYTE should be exactly all one bits in the lower few
* places, so the ~ works */
/* UTF8_TWO_BYTE_HI(c), as of the '+' lines of this hunk: first asserts (via
 * __ASSERT_) that c is either of a one-byte type or has no bits set outside
 * MAX_UTF8_TWO_BYTE, then expands to __BASE_TWO_BYTE_HI(c, NATIVE_TO_UNI)
 * cast to U8.  The '-' lines are the previous definition, which used
 * MAX_PORTABLE_UTF8_TWO_BYTE and NATIVE_TO_LATIN1 instead. */
#define UTF8_TWO_BYTE_HI(c) \
(__ASSERT_((sizeof(c) == 1) \
- || !(((WIDEST_UTYPE)(c)) & ~MAX_PORTABLE_UTF8_TWO_BYTE)) \
- ((U8) __BASE_TWO_BYTE_HI(c, NATIVE_TO_LATIN1)))
+ || !(((WIDEST_UTYPE)(c)) & ~MAX_UTF8_TWO_BYTE)) \
+ ((U8) __BASE_TWO_BYTE_HI(c, NATIVE_TO_UNI)))
/* UTF8_TWO_BYTE_LO(c), as of the '+' lines of this hunk: first asserts (via
 * __ASSERT_) that c is either of a one-byte type or has no bits set outside
 * MAX_UTF8_TWO_BYTE, then expands to __BASE_TWO_BYTE_LO(c, NATIVE_TO_UNI)
 * cast to U8.  The '-' lines are the previous definition, which used
 * MAX_PORTABLE_UTF8_TWO_BYTE and NATIVE_TO_LATIN1 instead. */
#define UTF8_TWO_BYTE_LO(c) \
(__ASSERT_((sizeof(c) == 1) \
- || !(((WIDEST_UTYPE)(c)) & ~MAX_PORTABLE_UTF8_TWO_BYTE)) \
- ((U8) __BASE_TWO_BYTE_LO(c, NATIVE_TO_LATIN1)))
+ || !(((WIDEST_UTYPE)(c)) & ~MAX_UTF8_TWO_BYTE)) \
+ ((U8) __BASE_TWO_BYTE_LO(c, NATIVE_TO_UNI)))
/* This is illegal in any well-formed UTF-8 in both EBCDIC and ASCII
* as it is only in overlongs. */
/* IN_BYTES: nonzero when the HINT_BYTES bit is set in the hints returned by
 * CopHINTS_get(PL_curcop).
 * DO_UTF8(sv): true only when SvUTF8(sv) is true and IN_BYTES is false. */
#define IN_BYTES (CopHINTS_get(PL_curcop) & HINT_BYTES)
#define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTES)
/* IN_UNI_8_BIT, as of the '+' lines of this hunk: true when IN_BYTES is false
 * and either
 *   - HINT_UNI_8_BIT is set in CopHINTS_get(PL_curcop), or
 *   - HINT_LOCALE_PARTIAL is set there and _is_in_locale_category(FALSE, -1)
 *     returns true.
 * The '-' lines are the previous definition, which tested
 * HINT_UNI_8_BIT|HINT_LOCALE_NOT_CHARS together with ! IN_LOCALE_RUNTIME. */
#define IN_UNI_8_BIT \
- (CopHINTS_get(PL_curcop) & (HINT_UNI_8_BIT|HINT_LOCALE_NOT_CHARS) \
- && ! IN_LOCALE_RUNTIME && ! IN_BYTES)
+ (((CopHINTS_get(PL_curcop) & (HINT_UNI_8_BIT)) \
+ || (CopHINTS_get(PL_curcop) & HINT_LOCALE_PARTIAL \
+ /* -1 below is for :not_characters */ \
+ && _is_in_locale_category(FALSE, -1))) \
+ && ! IN_BYTES)
#define UTF8_ALLOW_EMPTY 0x0001 /* Allow a zero length string */
* U+10FFFF: \xF4\x8F\xBF\xBF \xF9\xA1\xBF\xBF\xBF max legal Unicode
* U+110000: \xF4\x90\x80\x80 \xF9\xA2\xA0\xA0\xA0
* U+110001: \xF4\x90\x80\x81 \xF9\xA2\xA0\xA0\xA1
- */
+ *
+ * BE AWARE that this test doesn't rule out malformed code points, in
+ * particular overlongs */
#ifdef EBCDIC /* Both versions assume well-formed UTF8 */
# define UTF8_IS_SUPER(s) (NATIVE_UTF8_TO_I8(* (U8*) (s)) >= 0xF9 \
&& (NATIVE_UTF8_TO_I8(* (U8*) (s)) > 0xF9 \
(ANYOF_NONBITMAP(node)) && \
(ANYOF_FLAGS(node) & ANYOF_LOC_NONBITMAP_FOLD) && \
((end) > (input) + 1) && \
- toFOLD((input)[0]) == 's' && \
- toFOLD((input)[1]) == 's')
+ /* Both bytes must case-fold to 's': the length guard above and */ \
+ /* SHARP_S_SKIP (2) assume a two-byte "ss" sequence, as did the */ \
+ /* toFOLD() tests being replaced.  Testing only [0] matches "sX". */ \
+ isALPHA_FOLD_EQ((input)[0], 's') && \
+ isALPHA_FOLD_EQ((input)[1], 's'))
#define SHARP_S_SKIP 2
#endif
/*
-=head1 Unicode Support
=for apidoc Am|STRLEN|isUTF8_CHAR|const U8 *s|const U8 *e