/* Flag bits for the fourth argument of _to_uni_fold_flags() and
 * _to_utf8_fold_flags(), q.v. */
#define FOLD_FLAGS_LOCALE 0x1
#define FOLD_FLAGS_FULL 0x2
+#define FOLD_FLAGS_NOMIX_ASCII 0x4
-#define to_uni_fold(c, p, lenp) _to_uni_fold_flags(c, p, lenp, 1)
+/* For _core_swash_init(), internal core use only */
+#define _CORE_SWASH_INIT_USER_DEFINED_PROPERTY 0x1
+#define _CORE_SWASH_INIT_RETURN_IF_UNDEF 0x2
+#define _CORE_SWASH_INIT_ACCEPT_INVLIST 0x4
+
+#define to_uni_fold(c, p, lenp) _to_uni_fold_flags(c, p, lenp, FOLD_FLAGS_FULL)
#define to_utf8_fold(c, p, lenp) _to_utf8_fold_flags(c, p, lenp, \
FOLD_FLAGS_FULL, NULL)
#define to_utf8_lower(a,b,c) _to_utf8_lower_flags(a,b,c,0, NULL)
/* As there are no translations, avoid the function wrapper */
#define utf8n_to_uvchr utf8n_to_uvuni
+#define valid_utf8_to_uvchr valid_utf8_to_uvuni
#define uvchr_to_utf8 uvuni_to_utf8
/*
# define UTF8_IS_SURROGATE(s) (*(s) == UTF_TO_NATIVE(0xF1) \
&& ((*((s) +1) == UTF_TO_NATIVE(0xB6)) \
|| *((s) + 1) == UTF_TO_NATIVE(0xB7)))
+ /* <send> points to one beyond the end of the string that starts at <s>.
+  * The length is tested before any byte is read, so an empty range is
+  * rejected without dereferencing <s>.  Both arguments are evaluated more
+  * than once, so they must be free of side effects. */
+# define UTF8_IS_REPLACEMENT(s, send) (((send) - (s)) >= 4 \
+ && *(s) == UTF_TO_NATIVE(0xEF) \
+ && *((s) + 1) == UTF_TO_NATIVE(0xBF) \
+ && *((s) + 2) == UTF_TO_NATIVE(0xBF) \
+ && *((s) + 3) == UTF_TO_NATIVE(0xBD))
#else
/* Two-byte test: first byte is 0xED and the second byte is at least 0xA0.
 * Reads (s)[0] and (s)[1] with no length check. */
# define UTF8_IS_SURROGATE(s) ((s)[0] == 0xED && (s)[1] >= 0xA0)
+/* <send> points to one beyond the end of the string that starts at <s>.
+ * The length is tested before any byte is read, so an empty range is
+ * rejected without dereferencing <s>.  Both arguments are evaluated more
+ * than once, so they must be free of side effects. */
+# define UTF8_IS_REPLACEMENT(s, send) (((send) - (s)) >= 3 \
+ && *(s) == 0xEF \
+ && *((s) + 1) == 0xBF \
+ && *((s) + 2) == 0xBD)
#endif
/* ASCII EBCDIC I8
#define GREEK_CAPITAL_LETTER_MU 0x039C /* Upper and title case of MICRO SIGN (U+00B5) */
#define LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS 0x0178 /* Also is title case */
#define LATIN_CAPITAL_LETTER_SHARP_S 0x1E9E
+#define LATIN_SMALL_LETTER_LONG_S 0x017F
+#define KELVIN_SIGN 0x212A
+#define ANGSTROM_SIGN 0x212B
#define UNI_DISPLAY_ISPRINT 0x0001
#define UNI_DISPLAY_BACKSLASH 0x0002
#define SHARP_S_SKIP 2
#ifndef EBCDIC
+/* If you want to exclude surrogates and code points beyond the legal Unicode
+ * range, see the blame log for earlier versions, which gave details on how to
+ * do both. */
/* One-byte form: the first byte is no greater than 0x7F. */
# define IS_UTF8_CHAR_1(p) ((p)[0] < 0x80)
# define IS_UTF8_CHAR_2(p) \
(p)[1] >= 0xA0 && (p)[1] <= 0xBF && \
(p)[2] >= 0x80 && (p)[2] <= 0xBF)
# define IS_UTF8_CHAR_3b(p) \
- ((p)[0] >= 0xE1 && (p)[0] <= 0xEC && \
- (p)[1] >= 0x80 && (p)[1] <= 0xBF && \
- (p)[2] >= 0x80 && (p)[2] <= 0xBF)
-# define IS_UTF8_CHAR_3c(p) \
- ((p)[0] == 0xED && \
- (p)[1] >= 0x80 && (p)[1] <= 0xBF && \
- (p)[2] >= 0x80 && (p)[2] <= 0xBF)
- /* In IS_UTF8_CHAR_3c(p) one could use
- * (p)[1] >= 0x80 && (p)[1] <= 0x9F
- * if one wanted to exclude surrogates. */
-# define IS_UTF8_CHAR_3d(p) \
- ((p)[0] >= 0xEE && (p)[0] <= 0xEF && \
+ ((p)[0] >= 0xE1 && (p)[0] <= 0xEF && \
(p)[1] >= 0x80 && (p)[1] <= 0xBF && \
(p)[2] >= 0x80 && (p)[2] <= 0xBF)
# define IS_UTF8_CHAR_4a(p) \
(p)[1] >= 0x90 && (p)[1] <= 0xBF && \
(p)[2] >= 0x80 && (p)[2] <= 0xBF && \
(p)[3] >= 0x80 && (p)[3] <= 0xBF)
-# define IS_UTF8_CHAR_4b(p) \
- ((p)[0] >= 0xF1 && (p)[0] <= 0xF3 && \
- (p)[1] >= 0x80 && (p)[1] <= 0xBF && \
- (p)[2] >= 0x80 && (p)[2] <= 0xBF && \
- (p)[3] >= 0x80 && (p)[3] <= 0xBF)
-/* In IS_UTF8_CHAR_4c(p) one could use
- * (p)[0] == 0xF4
- * if one wanted to stop at the Unicode limit U+10FFFF.
- * The 0xF7 allows us to go to 0x1fffff (0x200000 would
+/* The 0xF7 allows us to go to 0x1fffff (0x200000 would
* require five bytes). Not doing any further code points
* since that is not needed (and that would not be strict
* UTF-8, anyway). The "slow path" in Perl_is_utf8_char()
* will take care of the "extended UTF-8". */
-# define IS_UTF8_CHAR_4c(p) \
- ((p)[0] >= 0xF4 && (p)[0] <= 0xF7 && \
+# define IS_UTF8_CHAR_4b(p) \
+ ((p)[0] >= 0xF1 && (p)[0] <= 0xF7 && \
(p)[1] >= 0x80 && (p)[1] <= 0xBF && \
(p)[2] >= 0x80 && (p)[2] <= 0xBF && \
(p)[3] >= 0x80 && (p)[3] <= 0xBF)
# define IS_UTF8_CHAR_3(p) \
(IS_UTF8_CHAR_3a(p) || \
- IS_UTF8_CHAR_3b(p) || \
- IS_UTF8_CHAR_3c(p) || \
- IS_UTF8_CHAR_3d(p))
+ IS_UTF8_CHAR_3b(p))
# define IS_UTF8_CHAR_4(p) \
(IS_UTF8_CHAR_4a(p) || \
- IS_UTF8_CHAR_4b(p) || \
- IS_UTF8_CHAR_4c(p))
+ IS_UTF8_CHAR_4b(p))
/* IS_UTF8_CHAR(p) is strictly speaking wrong (not UTF-8) because it
* (1) allows UTF-8 encoded UTF-16 surrogates
* Local variables:
* c-indentation-style: bsd
* c-basic-offset: 4
- * indent-tabs-mode: t
+ * indent-tabs-mode: nil
* End:
*
- * ex: set ts=8 sts=4 sw=4 noet:
+ * ex: set ts=8 sts=4 sw=4 et:
*/