#define uvchr_to_utf8_flags(d,uv,flags) \
uvoffuni_to_utf8_flags(d,NATIVE_TO_UNI(uv),flags)
#define utf8_to_uvchr_buf(s, e, lenp) \
- utf8n_to_uvchr(s, (e) - (s), lenp, \
+ utf8n_to_uvchr(s, (U8*)(e) - (U8*)(s), lenp, \
ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY)
#define to_uni_fold(c, p, lenp) _to_uni_fold_flags(c, p, lenp, FOLD_FLAGS_FULL)
-#define to_utf8_fold(c, p, lenp) _to_utf8_fold_flags(c, p, lenp, \
- FOLD_FLAGS_FULL, NULL)
-#define to_utf8_lower(a,b,c) _to_utf8_lower_flags(a,b,c,0, NULL)
-#define to_utf8_upper(a,b,c) _to_utf8_upper_flags(a,b,c,0, NULL)
-#define to_utf8_title(a,b,c) _to_utf8_title_flags(a,b,c,0, NULL)
+#define to_utf8_fold(c, p, lenp) _to_utf8_fold_flags(c, p, lenp, FOLD_FLAGS_FULL)
+#define to_utf8_lower(a,b,c) _to_utf8_lower_flags(a,b,c,0)
+#define to_utf8_upper(a,b,c) _to_utf8_upper_flags(a,b,c,0)
+#define to_utf8_title(a,b,c) _to_utf8_title_flags(a,b,c,0)
/* Source backward compatibility. */
#define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0)
(((UV) UTF_CONTINUATION_MASK) << ((sizeof(UV) * CHARBITS) \
- UTF_ACCUMULATION_SHIFT))
-#ifdef HAS_QUAD
+#if UVSIZE >= 8
+# define UTF8_QUAD_MAX UINT64_C(0x1000000000)
+
/* Input is a true Unicode (not-native) code point */
#define OFFUNISKIP(uv) ( (uv) < 0x80 ? 1 : \
(uv) < 0x800 ? 2 : \
* UTF-8 encoded string) */
#define UTF8_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_UTF8_TO_I8(c))
-#define NATIVE_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_LATIN1(c))
+/* Like the above, but its name implies a non-UTF8 input, and is implemented
+ * differently (for no particular reason) */
+#define NATIVE_BYTE_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_LATIN1(c))
+
+/* Like the above, but accepts any UV as input */
+#define UVCHR_IS_INVARIANT(uv) UNI_IS_INVARIANT(NATIVE_TO_UNI(uv))
#define MAX_PORTABLE_UTF8_TWO_BYTE 0x3FF /* constrained by EBCDIC */
-/* The macros in the next sets are used to generate the two utf8 or utfebcdic
- * bytes from an ordinal that is known to fit into two bytes; it must be less
- * than 0x3FF to work across both encodings. */
-/* Nocast allows these to be used in the case label of a switch statement;
- * however this doesn't won't work for ebcdic, and should be avoided. Use
- * regen/unicode_constants instead */
-#define UTF8_TWO_BYTE_HI_nocast(c) I8_TO_NATIVE_UTF8((NATIVE_TO_UNI(c) \
- >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2))
-#define UTF8_TWO_BYTE_LO_nocast(c) I8_TO_NATIVE_UTF8((NATIVE_TO_UNI(c) \
- & UTF_CONTINUATION_MASK) \
- | UTF_CONTINUATION_MARK)
-
-#define UTF8_TWO_BYTE_HI(c) ((U8) (UTF8_TWO_BYTE_HI_nocast(c)))
-#define UTF8_TWO_BYTE_LO(c) ((U8) (UTF8_TWO_BYTE_LO_nocast(c)))
-
-/* This name is used when the source is a single byte (input not checked).
- * These expand identically to the TWO_BYTE versions on ASCII platforms, but
- * use to/from LATIN1 instead of UNI, which on EBCDIC eliminates tests */
-#define UTF8_EIGHT_BIT_HI(c) I8_TO_NATIVE_UTF8((NATIVE_TO_LATIN1(c) \
- >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2))
-#define UTF8_EIGHT_BIT_LO(c) I8_TO_NATIVE_UTF8((NATIVE_TO_LATIN1(c) \
- & UTF_CONTINUATION_MASK) \
- | UTF_CONTINUATION_MARK)
+/* The macros in the next 4 sets are used to generate the two utf8 or utfebcdic
+ * bytes from an ordinal that is known to fit into exactly two (not one) bytes;
+ * it must be no larger than 0x3FF (MAX_PORTABLE_UTF8_TWO_BYTE) to work across
+ * both encodings. */
+
+/* These two are helper macros for the other three sets, and should not be used
+ * directly anywhere else. 'translate_function' is either NATIVE_TO_LATIN1
+ * (which works for code points up to 0xFF) or NATIVE_TO_UNI which works for any
+ * code point.  The HI macro yields the first of the two bytes: the translated
+ * code point shifted right by UTF_ACCUMULATION_SHIFT, OR'd with
+ * UTF_START_MARK(2).  The LO macro yields the second byte: the translated
+ * code point masked with UTF_CONTINUATION_MASK, OR'd with
+ * UTF_CONTINUATION_MARK.  Both results are passed through I8_TO_NATIVE_UTF8;
+ * neither is cast to U8 here, so any cast is left to the macros built on top
+ * of these */
+#define __BASE_TWO_BYTE_HI(c, translate_function) \
+ I8_TO_NATIVE_UTF8((translate_function(c) >> UTF_ACCUMULATION_SHIFT) \
+ | UTF_START_MARK(2))
+#define __BASE_TWO_BYTE_LO(c, translate_function) \
+ I8_TO_NATIVE_UTF8((translate_function(c) & UTF_CONTINUATION_MASK) \
+ | UTF_CONTINUATION_MARK)
+
+/* The next two macros should not be used. They were designed to be usable as
+ * the case label of a switch statement, but this doesn't work for EBCDIC. Use
+ * regen/unicode_constants.pl instead.  They expand to the helper macros with
+ * NATIVE_TO_UNI as the translation, and add neither a cast to U8 nor any
+ * check of the input */
+#define UTF8_TWO_BYTE_HI_nocast(c) __BASE_TWO_BYTE_HI(c, NATIVE_TO_UNI)
+#define UTF8_TWO_BYTE_LO_nocast(c) __BASE_TWO_BYTE_LO(c, NATIVE_TO_UNI)
+
+/* The next two macros are used when the source should be a single byte
+ * character; checked for under DEBUGGING.  The check is the
+ * FITS_IN_8_BITS(c) assertion.  The translation is done with
+ * NATIVE_TO_LATIN1, and the resulting byte is cast to U8.  Note that 'c'
+ * appears in both the assertion and the conversion */
+#define UTF8_EIGHT_BIT_HI(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
+ ((U8) __BASE_TWO_BYTE_HI(c, NATIVE_TO_LATIN1)))
+#define UTF8_EIGHT_BIT_LO(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
+ ((U8) __BASE_TWO_BYTE_LO(c, NATIVE_TO_LATIN1)))
+
+/* These final two macros in the series are used when the source can be any
+ * code point whose UTF-8 is known to occupy 2 bytes; they are less efficient
+ * than the EIGHT_BIT versions on EBCDIC platforms, because they have to
+ * translate with NATIVE_TO_UNI (which works for any code point) instead of
+ * NATIVE_TO_LATIN1 (which only works for code points up to 0xFF).  The
+ * assertion checks that the input does not exceed
+ * MAX_PORTABLE_UTF8_TWO_BYTE; it is bypassed when 'c' is only one byte wide,
+ * as such a value cannot exceed that limit.  We use the bitwise '~' operator
+ * instead of "<=" to avoid getting compiler warnings.
+ * MAX_PORTABLE_UTF8_TWO_BYTE should be exactly all one bits in the lower few
+ * places, so the ~ works.  The result of each macro is cast to U8 */
+#define UTF8_TWO_BYTE_HI(c) \
+ (__ASSERT_((sizeof(c) == 1) \
+ || !(((WIDEST_UTYPE)(c)) & ~MAX_PORTABLE_UTF8_TWO_BYTE)) \
+ ((U8) __BASE_TWO_BYTE_HI(c, NATIVE_TO_UNI)))
+#define UTF8_TWO_BYTE_LO(c) \
+ (__ASSERT_((sizeof(c) == 1) \
+ || !(((WIDEST_UTYPE)(c)) & ~MAX_PORTABLE_UTF8_TWO_BYTE)) \
+ ((U8) __BASE_TWO_BYTE_LO(c, NATIVE_TO_UNI)))
/* This is illegal in any well-formed UTF-8 in both EBCDIC and ASCII
* as it is only in overlongs. */
#define UTF8_WARN_SUPER 0x0400 /* points above the legal max */
/* Code points which never were part of the original UTF-8 standard, the first
- * byte of which is a FE or FF on ASCII platforms. */
+ * byte of which is a FE or FF on ASCII platforms. If the first byte is FF, it
+ * will overflow a 32-bit word. If the first byte is FE, it will overflow a
+ * signed 32-bit word. */
#define UTF8_DISALLOW_FE_FF 0x0800
#define UTF8_WARN_FE_FF 0x1000
#ifdef EBCDIC /* Both versions assume well-formed UTF8 */
# define UTF8_IS_SUPER(s) (NATIVE_UTF8_TO_I8(* (U8*) (s)) >= 0xF9 \
&& (NATIVE_UTF8_TO_I8(* (U8*) (s)) > 0xF9 \
- || (NATIVE_UTF8_TO_I8(* (U8*) ((s)) + 1 >= 0xA2))))
+ || (NATIVE_UTF8_TO_I8(* ((U8*) (s) + 1)) >= 0xA2)))
#else
# define UTF8_IS_SUPER(s) (*(U8*) (s) >= 0xF4 \
&& (*(U8*) (s) > 0xF4 || (*((U8*) (s) + 1) >= 0x90)))
#define UNICODE_IS_SUPER(c) ((c) > PERL_UNICODE_MAX)
#define UNICODE_IS_FE_FF(c) ((c) > 0x7FFFFFFF)
-#ifdef HAS_QUAD
-# define UTF8_QUAD_MAX UINT64_C(0x1000000000)
-#endif
-
#define LATIN_SMALL_LETTER_SHARP_S LATIN_SMALL_LETTER_SHARP_S_NATIVE
#define LATIN_SMALL_LETTER_Y_WITH_DIAERESIS \
LATIN_SMALL_LETTER_Y_WITH_DIAERESIS_NATIVE
#define UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA 0x03C2
#define UNICODE_GREEK_SMALL_LETTER_SIGMA 0x03C3
#define GREEK_SMALL_LETTER_MU 0x03BC
-#define GREEK_CAPITAL_LETTER_MU 0x039C /* Upper and title case of MICRON */
-#define LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS 0x0178 /* Also is title case */
-#define LATIN_CAPITAL_LETTER_SHARP_S 0x1E9E
-#define LATIN_SMALL_LETTER_LONG_S 0x017F
-#define KELVIN_SIGN 0x212A
-#define ANGSTROM_SIGN 0x212B
+#define GREEK_CAPITAL_LETTER_MU 0x039C /* Upper and title case
+ of MICRON */
+#define LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS 0x0178 /* Also is title case */
+#define LATIN_CAPITAL_LETTER_SHARP_S 0x1E9E
+#define LATIN_SMALL_LETTER_LONG_S 0x017F
+#define LATIN_SMALL_LIGATURE_LONG_S_T 0xFB05
+#define LATIN_SMALL_LIGATURE_ST 0xFB06
+#define KELVIN_SIGN 0x212A
+#define ANGSTROM_SIGN 0x212B
#define UNI_DISPLAY_ISPRINT 0x0001
#define UNI_DISPLAY_BACKSLASH 0x0002