- * each for the exact same set of bit patterns. (And it works on any byte in a
- * UTF-8 encoded string) */
-#define UTF8_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_UTF8_TO_I8(c))
-
-#define NATIVE_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_LATIN1(c))
-
-#define MAX_PORTABLE_UTF8_TWO_BYTE 0x3FF /* constrained by EBCDIC */
-
-/* The macros in the next sets are used to generate the two utf8 or utfebcdic
- * bytes from an ordinal that is known to fit into two bytes; it must be less
- * than 0x3FF to work across both encodings. */
-/* Nocast allows these to be used in the case label of a switch statement;
- * however this doesn't won't work for ebcdic, and should be avoided. Use
- * regen/unicode_constants instead */
-#define UTF8_TWO_BYTE_HI_nocast(c) I8_TO_NATIVE_UTF8((NATIVE_TO_UNI(c) \
- >> UTF_ACCUMULATION_SHIFT) | (0xFF & UTF_START_MARK(2)))
-#define UTF8_TWO_BYTE_LO_nocast(c) I8_TO_NATIVE_UTF8((NATIVE_TO_UNI(c) \
- & UTF_CONTINUATION_MASK) \
- | UTF_CONTINUATION_MARK)
-
-#define UTF8_TWO_BYTE_HI(c) ((U8) (UTF8_TWO_BYTE_HI_nocast(c)))
-#define UTF8_TWO_BYTE_LO(c) ((U8) (UTF8_TWO_BYTE_LO_nocast(c)))
-
-/* This name is used when the source is a single byte (input not checked).
- * These expand identically to the TWO_BYTE versions on ASCII platforms, but
- * use to/from LATIN1 instead of UNI, which on EBCDIC eliminates tests */
-#define UTF8_EIGHT_BIT_HI(c) I8_TO_NATIVE_UTF8((NATIVE_TO_LATIN1(c) \
- >> UTF_ACCUMULATION_SHIFT) | (0xFF & UTF_START_MARK(2)))
-#define UTF8_EIGHT_BIT_LO(c) I8_TO_NATIVE_UTF8((NATIVE_TO_LATIN1(c) \
- & UTF_CONTINUATION_MASK) \
- | UTF_CONTINUATION_MARK)
+ * each for the exact same set of bit patterns. It is valid on a subset of
+ * what UVCHR_IS_INVARIANT is valid on, so it can just use that; the compiler
+ * should optimize out anything extraneous given the implementation of the
+ * latter.  The '| 0' makes a mistaken pointer argument fail to compile,
+ * since bitwise OR is not defined for pointer operands */
+#define UTF8_IS_INVARIANT(c) UVCHR_IS_INVARIANT((c) | 0)
+
+/* Like the above, but named for callers whose input is a native (non-UTF-8)
+ * byte; per the comments above, the encoding does not affect the test */
+#define NATIVE_BYTE_IS_INVARIANT(c) UVCHR_IS_INVARIANT(c)
+
+/* The macros in the next 4 sets are used to generate the two utf8 or utfebcdic
+ * bytes from an ordinal that is known to fit into exactly two (not one) bytes;
+ * it must be no larger than 0x3FF to work across both encodings. */
+
+/* Helpers for the three sets below; do not use them directly anywhere else.
+ * 'translate_function' is NATIVE_TO_LATIN1 (code points up through 0xFF) or
+ * NATIVE_TO_UNI (any code point).  'c' must not be invariant (asserted), and
+ * is expanded more than once, so it should be free of side effects */
+#define __BASE_TWO_BYTE_HI(c, translate_function) \
+ (__ASSERT_(! UVCHR_IS_INVARIANT(c)) \
+ I8_TO_NATIVE_UTF8((translate_function(c) >> UTF_ACCUMULATION_SHIFT) \
+ | UTF_START_MARK(2)))
+#define __BASE_TWO_BYTE_LO(c, translate_function) \
+ (__ASSERT_(! UVCHR_IS_INVARIANT(c)) \
+ I8_TO_NATIVE_UTF8((translate_function(c) & UTF_CONTINUATION_MASK) \
+ | UTF_CONTINUATION_MARK))
+
+/* The next two macros should not be used.  They were designed to be usable
+ * as the case label of a switch statement, but this doesn't work on EBCDIC;
+ * use regen/unicode_constants.pl instead.  'c' gets no upper-bound check */
+#define UTF8_TWO_BYTE_HI_nocast(c) __BASE_TWO_BYTE_HI(c, NATIVE_TO_UNI)
+#define UTF8_TWO_BYTE_LO_nocast(c) __BASE_TWO_BYTE_LO(c, NATIVE_TO_UNI)
+
+/* The next two macros are for when 'c' should be a single-byte character
+ * that is not invariant; both conditions are asserted under DEBUGGING */
+#define UTF8_EIGHT_BIT_HI(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
+ ( __BASE_TWO_BYTE_HI(c, NATIVE_TO_LATIN1)))
+#define UTF8_EIGHT_BIT_LO(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
+ (__BASE_TWO_BYTE_LO(c, NATIVE_TO_LATIN1)))
+
+/* These final two macros in the series are used when the source can be any
+ * code point whose UTF-8 is known to occupy 2 bytes; they are less efficient
+ * than the EIGHT_BIT versions on EBCDIC platforms.  The range check uses the
+ * bitwise '~' operator instead of "<=" to avoid getting compiler warnings;
+ * that works because MAX_UTF8_TWO_BYTE should be exactly all one bits in the
+ * lower few places.  The check is skipped when sizeof(c) is 1 */
+#define UTF8_TWO_BYTE_HI(c) \
+ (__ASSERT_((sizeof(c) == 1) \
+ || !(((WIDEST_UTYPE)(c)) & ~MAX_UTF8_TWO_BYTE)) \
+ (__BASE_TWO_BYTE_HI(c, NATIVE_TO_UNI)))
+#define UTF8_TWO_BYTE_LO(c) \
+ (__ASSERT_((sizeof(c) == 1) \
+ || !(((WIDEST_UTYPE)(c)) & ~MAX_UTF8_TWO_BYTE)) \
+ (__BASE_TWO_BYTE_LO(c, NATIVE_TO_UNI)))