/* Native character to/from iso-8859-1. Are the identity functions on ASCII
* platforms */
-#define NATIVE_TO_LATIN1(ch) (ch)
-#define LATIN1_TO_NATIVE(ch) (ch)
+/* Each yields 'ch' unchanged, after handing FITS_IN_8_BITS(ch) to __ASSERT_.
+ * No separator is written between the __ASSERT_(...) call and '(ch)', so
+ * __ASSERT_ must supply its own or expand to nothing (presumably it is only
+ * active in debugging builds -- confirm).  NOTE(review): 'ch' appears twice
+ * in the expansion; check whether arguments with side effects are safe. */
+#define NATIVE_TO_LATIN1(ch) (__ASSERT_(FITS_IN_8_BITS(ch)) (ch))
+#define LATIN1_TO_NATIVE(ch) (__ASSERT_(FITS_IN_8_BITS(ch)) (ch))
/* I8 is an intermediate version of UTF-8 used only in UTF-EBCDIC. We thus
* consider it to be identical to UTF-8 on ASCII platforms. Strictly speaking
+ * the two are distinct encodings, but we treat them alike here
* because they are 8-bit encodings that serve the same purpose in Perl, and
* rarely do we need to distinguish them. The term "NATIVE_UTF8" applies to
* whichever one is applicable on the current platform */
-#define NATIVE_UTF8_TO_I8(ch) (ch)
-#define I8_TO_NATIVE_UTF8(ch) (ch)
+/* Both pass the byte through unchanged.  The FITS_IN_8_BITS(ch) test handed
+ * to __ASSERT_ presumably flags, where assertions are compiled in, any
+ * argument too wide for a byte -- TODO confirm against __ASSERT_ */
+#define NATIVE_UTF8_TO_I8(ch) (__ASSERT_(FITS_IN_8_BITS(ch)) (ch))
+#define I8_TO_NATIVE_UTF8(ch) (__ASSERT_(FITS_IN_8_BITS(ch)) (ch))
/* Transforms in wide UV chars */
+/* Expands to its argument unchanged in this branch of the EBCDIC-vs-ASCII
+ * conditional */
#define UNI_TO_NATIVE(ch) (ch)
/* Is the representation of the Unicode code point 'cp' the same regardless of
* being encoded in UTF-8 or not? */
-#define UNI_IS_INVARIANT(cp) isASCII(cp)
+/* In this branch of the EBCDIC-vs-ASCII conditional the answer is simply
+ * whether 'cp' satisfies isASCII() */
+#define OFFUNI_IS_INVARIANT(cp) isASCII(cp)
/* Is the representation of the code point 'cp' the same regardless of
* being encoded in UTF-8 or not? 'cp' is native if < 256; Unicode otherwise
* */
-#define UVCHR_IS_INVARIANT(uv) UNI_IS_INVARIANT(uv)
+/* Defers to OFFUNI_IS_INVARIANT here; the parameter is named 'cp' to match
+ * the description above */
+#define UVCHR_IS_INVARIANT(cp) OFFUNI_IS_INVARIANT(cp)
-/* Is the UTF8-encoded byte 'c' part of a variant sequence in UTF-8? This is
- * the inverse of UTF8_IS_INVARIANT */
+/* Misleadingly named: is the UTF8-encoded byte 'c' part of a variant sequence
+ * in UTF-8? This is the inverse of UTF8_IS_INVARIANT.  Evaluates to nonzero
+ * when the 0x80 bit of 'c', taken as a U8, is set.  'c' is parenthesized so
+ * that the cast applies to the whole argument expression rather than to its
+ * first operand only. */
-#define UTF8_IS_CONTINUED(c) (((U8)c) & 0x80)
+#define UTF8_IS_CONTINUED(c) (((U8)(c)) & 0x80)
-/* Is the byte 'c' the first byte of a multi-byte UTF8-8 encoded sequence?
+/* Is the byte 'c' the first byte of a multi-byte UTF-8 encoded sequence?
*/
+/* Expands to OFFUNISKIP(uv) in this branch of the EBCDIC-vs-ASCII
+ * conditional; presumably the number of bytes needed to encode 'uv' --
+ * confirm against OFFUNISKIP */
#define UVCHR_SKIP(uv) OFFUNISKIP(uv)
+/* Surrogates, non-character code points and above-Unicode code points are
+ * problematic in some contexts. This allows code that needs to check for
+ * those to quickly exclude the vast majority of code points it will
+ * encounter: only a byte that is 0xED or above once converted to U8 passes.
+ * 'c' is parenthesized so that the cast applies to the whole argument
+ * expression rather than to its first operand only. */
+#define isUTF8_POSSIBLY_PROBLEMATIC(c) ((U8) (c) >= 0xED)
+
#endif /* EBCDIC vs ASCII */
/* Rest of these are attributes of Unicode and perl's internals rather than the
+/* Looks up the first byte at 's' (read as a U8) in the PL_utf8skip table;
+ * presumably the entry is the length in bytes of the sequence that byte
+ * begins -- confirm against the table's definition */
#define UTF8SKIP(s) PL_utf8skip[*(const U8*)(s)]
+/* Alternate spelling of UTF8SKIP */
#define UTF8_SKIP(s) UTF8SKIP(s)
+/* Most code that says 'UNI_' really means the native value for code points up
+ * through 255, so this name is defined as a plain alias for
+ * UVCHR_IS_INVARIANT */
+#define UNI_IS_INVARIANT(cp) UVCHR_IS_INVARIANT(cp)
+
/* Is the byte 'c' the same character when encoded in UTF-8 as when not. This
* works on both UTF-8 encoded strings and non-encoded, as it returns TRUE in
- * each for the exact same set of bit patterns. (And it works on any byte in a
- * UTF-8 encoded string) */
-#define UTF8_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_UTF8_TO_I8(c))
-
-/* Like the above, but its name implies a non-UTF8 input */
+ * each for the exact same set of bit patterns. It is valid on a subset of
+ * what UVCHR_IS_INVARIANT is valid on, so can just use that; and the compiler
+ * should optimize out anything extraneous given the implementation of the
+ * latter.
+ * NOTE(review): the definition this replaces passed 'c' through
+ * NATIVE_UTF8_TO_I8 first; the new one does not.  Confirm the two agree on
+ * every platform this definition is compiled for. */
+#define UTF8_IS_INVARIANT(c) UVCHR_IS_INVARIANT(c)
+
+/* Like the above, but its name implies a non-UTF8 input, which as the comments
+ * above show, doesn't matter as to its implementation: both expand to
+ * UVCHR_IS_INVARIANT(c) */
#define NATIVE_BYTE_IS_INVARIANT(c) UVCHR_IS_INVARIANT(c)
+/* Presumably the largest code point whose encoding fits in two bytes on
+ * every supported platform -- confirm */
#define MAX_PORTABLE_UTF8_TWO_BYTE 0x3FF /* constrained by EBCDIC */
+/* Both are 0, so OR-ing them into a flags word has no effect; presumably kept
+ * only so that existing callers still compile -- confirm */
#define UTF8_ALLOW_FFFF 0
#define UTF8_ALLOW_SURROGATE 0
-#define UTF8_DISALLOW_ILLEGAL_INTERCHANGE (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_NONCHAR|UTF8_DISALLOW_SURROGATE|UTF8_DISALLOW_FE_FF)
+/* OR of the SUPER, NONCHAR, SURROGATE and FE_FF disallow flags; split across
+ * lines only to shorten the line, the value is unchanged */
+#define UTF8_DISALLOW_ILLEGAL_INTERCHANGE \
+ (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_NONCHAR \
+ |UTF8_DISALLOW_SURROGATE|UTF8_DISALLOW_FE_FF)
+/* OR of the SUPER, NONCHAR, SURROGATE and FE_FF warning flags */
#define UTF8_WARN_ILLEGAL_INTERCHANGE \
(UTF8_WARN_SUPER|UTF8_WARN_NONCHAR|UTF8_WARN_SURROGATE|UTF8_WARN_FE_FF)
#define UTF8_ALLOW_ANY \
#define UTF8_ALLOW_DEFAULT (ckWARN(WARN_UTF8) ? 0 : \
UTF8_ALLOW_ANYUV)
-/* Surrogates, non-character code points and above-Unicode code points are
- * problematic in some contexts. This allows code that needs to check for
- * those to to quickly exclude the vast majority of code points it will
- * encounter */
-#ifdef EBCDIC
-# define isUTF8_POSSIBLY_PROBLEMATIC(c) (NATIVE_UTF8_TO_I8(c) >= 0xF1)
-#else
-# define isUTF8_POSSIBLY_PROBLEMATIC(c) ((U8) c >= 0xED)
-#endif
-
/* Several of the macros below have a second parameter that is currently
* unused; but could be used in the future to make sure that the input is
* well-formed. */