Replace two ugly casts in reg_recode() calls.

[perl5.git] / utf8.h
diff --git a/utf8.h b/utf8.h

index 99545a2..dcc9b86 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -127,8 +127,8 @@ END_EXTERN_C
  
  /* Native character to/from iso-8859-1.  Are the identity functions on ASCII
   * platforms */
-#define NATIVE_TO_LATIN1(ch)     (ch)
-#define LATIN1_TO_NATIVE(ch)     (ch)
+#define NATIVE_TO_LATIN1(ch)     (__ASSERT_(FITS_IN_8_BITS(ch)) (ch))
+#define LATIN1_TO_NATIVE(ch)     (__ASSERT_(FITS_IN_8_BITS(ch)) (ch))
  
  /* I8 is an intermediate version of UTF-8 used only in UTF-EBCDIC.  We thus
   * consider it to be identical to UTF-8 on ASCII platforms.  Strictly speaking
@@ -136,8 +136,8 @@ END_EXTERN_C
   * because they are 8-bit encodings that serve the same purpose in Perl, and
   * rarely do we need to distinguish them.  The term "NATIVE_UTF8" applies to
   * whichever one is applicable on the current platform */
-#define NATIVE_UTF8_TO_I8(ch) (ch)
-#define I8_TO_NATIVE_UTF8(ch) (ch)
+#define NATIVE_UTF8_TO_I8(ch) (__ASSERT_(FITS_IN_8_BITS(ch)) (ch))
+#define I8_TO_NATIVE_UTF8(ch) (__ASSERT_(FITS_IN_8_BITS(ch)) (ch))
  
  /* Transforms in wide UV chars */
  #define UNI_TO_NATIVE(ch)        (ch)
@@ -192,15 +192,15 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
  
  /* Is the representation of the Unicode code point 'cp' the same regardless of
   * being encoded in UTF-8 or not? */
-#define UNI_IS_INVARIANT(cp)     isASCII(cp)
+#define OFFUNI_IS_INVARIANT(cp)     isASCII(cp)
  
  /* Is the representation of the code point 'cp' the same regardless of
   * being encoded in UTF-8 or not?  'cp' is native if < 256; Unicode otherwise
   * */
-#define UVCHR_IS_INVARIANT(uv)      UNI_IS_INVARIANT(uv)
+#define UVCHR_IS_INVARIANT(cp)      OFFUNI_IS_INVARIANT(cp)
  
-/* Is the UTF8-encoded byte 'c' part of a variant sequence in UTF-8?  This is
- * the inverse of UTF8_IS_INVARIANT */
+/* Misleadingly named: is the UTF8-encoded byte 'c' part of a variant sequence
+ * in UTF-8?  This is the inverse of UTF8_IS_INVARIANT */
  #define UTF8_IS_CONTINUED(c)        (((U8)c) &  0x80)
  
  /* Is the byte 'c' the first byte of a multi-byte UTF8-8 encoded sequence?
@@ -309,6 +309,12 @@ encoded as UTF-8.  C<cp> is a native (ASCII or EBCDIC) code point if less than
   */
  #define UVCHR_SKIP(uv) OFFUNISKIP(uv)
  
+/* Surrogates, non-character code points and above-Unicode code points are
+ * problematic in some contexts.  This allows code that needs to check for
+ * those to quickly exclude the vast majority of code points it will
+ * encounter.  'c' is fully parenthesized so any expression may be passed */
+#define isUTF8_POSSIBLY_PROBLEMATIC(c) (((U8) (c)) >= 0xED)
+
  #endif /* EBCDIC vs ASCII */
  
  /* Rest of these are attributes of Unicode and perl's internals rather than the
@@ -395,13 +401,20 @@ only) byte is pointed to by C<s>.
  #define UTF8SKIP(s)  PL_utf8skip[*(const U8*)(s)]
  #define UTF8_SKIP(s) UTF8SKIP(s)
  
+/* Most code that says 'UNI_' really means the native value for code points up
+ * through 255 */
+#define UNI_IS_INVARIANT(cp)   UVCHR_IS_INVARIANT(cp)
+
  /* Is the byte 'c' the same character when encoded in UTF-8 as when not.  This
   * works on both UTF-8 encoded strings and non-encoded, as it returns TRUE in
- * each for the exact same set of bit patterns.  (And it works on any byte in a
- * UTF-8 encoded string) */
-#define UTF8_IS_INVARIANT(c)           UNI_IS_INVARIANT(NATIVE_UTF8_TO_I8(c))
-
-/* Like the above, but its name implies a non-UTF8 input */
+ * each for the exact same set of bit patterns.  It is valid on a subset of
+ * what UVCHR_IS_INVARIANT is valid on, so can just use that; and the compiler
+ * should optimize out anything extraneous given the implementation of the
+ * latter */
+#define UTF8_IS_INVARIANT(c)   UVCHR_IS_INVARIANT(c)
+
+/* Like the above, but its name implies a non-UTF8 input, which as the comments
+ * above show, doesn't matter as to its implementation */
  #define NATIVE_BYTE_IS_INVARIANT(c)    UVCHR_IS_INVARIANT(c)
  
  #define MAX_PORTABLE_UTF8_TWO_BYTE 0x3FF    /* constrained by EBCDIC */
@@ -538,7 +551,9 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
  #define UTF8_ALLOW_FFFF 0
  #define UTF8_ALLOW_SURROGATE 0
  
-#define UTF8_DISALLOW_ILLEGAL_INTERCHANGE (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_NONCHAR|UTF8_DISALLOW_SURROGATE|UTF8_DISALLOW_FE_FF)
+#define UTF8_DISALLOW_ILLEGAL_INTERCHANGE                                      \
+                                (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_NONCHAR     \
+                                 |UTF8_DISALLOW_SURROGATE|UTF8_DISALLOW_FE_FF)
  #define UTF8_WARN_ILLEGAL_INTERCHANGE \
         (UTF8_WARN_SUPER|UTF8_WARN_NONCHAR|UTF8_WARN_SURROGATE|UTF8_WARN_FE_FF)
  #define UTF8_ALLOW_ANY \
@@ -549,16 +564,6 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
  #define UTF8_ALLOW_DEFAULT             (ckWARN(WARN_UTF8) ? 0 : \
                                          UTF8_ALLOW_ANYUV)
  
-/* Surrogates, non-character code points and above-Unicode code points are
- * problematic in some contexts.  This allows code that needs to check for
- * those to to quickly exclude the vast majority of code points it will
- * encounter */
-#ifdef EBCDIC
-#   define isUTF8_POSSIBLY_PROBLEMATIC(c) (NATIVE_UTF8_TO_I8(c) >= 0xF1)
-#else
-#   define isUTF8_POSSIBLY_PROBLEMATIC(c) ((U8) c >= 0xED)
-#endif
-
  /* Several of the macros below have a second parameter that is currently
   * unused; but could be used in the future to make sure that the input is
   * well-formed. */