utf8.h: Add assertions for macros that take chars

author Karl Williamson <khw@cpan.org>

Wed, 26 Apr 2017 16:29:58 +0000 (10:29 -0600)

committer Karl Williamson <khw@cpan.org>

Thu, 1 Jun 2017 13:05:15 +0000 (07:05 -0600)
author Karl Williamson <khw@cpan.org>
Wed, 26 Apr 2017 16:29:58 +0000 (10:29 -0600)
committer Karl Williamson <khw@cpan.org>
Thu, 1 Jun 2017 13:05:15 +0000 (07:05 -0600)
diff --git a/utf8.h b/utf8.h

index affa2d6..b2e338a 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -266,13 +266,15 @@ C<cp> is Unicode if above 255; otherwise is platform-native.
  /* Misleadingly named: is the UTF8-encoded byte 'c' part of a variant sequence
   * in UTF-8?  This is the inverse of UTF8_IS_INVARIANT.  The |0 makes sure this
   * isn't mistakenly called with a ptr argument */
  /* Misleadingly named: is the UTF8-encoded byte 'c' part of a variant sequence
   * in UTF-8?  This is the inverse of UTF8_IS_INVARIANT.  The |0 makes sure this
   * isn't mistakenly called with a ptr argument */
-#define UTF8_IS_CONTINUED(c)        (((U8)((c) | 0)) &  UTF_CONTINUATION_MARK)
+#define UTF8_IS_CONTINUED(c)  (__ASSERT_(FITS_IN_8_BITS(c))                 \
+                               ((U8)((c) | 0)) &  UTF_CONTINUATION_MARK)
  
  /* Is the byte 'c' the first byte of a multi-byte UTF-8 encoded sequence?
   * This doesn't catch invariants (they are single-byte).  It also excludes the
   * illegal overlong sequences that begin with C0 and C1.  The |0 makes sure
   * this isn't mistakenly called with a ptr argument */
  
  /* Is the byte 'c' the first byte of a multi-byte UTF-8 encoded sequence?
   * This doesn't catch invariants (they are single-byte).  It also excludes the
   * illegal overlong sequences that begin with C0 and C1.  The |0 makes sure
   * this isn't mistakenly called with a ptr argument */
-#define UTF8_IS_START(c)            (((U8)((c) | 0)) >= 0xc2)
+#define UTF8_IS_START(c)      (__ASSERT_(FITS_IN_8_BITS(c))                 \
+                               ((U8)((c) | 0)) >= 0xc2)
  
  /* For use in UTF8_IS_CONTINUATION() below */
  #define UTF_IS_CONTINUATION_MASK    0xC0
  
  /* For use in UTF8_IS_CONTINUATION() below */
  #define UTF_IS_CONTINUATION_MASK    0xC0
@@ -280,20 +282,22 @@ C<cp> is Unicode if above 255; otherwise is platform-native.
  /* Is the byte 'c' part of a multi-byte UTF-8 encoded sequence, and not the
   * first byte thereof?  The |0 makes sure this isn't mistakenly called with a
   * ptr argument */
  /* Is the byte 'c' part of a multi-byte UTF-8 encoded sequence, and not the
   * first byte thereof?  The |0 makes sure this isn't mistakenly called with a
   * ptr argument */
-#define UTF8_IS_CONTINUATION(c)                                             \
-    ((((U8)((c) | 0)) & UTF_IS_CONTINUATION_MASK) == UTF_CONTINUATION_MARK)
+#define UTF8_IS_CONTINUATION(c)     (__ASSERT_(FITS_IN_8_BITS(c))           \
+     (((U8)((c) | 0)) & UTF_IS_CONTINUATION_MASK) == UTF_CONTINUATION_MARK)
  
  /* Is the UTF8-encoded byte 'c' the first byte of a two byte sequence?  Use
   * UTF8_IS_NEXT_CHAR_DOWNGRADEABLE() instead if the input isn't known to
   * be well-formed.  Masking with 0xfe allows the low bit to be 0 or 1; thus
   * this matches 0xc[23].  The |0 makes sure this isn't mistakenly called with a
   * ptr argument */
  
  /* Is the UTF8-encoded byte 'c' the first byte of a two byte sequence?  Use
   * UTF8_IS_NEXT_CHAR_DOWNGRADEABLE() instead if the input isn't known to
   * be well-formed.  Masking with 0xfe allows the low bit to be 0 or 1; thus
   * this matches 0xc[23].  The |0 makes sure this isn't mistakenly called with a
   * ptr argument */
-#define UTF8_IS_DOWNGRADEABLE_START(c) ((((U8)((c) | 0)) & 0xfe) == 0xc2)
+#define UTF8_IS_DOWNGRADEABLE_START(c) (__ASSERT_(FITS_IN_8_BITS(c))       \
+                                         (((U8)((c) | 0)) & 0xfe) == 0xc2)
  
  /* Is the UTF8-encoded byte 'c' the first byte of a sequence of bytes that
   * represent a code point > 255?  The |0 makes sure this isn't mistakenly
   * called with a ptr argument */
  
  /* Is the UTF8-encoded byte 'c' the first byte of a sequence of bytes that
   * represent a code point > 255?  The |0 makes sure this isn't mistakenly
   * called with a ptr argument */
-#define UTF8_IS_ABOVE_LATIN1(c)     (((U8)((c) | 0)) >= 0xc4)
+#define UTF8_IS_ABOVE_LATIN1(c)     (__ASSERT_(FITS_IN_8_BITS(c))           \
+                                     ((U8)((c) | 0)) >= 0xc4)
  
  /* This is the number of low-order bits a continuation byte in a UTF-8 encoded
   * sequence contributes to the specification of the code point.  In the bit
  
  /* This is the number of low-order bits a continuation byte in a UTF-8 encoded
   * sequence contributes to the specification of the code point.  In the bit
@@ -309,7 +313,8 @@ C<cp> is Unicode if above 255; otherwise is platform-native.
   * problematic in some contexts.  This allows code that needs to check for
   * those to quickly exclude the vast majority of code points it will
   * encounter */
   * problematic in some contexts.  This allows code that needs to check for
   * those to quickly exclude the vast majority of code points it will
   * encounter */
-#define isUTF8_POSSIBLY_PROBLEMATIC(c) ((U8) c >= 0xED)
+#define isUTF8_POSSIBLY_PROBLEMATIC(c) (__ASSERT_(FITS_IN_8_BITS(c))        \
+                                        (U8) c >= 0xED)
  
  /* A helper macro for isUTF8_CHAR, so use that one instead of this.  This was
   * generated by regen/regcharclass.pl, and then moved here.  Then it was
  
  /* A helper macro for isUTF8_CHAR, so use that one instead of this.  This was
   * generated by regen/regcharclass.pl, and then moved here.  Then it was
@@ -529,7 +534,8 @@ encoded as UTF-8.  C<cp> is a native (ASCII or EBCDIC) code point if less than
   * that this is asymmetric on EBCDIC platforms, in that the 'new' parameter is
   * the UTF-EBCDIC byte, whereas the 'old' parameter is a Unicode (not EBCDIC)
   * code point in process of being generated */
   * that this is asymmetric on EBCDIC platforms, in that the 'new' parameter is
   * the UTF-EBCDIC byte, whereas the 'old' parameter is a Unicode (not EBCDIC)
   * code point in process of being generated */
-#define UTF8_ACCUMULATE(old, new) (((old) << UTF_ACCUMULATION_SHIFT)           \
+#define UTF8_ACCUMULATE(old, new) (__ASSERT_(FITS_IN_8_BITS(new))              \
+                                   ((old) << UTF_ACCUMULATION_SHIFT)           \
                                     | ((NATIVE_UTF8_TO_I8((U8)new))             \
                                         & UTF_CONTINUATION_MASK))
  
                                     | ((NATIVE_UTF8_TO_I8((U8)new))             \
                                         & UTF_CONTINUATION_MASK))
  
@@ -571,8 +577,10 @@ encoded as UTF-8.  C<cp> is a native (ASCII or EBCDIC) code point if less than
   * Note that the result can be larger than 255 if the input character is not
   * downgradable */
  #define TWO_BYTE_UTF8_TO_NATIVE(HI, LO) \
   * Note that the result can be larger than 255 if the input character is not
   * downgradable */
  #define TWO_BYTE_UTF8_TO_NATIVE(HI, LO) \
-    ( __ASSERT_(PL_utf8skip[HI] == 2)                                           \
-      __ASSERT_(UTF8_IS_CONTINUATION(LO))                                       \
+    (__ASSERT_(FITS_IN_8_BITS(HI))                                              \
+     __ASSERT_(FITS_IN_8_BITS(LO))                                              \
+     __ASSERT_(PL_utf8skip[HI] == 2)                                            \
+     __ASSERT_(UTF8_IS_CONTINUATION(LO))                                        \
       UNI_TO_NATIVE(UTF8_ACCUMULATE((NATIVE_UTF8_TO_I8(HI) & UTF_START_MASK(2)), \
                                     (LO))))
  
       UNI_TO_NATIVE(UTF8_ACCUMULATE((NATIVE_UTF8_TO_I8(HI) & UTF_START_MASK(2)), \
                                     (LO))))
author	Karl Williamson <khw@cpan.org>
	Wed, 26 Apr 2017 16:29:58 +0000 (10:29 -0600)
committer	Karl Williamson <khw@cpan.org>
	Thu, 1 Jun 2017 13:05:15 +0000 (07:05 -0600)