From: Karl Williamson Date: Wed, 26 Apr 2017 16:29:58 +0000 (-0600) Subject: utf8.h: Add assertions for macros that take chars X-Git-Tag: v5.27.1~169 X-Git-Url: https://perl5.git.perl.org/perl5.git/commitdiff_plain/a6951642ede4abe605dcf0e94b74948e0a60a56b?hp=a5ba252751fc7fd7b9d43d0ad4491eb68a14a4a6 utf8.h: Add assertions for macros that take chars This is inspired by [perl #131190]. The UTF-8 macros whose parameters are characters now have assertions that verify they are not being called with something that won't fit in a char. These assertions should be getting optimized out if the input type is a char or U8. --- diff --git a/utf8.h b/utf8.h index affa2d6..b2e338a 100644 --- a/utf8.h +++ b/utf8.h @@ -266,13 +266,15 @@ C is Unicode if above 255; otherwise is platform-native. /* Misleadingly named: is the UTF8-encoded byte 'c' part of a variant sequence * in UTF-8? This is the inverse of UTF8_IS_INVARIANT. The |0 makes sure this * isn't mistakenly called with a ptr argument */ -#define UTF8_IS_CONTINUED(c) (((U8)((c) | 0)) & UTF_CONTINUATION_MARK) +#define UTF8_IS_CONTINUED(c) (__ASSERT_(FITS_IN_8_BITS(c)) \ + ((U8)((c) | 0)) & UTF_CONTINUATION_MARK) /* Is the byte 'c' the first byte of a multi-byte UTF8-8 encoded sequence? * This doesn't catch invariants (they are single-byte). It also excludes the * illegal overlong sequences that begin with C0 and C1. The |0 makes sure * this isn't mistakenly called with a ptr argument */ -#define UTF8_IS_START(c) (((U8)((c) | 0)) >= 0xc2) +#define UTF8_IS_START(c) (__ASSERT_(FITS_IN_8_BITS(c)) \ + ((U8)((c) | 0)) >= 0xc2) /* For use in UTF8_IS_CONTINUATION() below */ #define UTF_IS_CONTINUATION_MASK 0xC0 @@ -280,20 +282,22 @@ C is Unicode if above 255; otherwise is platform-native. /* Is the byte 'c' part of a multi-byte UTF8-8 encoded sequence, and not the * first byte thereof? 
The |0 makes sure this isn't mistakenly called with a * ptr argument */ -#define UTF8_IS_CONTINUATION(c) \ - ((((U8)((c) | 0)) & UTF_IS_CONTINUATION_MASK) == UTF_CONTINUATION_MARK) +#define UTF8_IS_CONTINUATION(c) (__ASSERT_(FITS_IN_8_BITS(c)) \ + (((U8)((c) | 0)) & UTF_IS_CONTINUATION_MASK) == UTF_CONTINUATION_MARK) /* Is the UTF8-encoded byte 'c' the first byte of a two byte sequence? Use * UTF8_IS_NEXT_CHAR_DOWNGRADEABLE() instead if the input isn't known to * be well-formed. Masking with 0xfe allows the low bit to be 0 or 1; thus * this matches 0xc[23]. The |0 makes sure this isn't mistakenly called with a * ptr argument */ -#define UTF8_IS_DOWNGRADEABLE_START(c) ((((U8)((c) | 0)) & 0xfe) == 0xc2) +#define UTF8_IS_DOWNGRADEABLE_START(c) (__ASSERT_(FITS_IN_8_BITS(c)) \ + (((U8)((c) | 0)) & 0xfe) == 0xc2) /* Is the UTF8-encoded byte 'c' the first byte of a sequence of bytes that * represent a code point > 255? The |0 makes sure this isn't mistakenly * called with a ptr argument */ -#define UTF8_IS_ABOVE_LATIN1(c) (((U8)((c) | 0)) >= 0xc4) +#define UTF8_IS_ABOVE_LATIN1(c) (__ASSERT_(FITS_IN_8_BITS(c)) \ + ((U8)((c) | 0)) >= 0xc4) /* This is the number of low-order bits a continuation byte in a UTF-8 encoded * sequence contributes to the specification of the code point. In the bit @@ -309,7 +313,8 @@ C is Unicode if above 255; otherwise is platform-native. * problematic in some contexts. This allows code that needs to check for * those to to quickly exclude the vast majority of code points it will * encounter */ -#define isUTF8_POSSIBLY_PROBLEMATIC(c) ((U8) c >= 0xED) +#define isUTF8_POSSIBLY_PROBLEMATIC(c) (__ASSERT_(FITS_IN_8_BITS(c)) \ + ((U8) (c)) >= 0xED) /* A helper macro for isUTF8_CHAR, so use that one instead of this. This was * generated by regen/regcharclass.pl, and then moved here. Then it was @@ -529,7 +534,8 @@ encoded as UTF-8.
C is a native (ASCII or EBCDIC) code point if less than * that this is asymmetric on EBCDIC platforms, in that the 'new' parameter is * the UTF-EBCDIC byte, whereas the 'old' parameter is a Unicode (not EBCDIC) * code point in process of being generated */ -#define UTF8_ACCUMULATE(old, new) (((old) << UTF_ACCUMULATION_SHIFT) \ +#define UTF8_ACCUMULATE(old, new) (__ASSERT_(FITS_IN_8_BITS(new)) \ + ((old) << UTF_ACCUMULATION_SHIFT) \ - | ((NATIVE_UTF8_TO_I8((U8)new)) \ + | ((NATIVE_UTF8_TO_I8((U8)(new))) \ & UTF_CONTINUATION_MASK)) @@ -571,8 +577,10 @@ encoded as UTF-8. C is a native (ASCII or EBCDIC) code point if less than * Note that the result can be larger than 255 if the input character is not * downgradable */ #define TWO_BYTE_UTF8_TO_NATIVE(HI, LO) \ - ( __ASSERT_(PL_utf8skip[HI] == 2) \ - __ASSERT_(UTF8_IS_CONTINUATION(LO)) \ + (__ASSERT_(FITS_IN_8_BITS(HI)) \ + __ASSERT_(FITS_IN_8_BITS(LO)) \ + __ASSERT_(PL_utf8skip[(U8) (HI)] == 2) \ + __ASSERT_(UTF8_IS_CONTINUATION(LO)) \ UNI_TO_NATIVE(UTF8_ACCUMULATE((NATIVE_UTF8_TO_I8(HI) & UTF_START_MASK(2)), \ (LO))))