This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
utf8.h: Clean up and use START_MARK definition
author Karl Williamson <public@khwilliamson.com>
Sat, 2 Mar 2013 19:12:11 +0000 (12:12 -0700)
committer Karl Williamson <public@khwilliamson.com>
Thu, 29 Aug 2013 15:56:00 +0000 (09:56 -0600)
The previous definition broke good encapsulation rules.  UTF_START_MARK
should return something that fits in a byte; it shouldn't be the caller
that does this.  So the mask is moved into the definition.  This means
the mask is applied only to the branch that can produce something larger
than a byte.  Further, the EBCDIC version can be simplified, since 7 is the
largest possible number of bytes in an EBCDIC UTF8 character.

utf8.h
utfebcdic.h

diff --git a/utf8.h b/utf8.h
index 6a76210..4fc513b 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -204,7 +204,7 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
  * UTF-8 encoded character that give the number of bytes that comprise the
  * character.
  * */
-#define UTF_START_MARK(len) (((len) >  7) ? 0xFF : (0xFE << (7-(len))))
+#define UTF_START_MARK(len) (((len) >  7) ? 0xFF : (0xFF & (0xFE << (7-(len)))))
 
 /* Masks out the initial one bits in a start byte, leaving the real data ones.
  * Doesn't work on an invariant byte */
@@ -340,7 +340,7 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
  * however this doesn't won't work for ebcdic, and should be avoided.  Use
  * regen/unicode_constants instead */
 #define UTF8_TWO_BYTE_HI_nocast(c)     I8_TO_NATIVE_UTF8((NATIVE_TO_UNI(c)     \
-                        >> UTF_ACCUMULATION_SHIFT) | (0xFF & UTF_START_MARK(2)))
+                        >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2))
 #define UTF8_TWO_BYTE_LO_nocast(c)  I8_TO_NATIVE_UTF8((NATIVE_TO_UNI(c)         \
                                                   & UTF_CONTINUATION_MASK)      \
                                                 | UTF_CONTINUATION_MARK)
@@ -352,7 +352,7 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
  * These expand identically to the TWO_BYTE versions on ASCII platforms, but
  * use to/from LATIN1 instead of UNI, which on EBCDIC eliminates tests */
 #define UTF8_EIGHT_BIT_HI(c)   I8_TO_NATIVE_UTF8((NATIVE_TO_LATIN1(c)          \
-                        >> UTF_ACCUMULATION_SHIFT) | (0xFF & UTF_START_MARK(2)))
+                        >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2))
 #define UTF8_EIGHT_BIT_LO(c)   I8_TO_NATIVE_UTF8((NATIVE_TO_LATIN1(c)          \
                                                   & UTF_CONTINUATION_MASK)      \
                                                 | UTF_CONTINUATION_MARK)
diff --git a/utfebcdic.h b/utfebcdic.h
index b5a33f8..0489621 100644 (file)
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -723,7 +723,9 @@ END_EXTERN_C
                                          && NATIVE_UTF8_TO_I8(c) <= 0xC7)
 #define UTF8_IS_ABOVE_LATIN1(c)        (NATIVE_UTF8_TO_I8(c) >= 0xC8)
 
-#define UTF_START_MARK(len) (((len) >  7) ? 0xFF : ((U8)(0xFE << (7-(len)))))
+/* Can't exceed 7 on EBCDIC platforms */
+#define UTF_START_MARK(len) (0xFF & (0xFE << (7-(len))))
+
 #define UTF_START_MASK(len) (((len) >= 6) ? 0x01 : (0x1F >> ((len)-2)))
 #define UTF_CONTINUATION_MARK          0xA0
 #define UTF_CONTINUATION_MASK          ((U8)0x1f)