X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/ec34087a0a7a2c973993150137f0f8428541e7a0..688ea162f4a70352089e12b46feab20232234ccf:/utfebcdic.h diff --git a/utfebcdic.h b/utfebcdic.h index 3eba83d..5705b96 100644 --- a/utfebcdic.h +++ b/utfebcdic.h @@ -541,8 +541,6 @@ END_EXTERN_C Unicode Bit pattern 1st Byte 2nd Byte 3rd Byte 4th Byte 5th Byte 6th Byte 7th byte U+0000..U+007F 000000000xxxxxxx 0xxxxxxx U+0080..U+009F 00000000100xxxxx 100xxxxx - U+00A0..U+00FF 00000000yyyxxxxx 11000yyy 101xxxxx - U+00A0..U+03FF 000000yyyyyxxxxx 110yyyyy 101xxxxx U+0400..U+3FFF 00zzzzyyyyyxxxxx 1110zzzz 101yyyyy 101xxxxx U+4000..U+3FFFF 0wwwzzzzzyyyyyxxxxx 11110www 101zzzzz 101yyyyy 101xxxxx @@ -562,14 +560,17 @@ END_EXTERN_C (uv) < 0x400000 ? 5 : \ (uv) < 0x4000000 ? 6 : 7 ) - #define UNI_IS_INVARIANT(c) ((c) < 0xA0) -/* UTF-EBCDIC semantic macros - transform back into I8 and then compare */ + +/* UTF-EBCDIC semantic macros - transform back into I8 and then compare + * Comments as to the meaning of each are given at their corresponding utf8.h + * definitions */ #define UTF8_IS_START(c) (NATIVE_TO_UTF(c) >= 0xC5 && NATIVE_TO_UTF(c) != 0xE0) #define UTF8_IS_CONTINUATION(c) ((NATIVE_TO_UTF(c) & 0xE0) == 0xA0) #define UTF8_IS_CONTINUED(c) (NATIVE_TO_UTF(c) >= 0xA0) #define UTF8_IS_DOWNGRADEABLE_START(c) (NATIVE_TO_UTF(c) >= 0xC5 && NATIVE_TO_UTF(c) <= 0xC7) +#define UTF8_IS_ABOVE_LATIN1(c) (NATIVE_TO_I8(c) >= 0xC8) #define UTF_START_MARK(len) (((len) > 7) ? 0xFF : ((U8)(0xFE << (7-(len))))) #define UTF_START_MASK(len) (((len) >= 6) ? 0x01 : (0x1F >> ((len)-2))) @@ -577,6 +578,19 @@ END_EXTERN_C #define UTF_CONTINUATION_MASK ((U8)0x1f) #define UTF_ACCUMULATION_SHIFT 5 +/* How wide can a single UTF-8 encoded character become in bytes. */ +/* NOTE: Strictly speaking Perl's UTF-8 should not be called UTF-8 since UTF-8 + * is an encoding of Unicode, and Unicode's upper limit, 0x10FFFF, can be + * expressed with 5 bytes. However, Perl thinks of UTF-8 as a way to encode + * non-negative integers in a binary format, even those above Unicode */ +#define UTF8_MAXBYTES 7 + +/* The maximum number of UTF-8 bytes a single Unicode character can + * uppercase/lowercase/fold into. Unicode guarantees that the maximum + * expansion is 3 characters. On EBCDIC platforms, the highest Unicode + * character occupies 5 bytes, therefore this number is 15 */ +#define UTF8_MAXBYTES_CASE 15 + /* * Local variables: * c-indentation-style: bsd