X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/2d1545e53e75b1c3ae16ad055ae011e2e015e0c3..a7db2e35a577614d7a6eaf05114b2d6224f0ab33:/utfebcdic.h diff --git a/utfebcdic.h b/utfebcdic.h index 5912b3a..10b666a 100644 --- a/utfebcdic.h +++ b/utfebcdic.h @@ -27,17 +27,25 @@ * invariant byte starts with 0 starts with 0 or 100 * continuation byte starts with 10 starts with 101 * start byte same in both: if the code point requires N bytes, - * then the leading N bits are 1, followed by a 0. (No - * trailing 0 for the very largest possible allocation - * in I8, far beyond the current Unicode standard's - * max, as shown in the comment later in this file.) + * then the leading N bits are 1, followed by a 0. If + * all 8 bits in the first byte are 1, the code point + * will occupy 14 bytes (compared to 13 in Perl's + * extended UTF-8). This is incompatible with what + * tr16 implies should be the representation of code + * points 2**30 and above, but allows Perl to be able + * to represent all code points that fit in a 64-bit + * word in either our extended UTF-EBCDIC or UTF-8. * 3) Use the algorithm in tr16 to convert each byte from step 2 into * final UTF-EBCDIC. This is done by table lookup from a table * constructed from the algorithm, reproduced in ebcdic_tables.h as * PL_utf2e, with its inverse being PL_e2utf. They are constructed so that * all EBCDIC invariants remain invariant, but no others do, and the first * byte of a variant will always have its upper bit set. But note that - * the upper bit of some invariants is also 1. + * the upper bit of some invariants is also 1. The table also is designed + * so that lexically comparing two UTF-EBCDIC-variant characters yields + * the Unicode code point order. (To get native code point order, one has + * to convert the latin1-range characters to their native code point + * value.) * * For example, the ordinal value of 'A' is 193 in EBCDIC, and also is 193 in * UTF-EBCDIC. Step 1) converts it to 65, Step 2 leaves it at 65, and Step 3 @@ -142,13 +150,24 @@ END_EXTERN_C #define I8_TO_NATIVE_UTF8(b) (__ASSERT_(FITS_IN_8_BITS(b)) PL_utf2e[(U8)(b)]) /* Transforms in wide UV chars */ -#define NATIVE_TO_UNI(ch) (FITS_IN_8_BITS(ch) ? NATIVE_TO_LATIN1(ch) : (ch)) -#define UNI_TO_NATIVE(ch) (FITS_IN_8_BITS(ch) ? LATIN1_TO_NATIVE(ch) : (ch)) +#define NATIVE_TO_UNI(ch) (FITS_IN_8_BITS(ch) ? NATIVE_TO_LATIN1(ch) : (UV) (ch)) +#define UNI_TO_NATIVE(ch) (FITS_IN_8_BITS(ch) ? LATIN1_TO_NATIVE(ch) : (UV) (ch)) + +/* How wide can a single UTF-8 encoded character become in bytes. */ +/* NOTE: Strictly speaking Perl's UTF-8 should not be called UTF-8 since UTF-8 + * is an encoding of Unicode, and Unicode's upper limit, 0x10FFFF, can be + * expressed with 5 bytes. However, Perl thinks of UTF-8 as a way to encode + * non-negative integers in a binary format, even those above Unicode. 14 is + * the smallest number that covers 2**64 + * + * WARNING: This number must be in sync with the value in + * regen/charset_translations.pl. */ +#define UTF8_MAXBYTES 14 /* - The following table is adapted from tr16, it shows I8 encoding of Unicode code points. + The following table is adapted from tr16, it shows the I8 encoding of Unicode code points. - Unicode U32 Bit pattern 1st Byte 2nd Byte 3rd Byte 4th Byte 5th Byte 6th Byte 7th byte + Unicode U32 Bit pattern 1st Byte 2nd Byte 3rd Byte 4th Byte 5th Byte 6th Byte 7th Byte U+0000..U+007F 000000000xxxxxxx 0xxxxxxx U+0080..U+009F 00000000100xxxxx 100xxxxx U+00A0..U+03FF 000000yyyyyxxxxx 110yyyyy 101xxxxx @@ -156,21 +175,20 @@ END_EXTERN_C U+4000..U+3FFFF 0wwwzzzzzyyyyyxxxxx 11110www 101zzzzz 101yyyyy 101xxxxx U+40000..U+3FFFFF 0vvwwwwwzzzzzyyyyyxxxxx 111110vv 101wwwww 101zzzzz 101yyyyy 101xxxxx U+400000..U+3FFFFFF 0uvvvvvwwwwwzzzzzyyyyyxxxxx 1111110u 101vvvvv 101wwwww 101zzzzz 101yyyyy 101xxxxx - U+4000000..U+7FFFFFFF 0tuuuuuvvvvvwwwwwzzzzzyyyyyxxxxx 1111111t 101uuuuu 101vvvvv 101wwwww 101zzzzz 101yyyyy 101xxxxx + U+4000000..U+3FFFFFFF 00uuuuuvvvvvwwwwwzzzzzyyyyyxxxxx 11111110 101uuuuu 101vvvvv 101wwwww 101zzzzz 101yyyyy 101xxxxx - Note: The I8 transformation is valid for UCS-4 values X'0' to - X'7FFFFFFF' (the full extent of ISO/IEC 10646 coding space). +Beyond this, Perl uses an incompatible extension, similar to the one used in +regular UTF-8. There are now 14 bytes. A full 32 bits of information thus looks like this: + 1st Byte 2nd-7th 8th Byte 9th Byte 10th B 11th B 12th B 13th B 14th B +U+40000000..U+FFFFFFFF ttuuuuuvvvvvwwwwwzzzzzyyyyyxxxxx 11111111 10100000 101000tt 101uuuuu 101vvvvv 101wwwww 101zzzzz 101yyyyy 101xxxxx - */ +For 32-bit words, the 2nd through 7th bytes effectively function as leading +zeros. Above 32 bits, these fill up, with each byte yielding 5 bits of +information, so that with 13 continuation bytes, we can handle 65 bits, just +above what a 64 bit word can hold */ -/* Input is a true Unicode (not-native) code point */ -#define OFFUNISKIP(uv) ( (uv) < 0xA0 ? 1 : \ - (uv) < 0x400 ? 2 : \ - (uv) < 0x4000 ? 3 : \ - (uv) < 0x40000 ? 4 : \ - (uv) < 0x400000 ? 5 : \ - (uv) < 0x4000000 ? 6 : 7 ) +/* This is a fundamental property of UTF-EBCDIC */ #define OFFUNI_IS_INVARIANT(c) (((UV)(c)) < 0xA0) /* It turns out that on EBCDIC platforms, the invariants are the characters @@ -180,13 +198,6 @@ END_EXTERN_C #define UVCHR_IS_INVARIANT(uv) cBOOL(FITS_IN_8_BITS(uv) \ && (PL_charclass[(U8) (uv)] & (_CC_mask(_CC_ASCII) | _CC_mask(_CC_CNTRL)))) -#define UVCHR_SKIP(uv) (UVCHR_IS_INVARIANT(uv) ? 1 : \ - (uv) < 0x400 ? 2 : \ - (uv) < 0x4000 ? 3 : \ - (uv) < 0x40000 ? 4 : \ - (uv) < 0x400000 ? 5 : \ - (uv) < 0x4000000 ? 6 : 7 ) - /* UTF-EBCDIC semantic macros - We used to transform back into I8 and then * compare, but now only have to do a single lookup by using a bit in * l1_char_class_tab.h. @@ -194,8 +205,17 @@ END_EXTERN_C * definitions. */ #define UTF8_IS_START(c) _generic_isCC(c, _CC_UTF8_IS_START) + +#define UTF_IS_CONTINUATION_MASK 0xE0 + #define UTF8_IS_CONTINUATION(c) _generic_isCC(c, _CC_UTF8_IS_CONTINUATION) +/* The above instead could be written as this: +#define UTF8_IS_CONTINUATION(c) \ + (((NATIVE_UTF8_TO_I8(c) & UTF_IS_CONTINUATION_MASK) \ + == UTF_CONTINUATION_MARK) + */ + /* Equivalent to ! UVCHR_IS_INVARIANT(c) */ #define UTF8_IS_CONTINUED(c) cBOOL(FITS_IN_8_BITS(c) \ && ! (PL_charclass[(U8) (c)] & (_CC_mask(_CC_ASCII) | _CC_mask(_CC_CNTRL)))) @@ -213,33 +233,13 @@ END_EXTERN_C #define isUTF8_POSSIBLY_PROBLEMATIC(c) \ _generic_isCC(c, _CC_UTF8_START_BYTE_IS_FOR_AT_LEAST_SURROGATE) -/* Can't exceed 7 on EBCDIC platforms */ -#define UTF_START_MARK(len) (0xFF & (0xFE << (7-(len)))) - -#define UTF_START_MASK(len) (((len) >= 6) ? 0x01 : (0x1F >> ((len)-2))) #define UTF_CONTINUATION_MARK 0xA0 -#define UTF_CONTINUATION_MASK ((U8)0x1f) #define UTF_ACCUMULATION_SHIFT 5 -/* How wide can a single UTF-8 encoded character become in bytes. */ -/* NOTE: Strictly speaking Perl's UTF-8 should not be called UTF-8 since UTF-8 - * is an encoding of Unicode, and Unicode's upper limit, 0x10FFFF, can be - * expressed with 5 bytes. However, Perl thinks of UTF-8 as a way to encode - * non-negative integers in a binary format, even those above Unicode */ -#define UTF8_MAXBYTES 7 - -/* The maximum number of UTF-8 bytes a single Unicode character can - * uppercase/lowercase/fold into. Unicode guarantees that the maximum - * expansion is 3 characters. On EBCDIC platforms, the highest Unicode - * character occupies 5 bytes, therefore this number is 15 */ -#define UTF8_MAXBYTES_CASE 15 - /* ^? is defined to be APC on EBCDIC systems. See the definition of toCTRL() * for more */ #define QUESTION_MARK_CTRL LATIN1_TO_NATIVE(0x9F) -#define MAX_UTF8_TWO_BYTE 0x3FF - /* * ex: set ts=8 sts=4 sw=4 et: */