X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/4bb101f2758f169969171dfe6b70f68a406dcc1e..cdaa314533693e4e1e9e67c24fd4ba3a5953ec00:/utfebcdic.h diff --git a/utfebcdic.h b/utfebcdic.h index 9659315..e61b4a7 100644 --- a/utfebcdic.h +++ b/utfebcdic.h @@ -1,6 +1,7 @@ /* utfebcdic.h * - * Copyright (C) 2001, 2002, by Larry Wall, Nick Ing-Simmons, and others + * Copyright (C) 2001, 2002, 2003, 2005, 2006, 2007, 2009 by Larry Wall, + * Nick Ing-Simmons, and others * * You may distribute under the terms of either the GNU General Public * License or the Artistic License, as specified in the README file. @@ -8,6 +9,69 @@ * Macros to implement UTF-EBCDIC as perl's internal encoding * Taken from version 7.1 of Unicode Techical Report #16: * http://www.unicode.org/unicode/reports/tr16 + * + * To summarize, the way it works is: + * To convert an EBCDIC character to UTF-EBCDIC: + * 1) convert to Unicode. The table in this file that does this for + * EBCDIC bytes is PL_e2a (with inverse PL_a2e). The 'a' stands for + * ASCIIish, meaning latin1. + * 2) convert that to a utf8-like string called I8 with variant characters + * occupying multiple bytes. This step is similar to the utf8-creating + * step from Unicode, but the details are different. There is a chart + * about the bit patterns in a comment later in this file. But + * essentially here are the differences: + * UTF8 I8 + * invariant byte starts with 0 starts with 0 or 100 + * continuation byte starts with 10 starts with 101 + * start byte same in both: if the code point requires N bytes, + * then the leading N bits are 1, followed by a 0. (No + * trailing 0 for the very largest possible allocation + * in I8, far beyond the current Unicode standard's + * max, as shown in the comment later in this file.) + * 3) Use the table published in tr16 to convert each byte from step 2 into + * final UTF-EBCDIC. The table in this file is PL_utf2e, and its inverse + * is PL_e2utf.
They are constructed so that all EBCDIC invariants remain + * invariant, but no others do. For example, the ordinal value of 'A' is + * 193 in EBCDIC, and also is 193 in UTF-EBCDIC. Step 1 converts it to + * 65, Step 2 leaves it at 65, and Step 3 converts it back to 193. As an + * example of how a variant character works, take LATIN SMALL LETTER Y + * WITH DIAERESIS, which is typically 0xDF in EBCDIC. Step 1 converts it + * to the Unicode value, 0xFF. Step 2 converts that to two bytes = + * 11000111 10111111 = C7 BF, and Step 3 converts those to 0x8B 0x73. The + * table is constructed so that the first byte of a variant will always + * have its upper bit set (at least in the encodings that Perl recognizes, + * and probably all). + * + * If you're starting from Unicode, skip step 1. For UTF-EBCDIC to straight + * EBCDIC, reverse the steps. + * + * The EBCDIC invariants have been chosen to be those characters whose Unicode + * equivalents have ordinal numbers less than 160, that is, the same characters + * that are expressible in ASCII, plus the C1 controls. So there are 160 + * invariants instead of the 128 in UTF-8. (My guess is that this is because + * the C1 control NEL (and maybe others) is important in IBM.) + * + * The purpose of Step 3 is to make the encoding be invariant for the chosen + * characters. This messes up the convenient patterns found in step 2, so + * generally, one has to undo step 3 into a temporary to use them. However, + * a "shadow", or parallel table, PL_utf8skip, has been constructed so that for + * each byte, it says how long the sequence is if that byte were to begin it. + * + * There are actually 3 slightly different UTF-EBCDIC encodings in this file, + * one for each of the code pages recognized by Perl. That means that there + * are actually three different sets of tables, one for each code page. (If + * Perl is compiled on platforms using other EBCDIC code pages, it may not + * compile, or silently mistake it for one of the three.)
+ * + * EBCDIC characters above 0xFF are the same as Unicode in Perl's + * implementation of all 3 encodings, so for those Step 1 is trivial. + * + * (Note that the entries for invariant characters are necessarily the same in + * PL_e2a and PL_e2utf, and the same for their inverses.) + * + * UTF-EBCDIC strings are the same length or longer than UTF-8 representations + * of the same string. The maximum code point representable as 2 bytes in + * UTF-EBCDIC is 0x3FF, instead of 0x7FF in UTF-8. */ START_EXTERN_C @@ -81,7 +145,9 @@ unsigned char PL_utf8skip[] = { }; #endif -/* Transform tables from tr16 applied after encoding to render encoding EBCDIC like */ +/* Transform tables from tr16 applied after encoding to render encoding EBCDIC + * like, meaning that all the invariants are actually invariant, eg, that 'A' + * remains 'A' */ #if '^' == 95 /* if defined(__MVS__) || defined(??) (VM/ESA?) 1047 */ EXTCONST unsigned char PL_utf2e[] = { /* UTF-8-mod to EBCDIC (IBM-1047) */ @@ -100,7 +166,7 @@ EXTCONST unsigned char PL_utf2e[] = { /* UTF-8-mod to EBCDIC (IBM-1047) */ 0x74, 0x75, 0x76, 0x77, 0x78, 0x80, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F, 0x90, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F, 0xA0, 0xAA, 0xAB, 0xAC, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBE, 0xBF, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, 0xDA, 0xDB, - 0xDC, 0xDD, 0xDE, 0xDF, 0xE1, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, + 0xDC, 0xDD, 0xDE, 0xDF, 0xE1, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE }; EXTCONST unsigned char PL_e2utf[] = { /* EBCDIC (IBM-1047) to UTF-8-mod */ @@ -119,7 +185,7 @@ EXTCONST unsigned char PL_e2utf[] = { /* EBCDIC (IBM-1047) to UTF-8-mod */ 0x7B, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0x7D, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0xEE, 0xEF, 0xF0, 0xF1, 0xF2, 0xF3, 0x5C, 0xF4, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0xF5, 0xF6,
0xF7, 0xF8, 0xF9, 0xFA, - 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF, 0x9F, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF, 0x9F }; #endif /* 1047 */ @@ -140,7 +206,7 @@ unsigned char PL_utf2e[] = { /* UTF-8-mod to EBCDIC (POSIX-BC) */ 0x74, 0x75, 0x76, 0x77, 0x78, 0x80, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F, 0x90, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F, 0xA0, 0xAA, 0xAB, 0xAC, 0xAE, 0xAF, 0xBA, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xAD, 0x79, 0xA1, 0xBE, 0xBF, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, 0xDA, 0xDB, - 0xDC, 0xC0, 0xDE, 0xDF, 0xE1, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xFA, 0xDD, 0xFC, 0xE0, 0xFE, + 0xDC, 0xC0, 0xDE, 0xDF, 0xE1, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xFA, 0xDD, 0xFC, 0xE0, 0xFE }; unsigned char PL_e2utf[] = { /* EBCDIC (POSIX-BC) to UTF-8-mod */ @@ -159,7 +225,7 @@ unsigned char PL_e2utf[] = { /* EBCDIC (POSIX-BC) to UTF-8-mod */ 0xF1, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xBB, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0xEE, 0xEF, 0xF0, 0xFC, 0xF2, 0xF3, 0xFE, 0xF4, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, - 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0xFB, 0x7B, 0xFD, 0x7D, 0xFF, 0x7E, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0xFB, 0x7B, 0xFD, 0x7D, 0xFF, 0x7E }; #endif /* POSIX-BC */ @@ -180,7 +246,7 @@ unsigned char PL_utf2e[] = { /* UTF-8-mod to EBCDIC (IBM-037) */ 0x74, 0x75, 0x76, 0x77, 0x78, 0x80, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F, 0x90, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F, 0xA0, 0xAA, 0xAB, 0xAC, 0xAE, 0xAF, 0x5F, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xAD, 0xBD, 0xBC, 0xBE, 0xBF, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, 0xDA, 0xDB, - 0xDC, 0xDD, 0xDE, 0xDF, 0xE1, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, + 0xDC, 0xDD, 0xDE, 0xDF, 0xE1, 0xEA, 0xEB, 0xEC, 0xED, 
0xEE, 0xEF, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE }; unsigned char PL_e2utf[] = { /* EBCDIC (IBM-037) to UTF-8-mod */ @@ -199,7 +265,7 @@ unsigned char PL_e2utf[] = { /* EBCDIC (IBM-037) to UTF-8-mod */ 0x7B, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0x7D, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0xEE, 0xEF, 0xF0, 0xF1, 0xF2, 0xF3, 0x5C, 0xF4, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, - 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF, 0x9F, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF, 0x9F }; #endif /* 037 */ @@ -337,12 +403,13 @@ EXTCONST unsigned char PL_a2e[]; END_EXTERN_C -#define UTF8SKIP(s) PL_utf8skip[*(U8*)s] +#define UTF8SKIP(s) PL_utf8skip[*(const U8*)(s)] -/* EBCDIC-happy ways of converting native code to UTF8 */ +/* EBCDIC-happy ways of converting native code to UTF-8 */ /* Native to iso-8859-1 */ #define NATIVE_TO_ASCII(ch) PL_e2a[(U8)(ch)] +#define NATIVE8_TO_UNI(ch) NATIVE_TO_ASCII(ch) /* synonym */ #define ASCII_TO_NATIVE(ch) PL_a2e[(U8)(ch)] /* Transform after encoding */ #define NATIVE_TO_UTF(ch) PL_e2utf[(U8)(ch)] @@ -357,16 +424,16 @@ END_EXTERN_C /* * Note: we should try and be careful never to call the isXXX_utf8() functions * unless we're pretty sure we've seen the beginning of a UTF-EBCDIC character - * Otherwise we risk loading in the heavy-duty SWASHINIT and SWASHGET routines - * unnecessarily. + * Otherwise we risk loading in the heavy-duty swash_init and swash_fetch + * routines unnecessarily. */ #define isIDFIRST_lazy_if(p,c) ((IN_BYTES || (!c || UTF8_IS_INVARIANT(*p))) \ ? isIDFIRST(*(p)) \ - : isIDFIRST_utf8((U8*)p)) + : isIDFIRST_utf8((const U8*)p)) #define isALNUM_lazy_if(p,c) ((IN_BYTES || (!c || UTF8_IS_INVARIANT(*p))) \ ? 
isALNUM(*(p)) \ - : isALNUM_utf8((U8*)p)) + : isALNUM_utf8((const U8*)p)) /* The following table is adapted from tr16, it shows UTF-8-mod encoding of Unicode code points. @@ -398,15 +465,15 @@ END_EXTERN_C #define UNI_IS_INVARIANT(c) ((c) < 0xA0) /* UTF-EBCDIC sematic macros - transform back into UTF-8-Mod and then compare */ -#define NATIVE_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_ASCII(c)) +#define NATIVE_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE8_TO_UNI(c)) #define UTF8_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_UTF(c)) #define UTF8_IS_START(c) (NATIVE_TO_UTF(c) >= 0xA0 && (NATIVE_TO_UTF(c) & 0xE0) != 0xA0) #define UTF8_IS_CONTINUATION(c) ((NATIVE_TO_UTF(c) & 0xE0) == 0xA0) #define UTF8_IS_CONTINUED(c) (NATIVE_TO_UTF(c) >= 0xA0) #define UTF8_IS_DOWNGRADEABLE_START(c) (NATIVE_TO_UTF(c) >= 0xA0 && (NATIVE_TO_UTF(c) & 0xF8) == 0xC0) -#define UTF_START_MARK(len) ((len > 7) ? 0xFF : (0xFE << (7-len))) -#define UTF_START_MASK(len) ((len >= 6) ? 0x01 : (0x1F >> (len-2))) +#define UTF_START_MARK(len) (((len) > 7) ? 0xFF : ((U8)(0xFE << (7-(len))))) +#define UTF_START_MASK(len) (((len) >= 6) ? 0x01 : (0x1F >> ((len)-2))) #define UTF_CONTINUATION_MARK 0xA0 #define UTF_CONTINUATION_MASK ((U8)0x1f) #define UTF_ACCUMULATION_SHIFT 5 @@ -417,4 +484,12 @@ END_EXTERN_C #define UTF8_EIGHT_BIT_HI(c) UTF_TO_NATIVE((((U8)(c))>>UTF_ACCUMULATION_SHIFT)|UTF_START_MARK(2)) #define UTF8_EIGHT_BIT_LO(c) UTF_TO_NATIVE(((((U8)(c)))&UTF_CONTINUATION_MASK)|UTF_CONTINUATION_MARK) - +/* + * Local variables: + * c-indentation-style: bsd + * c-basic-offset: 4 + * indent-tabs-mode: t + * End: + * + * ex: set ts=8 sts=4 sw=4 noet: + */