| 1 | /* utfebcdic.h |
| 2 | * |
| 3 | * Copyright (C) 2001, 2002, 2003, 2005, 2006, 2007, 2009, |
| 4 | * 2010, 2011 by Larry Wall, Nick Ing-Simmons, and others |
| 5 | * |
| 6 | * You may distribute under the terms of either the GNU General Public |
| 7 | * License or the Artistic License, as specified in the README file. |
| 8 | * |
| 9 | * Macros to implement UTF-EBCDIC as perl's internal encoding |
| 10 | * Adapted from version 7.1 of Unicode Technical Report #16: |
| 11 | * http://www.unicode.org/unicode/reports/tr16 |
| 12 | * |
| 13 | * To summarize, the way it works is: |
| 14 | * To convert an EBCDIC character to UTF-EBCDIC: |
| 15 | * 1) convert to Unicode. The table in the generated file 'ebcdic_tables.h' |
| 16 | * that does this for EBCDIC bytes is PL_e2a (with inverse PL_a2e). The |
| 17 | * 'a' stands for ASCII platform, meaning latin1. |
| 18 | * 2) convert that to a utf8-like string called I8 ('I' stands for |
| 19 | * intermediate) with variant characters occupying multiple bytes. This |
| 20 | * step is similar to the utf8-creating step from Unicode, but the details |
| 21 | * are different. This transformation is called UTF8-Mod. There is a |
| 22 | * chart about the bit patterns in a comment later in this file. But |
| 23 | * essentially here are the differences: |
| 24 | * UTF8 I8 |
| 25 | * invariant byte starts with 0 starts with 0 or 100 |
| 26 | * continuation byte starts with 10 starts with 101 |
| 27 | * start byte same in both: if the code point requires N bytes, |
| 28 | * then the leading N bits are 1, followed by a 0. (No |
| 29 | * trailing 0 for the very largest possible allocation |
| 30 | * in I8, far beyond the current Unicode standard's |
| 31 | * max, as shown in the comment later in this file.) |
| 32 | * 3) Use the algorithm in tr16 to convert each byte from step 2 into |
| 33 | * final UTF-EBCDIC. This is done by table lookup from a table |
| 34 | * constructed from the algorithm, reproduced in ebcdic_tables.h as |
| 35 | * PL_utf2e, with its inverse being PL_e2utf. They are constructed so that |
| 36 | * all EBCDIC invariants remain invariant, but no others do, and the first |
| 37 | * byte of a variant will always have its upper bit set. But note that |
| 38 | * the upper bit of some invariants is also 1. |
| 39 | * |
| 40 | * For example, the ordinal value of 'A' is 193 in EBCDIC, and also is 193 in |
| 41 | * UTF-EBCDIC. Step 1) converts it to 65, Step 2 leaves it at 65, and Step 3 |
| 42 | * converts it back to 193. As an example of how a variant character works, |
| 43 | * take LATIN SMALL LETTER Y WITH DIAERESIS, which is typically 0xDF in |
| 44 | * EBCDIC. Step 1 converts it to the Unicode value, 0xFF. Step 2 converts |
| 45 | * that to two bytes = 11000111 10111111 = C7 BF, and Step 3 converts those to |
| 46 | * 0x8B 0x73. |
| 47 | * |
| 48 | * If you're starting from Unicode, skip step 1. For UTF-EBCDIC to straight |
| 49 | * EBCDIC, reverse the steps. |
| 50 | * |
| 51 | * The EBCDIC invariants have been chosen to be those characters whose Unicode |
| 52 | * equivalents have ordinal numbers less than 160, that is the same characters |
| 53 | * that are expressible in ASCII, plus the C1 controls. So there are 160 |
| 54 | * invariants instead of the 128 in UTF-8. (My guess is that this is because |
| 55 | * the C1 control NEL (and maybe others) is important in IBM.) |
| 56 | * |
| 57 | * The purpose of Step 3 is to make the encoding be invariant for the chosen |
| 58 | * characters. This messes up the convenient patterns found in step 2, so |
| 59 | * generally, one has to undo step 3 into a temporary to use them. However, |
| 60 | * one "shadow", or parallel table, PL_utf8skip, has been constructed that |
| 61 | * doesn't require undoing things. It is such that for each byte, it says |
| 62 | * how long the sequence is if that (UTF-EBCDIC) byte were to begin it. |
| 63 | * |
| 64 | * There are actually 3 slightly different UTF-EBCDIC encodings in |
| 65 | * ebcdic_tables.h, one for each of the code pages recognized by Perl. That |
| 66 | * means that there are actually three different sets of tables, one for each |
| 67 | * code page. (If Perl is compiled on platforms using another EBCDIC code |
| 68 | * page, it may not compile, or Perl may silently mistake it for one of the |
| 69 | * three.) |
| 70 | * |
| 71 | * Note that tr16 actually only specifies one version of UTF-EBCDIC, based on |
| 72 | * the 1047 encoding, which is supposed to be used for all code pages. |
| 73 | * But this doesn't work. To illustrate the problem, consider the '^' character. |
| 74 | * On a 037 code page it is the single byte 176, whereas under 1047 UTF-EBCDIC |
| 75 | * it is the single byte 95. If Perl implemented tr16 exactly, it would mean |
| 76 | * that changing a string containing '^' to UTF-EBCDIC would change that '^' |
| 77 | * from 176 to 95 (and vice-versa), violating the rule that ASCII-range |
| 78 | * characters are represented identically whether or not a string is |
| 79 | * UTF-8. Much code in Perl assumes this rule. See for example |
| 80 | * http://grokbase.com/t/perl/mvs/025xf0yhmn/utf-ebcdic-for-posix-bc-malformed-utf-8-character |
| 81 | * What Perl does is create a version of UTF-EBCDIC suited to each code page; |
| 82 | * the one for the 1047 code page is identical to what's specified in tr16. |
| 83 | * This complicates interchanging files between computers using different code |
| 84 | * pages. Best is to convert to I8 before sending them, as the I8 |
| 85 | * representation is the same no matter what the underlying code page is. |
| 86 | * |
| 87 | * tr16 also says that NEL and LF be swapped. We don't do that. |
| 88 | * |
| 89 | * EBCDIC characters above 0xFF are the same as Unicode in Perl's |
| 90 | * implementation of all 3 encodings, so for those Step 1 is trivial. |
| 91 | * |
| 92 | * (Note that the entries for invariant characters are necessarily the same in |
| 93 | * PL_e2a and PL_e2utf; likewise for their inverses.) |
| 94 | * |
| 95 | * UTF-EBCDIC strings are the same length or longer than UTF-8 representations |
| 96 | * of the same string. The maximum code point representable as 2 bytes in |
| 97 | * UTF-EBCDIC is 0x3FF, instead of 0x7FF in UTF-8. |
| 98 | */ |
| 99 | |
| 100 | START_EXTERN_C |
| 101 | |
| 102 | #ifdef DOINIT |
| 103 | |
| 104 | #include "ebcdic_tables.h" |
| 105 | |
| 106 | #else |
| 107 | EXTCONST U8 PL_utf8skip[]; |
| 108 | EXTCONST U8 PL_e2utf[]; |
| 109 | EXTCONST U8 PL_utf2e[]; |
| 110 | EXTCONST U8 PL_e2a[]; |
| 111 | EXTCONST U8 PL_a2e[]; |
| 112 | EXTCONST U8 PL_fold[]; |
| 113 | EXTCONST U8 PL_fold_latin1[]; |
| 114 | EXTCONST U8 PL_latin1_lc[]; |
| 115 | EXTCONST U8 PL_mod_latin1_uc[]; |
| 116 | #endif |
| 117 | |
| 118 | END_EXTERN_C |
| 119 | |
/* Single-byte conversions that work on EBCDIC platforms.  In every macro the
 * argument is cast to U8 and used as an index into a 256-entry table. */

/* Native (EBCDIC) byte <-> Latin-1 byte */
#define NATIVE_TO_LATIN1(ch)    (PL_e2a[(U8) (ch)])
#define LATIN1_TO_NATIVE(ch)    (PL_a2e[(U8) (ch)])

/* UTF-EBCDIC byte <-> I8 (intermediate, UTF-8-like) byte */
#define NATIVE_UTF8_TO_I8(ch)   (PL_e2utf[(U8) (ch)])
#define I8_TO_NATIVE_UTF8(ch)   (PL_utf2e[(U8) (ch)])
| 127 | |
/* Conversions for wide (UV) values.  Values of 255 or less go through the
 * single-byte Latin-1 tables; anything larger is returned unchanged, because
 * native and Unicode code points coincide above 0xFF.  The argument is
 * evaluated twice. */
#define NATIVE_TO_UNI(ch)   (((ch) <= 255) ? NATIVE_TO_LATIN1(ch) : (ch))
#define UNI_TO_NATIVE(ch)   (((ch) <= 255) ? LATIN1_TO_NATIVE(ch) : (ch))
| 131 | |
| 132 | /* |
| 133 | The following table, adapted from tr16, shows the I8 encoding of Unicode code points. |
| 134 | |
| 135 | Unicode Bit pattern 1st Byte 2nd Byte 3rd Byte 4th Byte 5th Byte 6th Byte 7th byte |
| 136 | U+0000..U+007F 000000000xxxxxxx 0xxxxxxx |
| 137 | U+0080..U+009F 00000000100xxxxx 100xxxxx |
| 138 | U+00A0..U+03FF 000000yyyyyxxxxx 110yyyyy 101xxxxx |
| 139 | U+0400..U+3FFF 00zzzzyyyyyxxxxx 1110zzzz 101yyyyy 101xxxxx |
| 140 | U+4000..U+3FFFF 0wwwzzzzzyyyyyxxxxx 11110www 101zzzzz 101yyyyy 101xxxxx |
| 141 | U+40000..U+3FFFFF 0vvwwwwwzzzzzyyyyyxxxxx 111110vv 101wwwww 101zzzzz 101yyyyy 101xxxxx |
| 142 | U+400000..U+3FFFFFF 0uvvvvvwwwwwzzzzzyyyyyxxxxx 1111110u 101vvvvv 101wwwww 101zzzzz 101yyyyy 101xxxxx |
| 143 | U+4000000..U+7FFFFFFF 0tuuuuuvvvvvwwwwwzzzzzyyyyyxxxxx 1111111t 101uuuuu 101vvvvv 101wwwww 101zzzzz 101yyyyy 101xxxxx |
| 144 | |
| 145 | Note: The I8 transformation is valid for UCS-4 values X'0' to |
| 146 | X'7FFFFFFF' (the full extent of ISO/IEC 10646 coding space). |
| 147 | |
| 148 | */ |
| 149 | |
/* Number of bytes (1..7) that the I8 encoding of 'uv' occupies.  'uv' must be
 * a true Unicode (not native) code point.  The count starts at one and gains
 * one for each length threshold of the I8 table that 'uv' has reached.  'uv'
 * is evaluated more than once. */
#define OFFUNISKIP(uv)  ( 1                             \
                        + ((uv) >= 0xA0)                \
                        + ((uv) >= 0x400)               \
                        + ((uv) >= 0x4000)              \
                        + ((uv) >= 0x40000)             \
                        + ((uv) >= 0x400000)            \
                        + ((uv) >= 0x4000000) )
| 157 | |
/* True when the Unicode code point 'c', converted to UV, is one of the 160
 * invariants, i.e. no greater than 0x9F. */
#define UNI_IS_INVARIANT(c)     ((UV) (c) <= 0x9F)
| 159 | |
/* Classification of a single UTF-EBCDIC byte.  Each macro first maps the byte
 * back to I8 with NATIVE_UTF8_TO_I8() and then tests the I8 value.  The
 * meaning of each macro is described at the like-named definition in utf8.h.
 * Some macros perform the table lookup (and evaluate 'c') twice. */

/* I8 value is 0xC5 or above, but is not 0xE0 */
#define UTF8_IS_START(c)                                                \
                (! (   NATIVE_UTF8_TO_I8(c) <  0xC5                     \
                    || NATIVE_UTF8_TO_I8(c) == 0xE0))

/* The top three bits of the I8 value are 101 */
#define UTF8_IS_CONTINUATION(c)     ((NATIVE_UTF8_TO_I8(c) >> 5) == 0x05)

/* I8 value is 0xA0 or above */
#define UTF8_IS_CONTINUED(c)        (! (NATIVE_UTF8_TO_I8(c) < 0xA0))

/* I8 value is one of 0xC5, 0xC6, 0xC7; per the I8 table these start the
 * two-byte sequences for U+00A0..U+00FF.
 *
 * An alternative spelling compares (c) directly against
 * I8_TO_NATIVE_UTF8(0xC5), I8_TO_NATIVE_UTF8(0xC6) and
 * I8_TO_NATIVE_UTF8(0xC7).  Saying it that way adds a runtime test, but
 * removes 2 run-time lookups. */
#define UTF8_IS_DOWNGRADEABLE_START(c)                                  \
                (! (   NATIVE_UTF8_TO_I8(c) < 0xC5                      \
                    || NATIVE_UTF8_TO_I8(c) > 0xC7))

/* I8 value is 0xC8 or above */
#define UTF8_IS_ABOVE_LATIN1(c)     (NATIVE_UTF8_TO_I8(c) > 0xC7)
| 177 | |
/* Marker bits of the start byte of a 'len'-byte I8 sequence: the top 'len'
 * bits are set.  'len' can't exceed 7 on EBCDIC platforms. */
#define UTF_START_MARK(len)     ((0xFE << (7 - (len))) & 0xFF)

/* Mask selecting the payload bits held in the start byte of a 'len'-byte
 * sequence.  Start bytes of 6- and 7-byte sequences carry a single payload
 * bit (1111110u and 1111111t in the I8 table). */
#define UTF_START_MASK(len)     (((len) < 6) ? (0x1F >> ((len) - 2)) : 0x01)

/* Continuation bytes have the I8 form 101xxxxx: 0xA0 is the fixed marker,
 * the low five bits are payload, so each continuation byte shifts the
 * accumulated value by five bits. */
#define UTF_CONTINUATION_MARK   0xA0
#define UTF_CONTINUATION_MASK   ((U8) 0x1f)
#define UTF_ACCUMULATION_SHIFT  5
| 185 | |
/* The largest number of bytes a single encoded character can occupy.
 *
 * NOTE: strictly speaking, Perl's UTF-8 should not be called UTF-8.  UTF-8 is
 * an encoding of Unicode, whose upper limit, 0x10FFFF, needs only 5 bytes
 * here.  Perl, however, treats UTF-8 as a binary format for any non-negative
 * integer, including those above Unicode, hence the larger limit. */
#define UTF8_MAXBYTES 7

/* The largest number of bytes that the uppercase, lowercase or fold of a
 * single Unicode character can occupy.  Unicode guarantees that such a
 * mapping expands to at most 3 characters, and on EBCDIC platforms the
 * highest Unicode character takes 5 bytes, so the limit is 3 * 5 = 15. */
#define UTF8_MAXBYTES_CASE 15
| 198 | |
/* On EBCDIC systems ^? is defined to be APC, i.e. the native equivalent of
 * Latin-1 0x9F.  The definition of toCTRL() has more on this. */
#define QUESTION_MARK_CTRL LATIN1_TO_NATIVE(0x9F)
| 202 | |
/* The highest code point whose I8 (and so UTF-EBCDIC) encoding fits in two
 * bytes: the two-byte row of the I8 table ends at U+03FF. */
#define MAX_UTF8_TWO_BYTE 0x3FF
| 204 | |
| 205 | /* |
| 206 | * Local variables: |
| 207 | * c-indentation-style: bsd |
| 208 | * c-basic-offset: 4 |
| 209 | * indent-tabs-mode: nil |
| 210 | * End: |
| 211 | * |
| 212 | * ex: set ts=8 sts=4 sw=4 et: |
| 213 | */ |