Commit | Line | Data |
---|---|---|
1d72bdf6 NIS |
1 | /* utfebcdic.h |
2 | * | |
2eee27d7 SS |
3 | * Copyright (C) 2001, 2002, 2003, 2005, 2006, 2007, 2009, |
4 | * 2010, 2011 by Larry Wall, Nick Ing-Simmons, and others | |
1d72bdf6 NIS |
5 | * |
6 | * You may distribute under the terms of either the GNU General Public | |
7 | * License or the Artistic License, as specified in the README file. | |
8 | * | |
9 | * Macros to implement UTF-EBCDIC as perl's internal encoding | |
97237291 | 10 | * Adapted from version 7.1 of Unicode Technical Report #16: |
1d72bdf6 | 11 | * http://www.unicode.org/unicode/reports/tr16 |
fe749c9a KW |
12 | * |
13 | * To summarize, the way it works is: | |
c229c178 KW |
14 | * To convert an EBCDIC code point to UTF-EBCDIC: |
15 | * 1) convert to Unicode. No conversion is necessary for code points above | |
16 | * 255, as Unicode and EBCDIC are identical in this range. For smaller | |
17 | * code points, the conversion is done by lookup in the PL_e2a table (with | |
18 | * inverse PL_a2e) in the generated file 'ebcdic_tables.h'. The 'a' | |
19 | * stands for ASCII platform, meaning 0-255 Unicode. | |
97237291 | 20 | * 2) convert that to a utf8-like string called I8 ('I' stands for |
d06134e5 KW |
21 | * intermediate) with variant characters occupying multiple bytes. This |
22 | * step is similar to the utf8-creating step from Unicode, but the details | |
23 | * are different. This transformation is called UTF8-Mod. There is a | |
24 | * chart about the bit patterns in a comment later in this file. But | |
fe749c9a KW |
25 | * essentially here are the differences: |
26 | * UTF8 I8 | |
27 | * invariant byte starts with 0 starts with 0 or 100 | |
28 | * continuation byte starts with 10 starts with 101 | |
29 | * start byte same in both: if the code point requires N bytes, | |
c0236afe KW |
30 | * then the leading N bits are 1, followed by a 0. If |
31 | * all 8 bits in the first byte are 1, the code point | |
32 | * will occupy 14 bytes (compared to 13 in Perl's | |
33 | * extended UTF-8). This is incompatible with what | |
34 | * tr16 implies should be the representation of code | |
35 | * points 2**30 and above, but allows Perl to be able | |
36 | * to represent all code points that fit in a 64-bit | |
37 | * word in either our extended UTF-EBCDIC or UTF-8. | |
97237291 KW |
38 | * 3) Use the algorithm in tr16 to convert each byte from step 2 into |
39 | * final UTF-EBCDIC. This is done by table lookup from a table | |
4bc3dcfa | 40 | * constructed from the algorithm, reproduced in ebcdic_tables.h as |
97237291 KW |
41 | * PL_utf2e, with its inverse being PL_e2utf. They are constructed so that |
42 | * all EBCDIC invariants remain invariant, but no others do, and the first | |
43 | * byte of a variant will always have its upper bit set. But note that | |
97d0ceda KW |
44 | * the upper bit of some invariants is also 1. The table also is designed |
45 | * so that lexically comparing two UTF-EBCDIC-variant characters yields | |
46 | * the Unicode code point order. (To get native code point order, one has | |
47 | * to convert the latin1-range characters to their native code point | |
48 | * value.) | |
97237291 KW |
49 | * |
50 | * For example, the ordinal value of 'A' is 193 in EBCDIC, and also is 193 in | |
51 | * UTF-EBCDIC. Step 1) converts it to 65, Step 2 leaves it at 65, and Step 3 | |
52 | * converts it back to 193. As an example of how a variant character works, | |
53 | * take LATIN SMALL LETTER Y WITH DIAERESIS, which is typically 0xDF in | |
54 | * EBCDIC. Step 1 converts it to the Unicode value, 0xFF. Step 2 converts | |
55 | * that to two bytes = 11000111 10111111 = C7 BF, and Step 3 converts those to | |
56 | * 0x8B 0x73. | |
45f80db9 | 57 | * |
fe749c9a KW |
58 | * If you're starting from Unicode, skip step 1. For UTF-EBCDIC to straight |
59 | * EBCDIC, reverse the steps. | |
60 | * | |
61 | * The EBCDIC invariants have been chosen to be those characters whose Unicode | |
62 | * equivalents have ordinal numbers less than 160, that is the same characters | |
63 | * that are expressible in ASCII, plus the C1 controls. So there are 160 | |
bc2161fd | 64 | * invariants instead of the 128 in UTF-8. |
fe749c9a KW |
65 | * |
66 | * The purpose of Step 3 is to make the encoding be invariant for the chosen | |
67 | * characters. This messes up the convenient patterns found in step 2, so | |
68 | * generally, one has to undo step 3 into a temporary to use them. However, | |
97237291 KW |
69 | * one "shadow", or parallel table, PL_utf8skip, has been constructed that |
70 | * doesn't require undoing things. It is such that for each byte, it says | |
71 | * how long the sequence is if that (UTF-EBCDIC) byte were to begin it. | |
72 | * | |
73 | * There are actually 3 slightly different UTF-EBCDIC encodings in | |
4bc3dcfa | 74 | * ebcdic_tables.h, one for each of the code pages recognized by Perl. That |
97237291 KW |
75 | * means that there are actually three different sets of tables, one for each |
76 | * code page. (If Perl is compiled on platforms using another EBCDIC code | |
77 | * page, it may not compile, or Perl may silently mistake it for one of the | |
78 | * three.) | |
fe749c9a | 79 | * |
97237291 KW |
80 | * Note that tr16 actually only specifies one version of UTF-EBCDIC, based on |
81 | * the 1047 encoding, and which is supposed to be used for all code pages. | |
82 | * But this doesn't work. To illustrate the problem, consider the '^' character. | |
83 | * On a 037 code page it is the single byte 176, whereas under 1047 UTF-EBCDIC | |
84 | * it is the single byte 95. If Perl implemented tr16 exactly, it would mean | |
85 | * that changing a string containing '^' to UTF-EBCDIC would change that '^' | |
86 | * from 176 to 95 (and vice-versa), violating the rule that ASCII-range | |
87 | * characters are the same in UTF-8 or not. Much code in Perl assumes this | |
88 | * rule. See for example | |
89 | * http://grokbase.com/t/perl/mvs/025xf0yhmn/utf-ebcdic-for-posix-bc-malformed-utf-8-character | |
90 | * What Perl does is create a version of UTF-EBCDIC suited to each code page; | |
91 | * the one for the 1047 code page is identical to what's specified in tr16. | |
92 | * This complicates interchanging files between computers using different code | |
93 | * pages. Best is to convert to I8 before sending them, as the I8 | |
94 | * representation is the same no matter what the underlying code page is. | |
fe749c9a | 95 | * |
ff982d00 KW |
96 | * Because of the way UTF-EBCDIC is constructed, the lowest 32 code points that |
97 | * aren't equivalent to ASCII characters nor C1 controls form the set of | |
98 | * continuation bytes; the remaining 64 non-ASCII, non-control code points form | |
99 | * the potential start bytes, in order. (However, the first 5 of these lead to | |
80bfb4dc KW |
100 | * malformed overlongs, so there really are only 59 start bytes, and the first |
101 | * three of the 59 are the start bytes for the Latin1 range.) Hence the | |
ff982d00 KW |
102 | * UTF-EBCDIC for the smallest variant code point, 0xA0, will likely have 0x41 |
103 | * as its continuation byte, provided 0x41 isn't an ASCII or C1 equivalent. | |
104 | * And its start byte will be the code point that is 37 (32+5) non-ASCII, | |
105 | * non-control code points past it. (0 - 3F are controls, and 40 is SPACE, | |
106 | * leaving 41 as the first potentially available one.) In contrast, on ASCII | |
107 | * platforms, the first 64 (not 32) non-ASCII code points are the continuation | |
108 | * bytes. And the first 2 (not 5) potential start bytes form overlong | |
109 | * malformed sequences. | |
110 | * | |
fe749c9a KW |
111 | * EBCDIC characters above 0xFF are the same as Unicode in Perl's |
112 | * implementation of all 3 encodings, so for those Step 1 is trivial. | |
113 | * | |
114 | * (Note that the entries for invariant characters are necessarily the same in | |
97237291 | 115 | * PL_e2a and PL_e2utf; likewise for their inverses.) |
fe749c9a KW |
116 | * |
117 | * UTF-EBCDIC strings are the same length or longer than UTF-8 representations | |
118 | * of the same string. The maximum code point representable as 2 bytes in | |
119 | * UTF-EBCDIC is 0x3FF, instead of 0x7FF in UTF-8. | |
1d72bdf6 NIS |
120 | */ |
121 | ||
122 | START_EXTERN_C | |
123 | ||
124 | #ifdef DOINIT | |
f5e1abaf | 125 | |
4bc3dcfa | 126 | #include "ebcdic_tables.h" |
44f2fc15 | 127 | |
1d72bdf6 | 128 | #else |
f466f02a KW |
129 | EXTCONST U8 PL_utf8skip[]; |
130 | EXTCONST U8 PL_e2utf[]; | |
131 | EXTCONST U8 PL_utf2e[]; | |
132 | EXTCONST U8 PL_e2a[]; | |
133 | EXTCONST U8 PL_a2e[]; | |
134 | EXTCONST U8 PL_fold[]; | |
135 | EXTCONST U8 PL_fold_latin1[]; | |
136 | EXTCONST U8 PL_latin1_lc[]; | |
137 | EXTCONST U8 PL_mod_latin1_uc[]; | |
1d72bdf6 NIS |
138 | #endif |
139 | ||
140 | END_EXTERN_C | |
141 | ||
1e54db1a | 142 | /* EBCDIC-happy ways of converting native code to UTF-8 */ |
1d72bdf6 | 143 | |
e9b19ab7 KW |
144 | /* Use these when ch is known to be < 256 */ |
145 | #define NATIVE_TO_LATIN1(ch) (__ASSERT_(FITS_IN_8_BITS(ch)) PL_e2a[(U8)(ch)]) | |
146 | #define LATIN1_TO_NATIVE(ch) (__ASSERT_(FITS_IN_8_BITS(ch)) PL_a2e[(U8)(ch)]) | |
59a449d5 | 147 | |
e9b19ab7 KW |
148 | /* Use these on bytes */ |
149 | #define NATIVE_UTF8_TO_I8(b) (__ASSERT_(FITS_IN_8_BITS(b)) PL_e2utf[(U8)(b)]) | |
150 | #define I8_TO_NATIVE_UTF8(b) (__ASSERT_(FITS_IN_8_BITS(b)) PL_utf2e[(U8)(b)]) | |
59a449d5 | 151 | |
bc3632a8 | 152 | /* Transforms in wide UV chars */ |
4c8cd605 KW |
153 | #define NATIVE_TO_UNI(ch) (FITS_IN_8_BITS(ch) ? NATIVE_TO_LATIN1(ch) : (UV) (ch)) |
154 | #define UNI_TO_NATIVE(ch) (FITS_IN_8_BITS(ch) ? LATIN1_TO_NATIVE(ch) : (UV) (ch)) | |
bc3632a8 | 155 | |
111e8ed9 KW |
156 | /* How wide can a single UTF-8 encoded character become in bytes. */ |
157 | /* NOTE: Strictly speaking Perl's UTF-8 should not be called UTF-8 since UTF-8 | |
158 | * is an encoding of Unicode, and Unicode's upper limit, 0x10FFFF, can be | |
159 | * expressed with 5 bytes. However, Perl thinks of UTF-8 as a way to encode | |
c0236afe KW |
160 | * non-negative integers in a binary format, even those above Unicode. 14 is |
161 | * the smallest number of bytes that covers a full 64-bit word. | |
162 | * | |
163 | * WARNING: This number must be in sync with the value in | |
164 | * regen/charset_translations.pl. */ | |
165 | #define UTF8_MAXBYTES 14 | |
111e8ed9 | 166 | |
1d72bdf6 | 167 | /* |
c0236afe | 168 | The following table is adapted from tr16; it shows the I8 encoding of Unicode code points.
1d72bdf6 | 169 | |
c0236afe | 170 | Unicode U32 Bit pattern 1st Byte 2nd Byte 3rd Byte 4th Byte 5th Byte 6th Byte 7th Byte |
1d72bdf6 NIS |
171 | U+0000..U+007F 000000000xxxxxxx 0xxxxxxx |
172 | U+0080..U+009F 00000000100xxxxx 100xxxxx | |
1d72bdf6 NIS |
173 | U+00A0..U+03FF 000000yyyyyxxxxx 110yyyyy 101xxxxx |
174 | U+0400..U+3FFF 00zzzzyyyyyxxxxx 1110zzzz 101yyyyy 101xxxxx | |
175 | U+4000..U+3FFFF 0wwwzzzzzyyyyyxxxxx 11110www 101zzzzz 101yyyyy 101xxxxx | |
176 | U+40000..U+3FFFFF 0vvwwwwwzzzzzyyyyyxxxxx 111110vv 101wwwww 101zzzzz 101yyyyy 101xxxxx | |
177 | U+400000..U+3FFFFFF 0uvvvvvwwwwwzzzzzyyyyyxxxxx 1111110u 101vvvvv 101wwwww 101zzzzz 101yyyyy 101xxxxx | |
c0236afe | 178 | U+4000000..U+3FFFFFFF 00uuuuuvvvvvwwwwwzzzzzyyyyyxxxxx 11111110 101uuuuu 101vvvvv 101wwwww 101zzzzz 101yyyyy 101xxxxx |
1d72bdf6 | 179 | |
c0236afe KW |
180 | Beyond this, Perl uses an incompatible extension, similar to the one used in |
181 | regular UTF-8. There are now 14 bytes. A full 32 bits of information thus looks like this: | |
182 | 1st Byte 2nd-7th 8th Byte 9th Byte 10th B 11th B 12th B 13th B 14th B | |
183 | U+40000000..U+FFFFFFFF ttuuuuuvvvvvwwwwwzzzzzyyyyyxxxxx 11111111 10100000 101000tt 101uuuuu 101vvvvv 101wwwww 101zzzzz 101yyyyy 101xxxxx | |
1d72bdf6 | 184 | |
c0236afe KW |
185 | For 32-bit words, the 2nd through 7th bytes effectively function as leading |
186 | zeros. Above 32 bits, these fill up, with each byte yielding 5 bits of | |
187 | information, so that with 13 continuation bytes, we can handle 65 bits, just | |
188 | above what a 64 bit word can hold */ | |
1d72bdf6 | 189 | |
1ff3baa2 | 190 | |
97d0ceda | 191 | /* This is a fundamental property of UTF-EBCDIC */ |
2d1545e5 | 192 | #define OFFUNI_IS_INVARIANT(c) (((UV)(c)) < 0xA0) |
530495eb | 193 | |
38953e5a KW |
194 | /* It turns out that on EBCDIC platforms, the invariants are the characters |
195 | * that have ASCII equivalents, plus the C1 controls. Since the C0 controls | |
196 | * and DELETE are ASCII, this is the same as: (isASCII(uv) || isCNTRL_L1(uv)) | |
197 | * */ | |
198 | #define UVCHR_IS_INVARIANT(uv) cBOOL(FITS_IN_8_BITS(uv) \ | |
199 | && (PL_charclass[(U8) (uv)] & (_CC_mask(_CC_ASCII) | _CC_mask(_CC_CNTRL)))) | |
200 | ||
e4fd7312 KW |
201 | /* UTF-EBCDIC semantic macros - We used to transform back into I8 and then |
202 | * compare, but now only have to do a single lookup by using a bit in | |
203 | * l1_char_class_tab.h. | |
15824458 | 204 | * Comments as to the meaning of each are given at their corresponding utf8.h |
1ff3baa2 | 205 | * definitions. */ |
0447e8df | 206 | |
e4fd7312 | 207 | #define UTF8_IS_START(c) _generic_isCC(c, _CC_UTF8_IS_START) |
858cd8ab KW |
208 | |
209 | #define UTF_IS_CONTINUATION_MASK 0xE0 | |
210 | ||
e4fd7312 KW |
211 | #define UTF8_IS_CONTINUATION(c) _generic_isCC(c, _CC_UTF8_IS_CONTINUATION) |
212 | ||
858cd8ab KW |
213 | /* The above instead could be written as this: |
214 | #define UTF8_IS_CONTINUATION(c) \ | |
215 | ((NATIVE_UTF8_TO_I8(c) & UTF_IS_CONTINUATION_MASK) \ | |
216 | == UTF_CONTINUATION_MARK) | |
217 | */ | |
218 | ||
e4fd7312 KW |
219 | /* Equivalent to ! UVCHR_IS_INVARIANT(c) for any c that fits in 8 bits; false for wider c */ |
220 | #define UTF8_IS_CONTINUED(c) cBOOL(FITS_IN_8_BITS(c) \ | |
221 | && ! (PL_charclass[(U8) (c)] & (_CC_mask(_CC_ASCII) | _CC_mask(_CC_CNTRL)))) | |
222 | ||
223 | #define UTF8_IS_DOWNGRADEABLE_START(c) _generic_isCC(c, \ | |
224 | _CC_UTF8_IS_DOWNGRADEABLE_START) | |
225 | ||
226 | /* Equivalent to (UTF8_IS_START(c) && ! UTF8_IS_DOWNGRADEABLE_START(c)) | |
227 | * Makes sure that the START bit is set and the DOWNGRADEABLE bit isn't */ | |
228 | #define UTF8_IS_ABOVE_LATIN1(c) cBOOL(FITS_IN_8_BITS(c) \ | |
229 | && ((PL_charclass[(U8) (c)] & ( _CC_mask(_CC_UTF8_IS_START) \ | |
230 | |_CC_mask(_CC_UTF8_IS_DOWNGRADEABLE_START))) \ | |
231 | == _CC_mask(_CC_UTF8_IS_START))) | |
1d72bdf6 | 232 | |
5d5376e2 KW |
233 | #define isUTF8_POSSIBLY_PROBLEMATIC(c) \ |
234 | _generic_isCC(c, _CC_UTF8_START_BYTE_IS_FOR_AT_LEAST_SURROGATE) | |
235 | ||
1d72bdf6 | 236 | #define UTF_CONTINUATION_MARK 0xA0 |
1d72bdf6 NIS |
237 | #define UTF_ACCUMULATION_SHIFT 5 |
238 | ||
0ed2b00b KW |
239 | /* ^? is defined to be APC on EBCDIC systems. See the definition of toCTRL() |
240 | * for more */ | |
241 | #define QUESTION_MARK_CTRL LATIN1_TO_NATIVE(0x9F) | |
242 | ||
e9a8c099 | 243 | /* |
14d04a33 | 244 | * ex: set ts=8 sts=4 sw=4 et: |
e9a8c099 | 245 | */ |