Commit | Line | Data |
---|---|---|
1d72bdf6 NIS |
1 | /* utfebcdic.h |
2 | * | |
2eee27d7 SS |
3 | * Copyright (C) 2001, 2002, 2003, 2005, 2006, 2007, 2009, |
4 | * 2010, 2011 by Larry Wall, Nick Ing-Simmons, and others | |
1d72bdf6 NIS |
5 | * |
6 | * You may distribute under the terms of either the GNU General Public | |
7 | * License or the Artistic License, as specified in the README file. | |
8 | * | |
9 | * Macros to implement UTF-EBCDIC as perl's internal encoding | |
97237291 | 10 | * Adapted from version 7.1 of Unicode Technical Report #16: |
1d72bdf6 | 11 | * http://www.unicode.org/unicode/reports/tr16 |
fe749c9a KW |
12 | * |
13 | * To summarize, the way it works is: | |
c229c178 KW |
14 | * To convert an EBCDIC code point to UTF-EBCDIC: |
15 | * 1) convert to Unicode. No conversion is necessary for code points above | |
16 | * 255, as Unicode and EBCDIC are identical in this range. For smaller | |
17 | * code points, the conversion is done by lookup in the PL_e2a table (with | |
18 | * inverse PL_a2e) in the generated file 'ebcdic_tables.h'. The 'a' | |
19 | * stands for ASCII platform, meaning 0-255 Unicode. | |
97237291 | 20 | * 2) convert that to a utf8-like string called I8 ('I' stands for |
d06134e5 KW |
21 | * intermediate) with variant characters occupying multiple bytes. This |
22 | * step is similar to the utf8-creating step from Unicode, but the details | |
23 | * are different. This transformation is called UTF8-Mod. There is a | |
24 | * chart about the bit patterns in a comment later in this file. But | |
fe749c9a KW |
25 | * essentially here are the differences: |
26 | * UTF8 I8 | |
27 | * invariant byte starts with 0 starts with 0 or 100 | |
28 | * continuation byte starts with 10 starts with 101 | |
29 | * start byte same in both: if the code point requires N bytes, | |
30 | * then the leading N bits are 1, followed by a 0. (No | |
31 | * trailing 0 for the very largest possible allocation | |
32 | * in I8, far beyond the current Unicode standard's | |
33 | * max, as shown in the comment later in this file.) | |
97237291 KW |
34 | * 3) Use the algorithm in tr16 to convert each byte from step 2 into |
35 | * final UTF-EBCDIC. This is done by table lookup from a table | |
4bc3dcfa | 36 | * constructed from the algorithm, reproduced in ebcdic_tables.h as |
97237291 KW |
37 | * PL_utf2e, with its inverse being PL_e2utf. They are constructed so that |
38 | * all EBCDIC invariants remain invariant, but no others do, and the first | |
39 | * byte of a variant will always have its upper bit set. But note that | |
40 | * the upper bit of some invariants is also 1. | |
41 | * | |
42 | * For example, the ordinal value of 'A' is 193 in EBCDIC, and also is 193 in | |
43 | * UTF-EBCDIC. Step 1) converts it to 65, Step 2 leaves it at 65, and Step 3 | |
44 | * converts it back to 193. As an example of how a variant character works, | |
45 | * take LATIN SMALL LETTER Y WITH DIAERESIS, which is typically 0xDF in | |
46 | * EBCDIC. Step 1 converts it to the Unicode value, 0xFF. Step 2 converts | |
47 | * that to two bytes = 11000111 10111111 = C7 BF, and Step 3 converts those to | |
48 | * 0x8B 0x73. | |
45f80db9 | 49 | * |
fe749c9a KW |
50 | * If you're starting from Unicode, skip step 1. For UTF-EBCDIC to straight |
51 | * EBCDIC, reverse the steps. | |
52 | * | |
53 | * The EBCDIC invariants have been chosen to be those characters whose Unicode | |
54 | * equivalents have ordinal numbers less than 160, that is the same characters | |
55 | * that are expressible in ASCII, plus the C1 controls. So there are 160 | |
bc2161fd | 56 | * invariants instead of the 128 in UTF-8. |
fe749c9a KW |
57 | * |
58 | * The purpose of Step 3 is to make the encoding be invariant for the chosen | |
59 | * characters. This messes up the convenient patterns found in step 2, so | |
60 | * generally, one has to undo step 3 into a temporary to use them. However, | |
97237291 KW |
61 | * one "shadow", or parallel table, PL_utf8skip, has been constructed that |
62 | * doesn't require undoing things. It is such that for each byte, it says | |
63 | * how long the sequence is if that (UTF-EBCDIC) byte were to begin it. | |
64 | * | |
65 | * There are actually 3 slightly different UTF-EBCDIC encodings in | |
4bc3dcfa | 66 | * ebcdic_tables.h, one for each of the code pages recognized by Perl. That |
97237291 KW |
67 | * means that there are actually three different sets of tables, one for each |
68 | * code page. (If Perl is compiled on platforms using another EBCDIC code | |
69 | * page, it may not compile, or Perl may silently mistake it for one of the | |
70 | * three.) | |
fe749c9a | 71 | * |
97237291 KW |
72 | * Note that tr16 actually only specifies one version of UTF-EBCDIC, based on |
73 | * the 1047 encoding, and which is supposed to be used for all code pages. | |
74 | * But this doesn't work. To illustrate the problem, consider the '^' character. | |
75 | * On a 037 code page it is the single byte 176, whereas under 1047 UTF-EBCDIC | |
76 | * it is the single byte 95. If Perl implemented tr16 exactly, it would mean | |
77 | * that changing a string containing '^' to UTF-EBCDIC would change that '^' | |
78 | * from 176 to 95 (and vice-versa), violating the rule that ASCII-range | |
79 | * characters are the same in UTF-8 or not. Much code in Perl assumes this | |
80 | * rule. See for example | |
81 | * http://grokbase.com/t/perl/mvs/025xf0yhmn/utf-ebcdic-for-posix-bc-malformed-utf-8-character | |
82 | * What Perl does is create a version of UTF-EBCDIC suited to each code page; | |
83 | * the one for the 1047 code page is identical to what's specified in tr16. | |
84 | * This complicates interchanging files between computers using different code | |
85 | * pages. Best is to convert to I8 before sending them, as the I8 | |
86 | * representation is the same no matter what the underlying code page is. | |
fe749c9a | 87 | * |
ff982d00 KW |
88 | * Because of the way UTF-EBCDIC is constructed, the lowest 32 code points that |
89 | * aren't equivalent to ASCII characters nor C1 controls form the set of | |
90 | * continuation bytes; the remaining 64 non-ASCII, non-control code points form | |
91 | * the potential start bytes, in order. (However, the first 5 of these lead to | |
80bfb4dc KW |
92 | * malformed overlongs, so there really are only 59 start bytes, and the first |
93 | * three of the 59 are the start bytes for the Latin1 range.) Hence the | |
ff982d00 KW |
94 | * UTF-EBCDIC for the smallest variant code point, 0xA0, will likely have 0x41 |
95 | * as its continuation byte, provided 0x41 isn't an ASCII or C1 equivalent. | |
96 | * And its start byte will be the code point that is 37 (32+5) non-ASCII, | |
97 | * non-control code points past it. (0 - 3F are controls, and 40 is SPACE, | |
98 | * leaving 41 as the first potentially available one.) In contrast, on ASCII | |
99 | * platforms, the first 64 (not 32) non-ASCII code points are the continuation | |
100 | * bytes. And the first 2 (not 5) potential start bytes form overlong | |
101 | * malformed sequences. | |
102 | * | |
fe749c9a KW |
103 | * EBCDIC characters above 0xFF are the same as Unicode in Perl's |
104 | * implementation of all 3 encodings, so for those Step 1 is trivial. | |
105 | * | |
106 | * (Note that the entries for invariant characters are necessarily the same in | |
97237291 | 107 | * PL_e2a and PL_e2utf; likewise for their inverses.) |
fe749c9a KW |
108 | * |
109 | * UTF-EBCDIC strings are the same length or longer than UTF-8 representations | |
110 | * of the same string. The maximum code point representable as 2 bytes in | |
111 | * UTF-EBCDIC is 0x3FF, instead of 0x7FF in UTF-8. | |
1d72bdf6 NIS |
112 | */ |
113 | ||
114 | START_EXTERN_C | |
115 | ||
116 | #ifdef DOINIT | |
f5e1abaf | 117 | |
4bc3dcfa | 118 | #include "ebcdic_tables.h" |
44f2fc15 | 119 | |
1d72bdf6 | 120 | #else |
f466f02a KW |
121 | EXTCONST U8 PL_utf8skip[]; |
122 | EXTCONST U8 PL_e2utf[]; | |
123 | EXTCONST U8 PL_utf2e[]; | |
124 | EXTCONST U8 PL_e2a[]; | |
125 | EXTCONST U8 PL_a2e[]; | |
126 | EXTCONST U8 PL_fold[]; | |
127 | EXTCONST U8 PL_fold_latin1[]; | |
128 | EXTCONST U8 PL_latin1_lc[]; | |
129 | EXTCONST U8 PL_mod_latin1_uc[]; | |
1d72bdf6 NIS |
130 | #endif |
131 | ||
132 | END_EXTERN_C | |
133 | ||
1e54db1a | 134 | /* EBCDIC-happy ways of converting native code to UTF-8 */ |
1d72bdf6 | 135 | |
e9b19ab7 KW |
136 | /* Use these when ch is known to be < 256 */ |
137 | #define NATIVE_TO_LATIN1(ch) (__ASSERT_(FITS_IN_8_BITS(ch)) PL_e2a[(U8)(ch)]) | |
138 | #define LATIN1_TO_NATIVE(ch) (__ASSERT_(FITS_IN_8_BITS(ch)) PL_a2e[(U8)(ch)]) | |
59a449d5 | 139 | |
e9b19ab7 KW |
140 | /* Use these on bytes */ |
141 | #define NATIVE_UTF8_TO_I8(b) (__ASSERT_(FITS_IN_8_BITS(b)) PL_e2utf[(U8)(b)]) | |
142 | #define I8_TO_NATIVE_UTF8(b) (__ASSERT_(FITS_IN_8_BITS(b)) PL_utf2e[(U8)(b)]) | |
59a449d5 | 143 | |
bc3632a8 | 144 | /* Transforms in wide UV chars */ |
8dcc0f39 KW |
145 | #define NATIVE_TO_UNI(ch) (FITS_IN_8_BITS(ch) ? NATIVE_TO_LATIN1(ch) : (ch)) |
146 | #define UNI_TO_NATIVE(ch) (FITS_IN_8_BITS(ch) ? LATIN1_TO_NATIVE(ch) : (ch)) | |
bc3632a8 | 147 | |
1d72bdf6 | 148 | /* |
d06134e5 | 149 | The following table, adapted from tr16, shows the I8 encoding of Unicode code points. |
1d72bdf6 | 150 | |
80bfb4dc | 151 | Unicode U32 Bit pattern 1st Byte 2nd Byte 3rd Byte 4th Byte 5th Byte 6th Byte 7th byte |
1d72bdf6 NIS |
152 | U+0000..U+007F 000000000xxxxxxx 0xxxxxxx |
153 | U+0080..U+009F 00000000100xxxxx 100xxxxx | |
1d72bdf6 NIS |
154 | U+00A0..U+03FF 000000yyyyyxxxxx 110yyyyy 101xxxxx |
155 | U+0400..U+3FFF 00zzzzyyyyyxxxxx 1110zzzz 101yyyyy 101xxxxx | |
156 | U+4000..U+3FFFF 0wwwzzzzzyyyyyxxxxx 11110www 101zzzzz 101yyyyy 101xxxxx | |
157 | U+40000..U+3FFFFF 0vvwwwwwzzzzzyyyyyxxxxx 111110vv 101wwwww 101zzzzz 101yyyyy 101xxxxx | |
158 | U+400000..U+3FFFFFF 0uvvvvvwwwwwzzzzzyyyyyxxxxx 1111110u 101vvvvv 101wwwww 101zzzzz 101yyyyy 101xxxxx | |
159 | U+4000000..U+7FFFFFFF 0tuuuuuvvvvvwwwwwzzzzzyyyyyxxxxx 1111111t 101uuuuu 101vvvvv 101wwwww 101zzzzz 101yyyyy 101xxxxx | |
160 | ||
d06134e5 | 161 | Note: The I8 transformation is valid for UCS-4 values X'0' to |
1d72bdf6 NIS |
162 | X'7FFFFFFF' (the full extent of ISO/IEC 10646 coding space). |
163 | ||
164 | */ | |
165 | ||
5aaebcb3 KW |
166 | /* Input is a true Unicode (not-native) code point */ |
167 | #define OFFUNISKIP(uv) ( (uv) < 0xA0 ? 1 : \ | |
1ff3baa2 KW |
168 | (uv) < 0x400 ? 2 : \ |
169 | (uv) < 0x4000 ? 3 : \ | |
170 | (uv) < 0x40000 ? 4 : \ | |
171 | (uv) < 0x400000 ? 5 : \ | |
172 | (uv) < 0x4000000 ? 6 : 7 ) | |
173 | ||
2d1545e5 | 174 | #define OFFUNI_IS_INVARIANT(c) (((UV)(c)) < 0xA0) |
530495eb | 175 | |
38953e5a KW |
176 | /* It turns out that on EBCDIC platforms, the invariants are the characters |
177 | * that have ASCII equivalents, plus the C1 controls. Since the C0 controls | |
178 | * and DELETE are ASCII, this is the same as: (isASCII(uv) || isCNTRL_L1(uv)) | |
179 | * */ | |
180 | #define UVCHR_IS_INVARIANT(uv) cBOOL(FITS_IN_8_BITS(uv) \ | |
181 | && (PL_charclass[(U8) (uv)] & (_CC_mask(_CC_ASCII) | _CC_mask(_CC_CNTRL)))) | |
182 | ||
5352a763 KW |
183 | #define UVCHR_SKIP(uv) (UVCHR_IS_INVARIANT(uv) ? 1 : \ |
184 | (uv) < 0x400 ? 2 : \ | |
185 | (uv) < 0x4000 ? 3 : \ | |
186 | (uv) < 0x40000 ? 4 : \ | |
187 | (uv) < 0x400000 ? 5 : \ | |
188 | (uv) < 0x4000000 ? 6 : 7 ) | |
38953e5a | 189 | |
e4fd7312 KW |
190 | /* UTF-EBCDIC semantic macros - We used to transform back into I8 and then |
191 | * compare, but now only have to do a single lookup by using a bit in | |
192 | * l1_char_class_tab.h. | |
15824458 | 193 | * Comments as to the meaning of each are given at their corresponding utf8.h |
1ff3baa2 | 194 | * definitions. */ |
0447e8df | 195 | |
e4fd7312 KW |
196 | #define UTF8_IS_START(c) _generic_isCC(c, _CC_UTF8_IS_START) |
197 | #define UTF8_IS_CONTINUATION(c) _generic_isCC(c, _CC_UTF8_IS_CONTINUATION) | |
198 | ||
199 | /* Equivalent to ! UVCHR_IS_INVARIANT(c) for c that fits in 8 bits (false otherwise) */ | |
200 | #define UTF8_IS_CONTINUED(c) cBOOL(FITS_IN_8_BITS(c) \ | |
201 | && ! (PL_charclass[(U8) (c)] & (_CC_mask(_CC_ASCII) | _CC_mask(_CC_CNTRL)))) | |
202 | ||
203 | #define UTF8_IS_DOWNGRADEABLE_START(c) _generic_isCC(c, \ | |
204 | _CC_UTF8_IS_DOWNGRADEABLE_START) | |
205 | ||
206 | /* Equivalent to (UTF8_IS_START(c) && ! UTF8_IS_DOWNGRADEABLE_START(c)) | |
207 | * Makes sure that the START bit is set and the DOWNGRADEABLE bit isn't */ | |
208 | #define UTF8_IS_ABOVE_LATIN1(c) cBOOL(FITS_IN_8_BITS(c) \ | |
209 | && ((PL_charclass[(U8) (c)] & ( _CC_mask(_CC_UTF8_IS_START) \ | |
210 | |_CC_mask(_CC_UTF8_IS_DOWNGRADEABLE_START))) \ | |
211 | == _CC_mask(_CC_UTF8_IS_START))) | |
1d72bdf6 | 212 | |
5d5376e2 KW |
213 | #define isUTF8_POSSIBLY_PROBLEMATIC(c) \ |
214 | _generic_isCC(c, _CC_UTF8_START_BYTE_IS_FOR_AT_LEAST_SURROGATE) | |
215 | ||
ee372ee9 KW |
216 | /* Can't exceed 7 on EBCDIC platforms */ |
217 | #define UTF_START_MARK(len) (0xFF & (0xFE << (7-(len)))) | |
218 | ||
22901f30 | 219 | #define UTF_START_MASK(len) (((len) >= 6) ? 0x01 : (0x1F >> ((len)-2))) |
1d72bdf6 NIS |
220 | #define UTF_CONTINUATION_MARK 0xA0 |
221 | #define UTF_CONTINUATION_MASK ((U8)0x1f) | |
222 | #define UTF_ACCUMULATION_SHIFT 5 | |
223 | ||
03c76984 KW |
224 | /* How wide can a single UTF-8 encoded character become in bytes. */ |
225 | /* NOTE: Strictly speaking Perl's UTF-8 should not be called UTF-8 since UTF-8 | |
226 | * is an encoding of Unicode, and Unicode's upper limit, 0x10FFFF, can be | |
227 | * expressed with 5 bytes. However, Perl thinks of UTF-8 as a way to encode | |
228 | * non-negative integers in a binary format, even those above Unicode */ | |
229 | #define UTF8_MAXBYTES 7 | |
230 | ||
231 | /* The maximum number of UTF-8 bytes a single Unicode character can | |
232 | * uppercase/lowercase/fold into. Unicode guarantees that the maximum | |
233 | * expansion is 3 characters. On EBCDIC platforms, the highest Unicode | |
234 | * character occupies 5 bytes, therefore this number is 15 */ | |
235 | #define UTF8_MAXBYTES_CASE 15 | |
236 | ||
0ed2b00b KW |
237 | /* ^? is defined to be APC on EBCDIC systems. See the definition of toCTRL() |
238 | * for more */ | |
239 | #define QUESTION_MARK_CTRL LATIN1_TO_NATIVE(0x9F) | |
240 | ||
843a4590 KW |
241 | #define MAX_UTF8_TWO_BYTE 0x3FF |
242 | ||
e9a8c099 | 243 | /* |
14d04a33 | 244 | * ex: set ts=8 sts=4 sw=4 et: |
e9a8c099 | 245 | */ |