Commit | Line | Data |
---|---|---|
6ff677df KW |
1 | #!/usr/bin/perl -w |
2 | use strict; | |
3 | use warnings; | |
4 | ||
c0236afe KW |
5 | # WARNING: This must be kept in sync with the UTF8_MAXBYTES value in |
6 | # utfebcdic.h | |
7 | $CHARSET_TRANSLATIONS::UTF_EBCDIC_MAXBYTES = 14; | |
8 | ||
6ff677df KW |
9 | # Utilities for various character set issues. Currently handles ASCII and |
10 | # EBCDIC only. It is trivial to add support for a new EBCDIC code page (provided | |
11 | # its variant character signature differs from that of every existing one, and | |
12 | # no other glitches arise): just add a mapping table to | |
13 | # %ebcdic_translations and regen everything that uses this. | |
14 | ||
15 | my %ebcdic_translations = ( | |
16 | # Keys are code page names; values are arrays that map ASCII ordinals to | |
17 | # the code page's ordinals | |
18 | ||
19 | 'EBCDIC 1047' => | |
20 | [ 0x00, 0x01, 0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, 0x16, 0x05, 0x15, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, | |
21 | 0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, 0x18, 0x19, 0x3F, 0x27, 0x1C, 0x1D, 0x1E, 0x1F, | |
22 | 0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61, | |
23 | 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0x7A, 0x5E, 0x4C, 0x7E, 0x6E, 0x6F, | |
24 | 0x7C, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, | |
25 | 0xD7, 0xD8, 0xD9, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xAD, 0xE0, 0xBD, 0x5F, 0x6D, | |
26 | 0x79, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, | |
27 | 0x97, 0x98, 0x99, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xC0, 0x4F, 0xD0, 0xA1, 0x07, | |
28 | 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x06, 0x17, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x09, 0x0A, 0x1B, | |
29 | 0x30, 0x31, 0x1A, 0x33, 0x34, 0x35, 0x36, 0x08, 0x38, 0x39, 0x3A, 0x3B, 0x04, 0x14, 0x3E, 0xFF, | |
30 | 0x41, 0xAA, 0x4A, 0xB1, 0x9F, 0xB2, 0x6A, 0xB5, 0xBB, 0xB4, 0x9A, 0x8A, 0xB0, 0xCA, 0xAF, 0xBC, | |
31 | 0x90, 0x8F, 0xEA, 0xFA, 0xBE, 0xA0, 0xB6, 0xB3, 0x9D, 0xDA, 0x9B, 0x8B, 0xB7, 0xB8, 0xB9, 0xAB, | |
32 | 0x64, 0x65, 0x62, 0x66, 0x63, 0x67, 0x9E, 0x68, 0x74, 0x71, 0x72, 0x73, 0x78, 0x75, 0x76, 0x77, | |
33 | 0xAC, 0x69, 0xED, 0xEE, 0xEB, 0xEF, 0xEC, 0xBF, 0x80, 0xFD, 0xFE, 0xFB, 0xFC, 0xBA, 0xAE, 0x59, | |
34 | 0x44, 0x45, 0x42, 0x46, 0x43, 0x47, 0x9C, 0x48, 0x54, 0x51, 0x52, 0x53, 0x58, 0x55, 0x56, 0x57, | |
35 | 0x8C, 0x49, 0xCD, 0xCE, 0xCB, 0xCF, 0xCC, 0xE1, 0x70, 0xDD, 0xDE, 0xDB, 0xDC, 0x8D, 0x8E, 0xDF | |
36 | ], | |
37 | ||
01ffcbd4 KW |
38 | # 'EBCDIC POSIX-BC' => |
39 | # [ | |
40 | # 0x00, 0x01, 0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, 0x16, 0x05, 0x15, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, | |
41 | # 0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, 0x18, 0x19, 0x3F, 0x27, 0x1C, 0x1D, 0x1E, 0x1F, | |
42 | # 0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61, | |
43 | # 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0x7A, 0x5E, 0x4C, 0x7E, 0x6E, 0x6F, | |
44 | # 0x7C, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, | |
45 | # 0xD7, 0xD8, 0xD9, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xBB, 0xBC, 0xBD, 0x6A, 0x6D, | |
46 | # 0x4A, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, | |
47 | # 0x97, 0x98, 0x99, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xFB, 0x4F, 0xFD, 0xFF, 0x07, | |
48 | # 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x06, 0x17, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x09, 0x0A, 0x1B, | |
49 | # 0x30, 0x31, 0x1A, 0x33, 0x34, 0x35, 0x36, 0x08, 0x38, 0x39, 0x3A, 0x3B, 0x04, 0x14, 0x3E, 0x5F, | |
50 | # 0x41, 0xAA, 0xB0, 0xB1, 0x9F, 0xB2, 0xD0, 0xB5, 0x79, 0xB4, 0x9A, 0x8A, 0xBA, 0xCA, 0xAF, 0xA1, | |
51 | # 0x90, 0x8F, 0xEA, 0xFA, 0xBE, 0xA0, 0xB6, 0xB3, 0x9D, 0xDA, 0x9B, 0x8B, 0xB7, 0xB8, 0xB9, 0xAB, | |
52 | # 0x64, 0x65, 0x62, 0x66, 0x63, 0x67, 0x9E, 0x68, 0x74, 0x71, 0x72, 0x73, 0x78, 0x75, 0x76, 0x77, | |
53 | # 0xAC, 0x69, 0xED, 0xEE, 0xEB, 0xEF, 0xEC, 0xBF, 0x80, 0xE0, 0xFE, 0xDD, 0xFC, 0xAD, 0xAE, 0x59, | |
54 | # 0x44, 0x45, 0x42, 0x46, 0x43, 0x47, 0x9C, 0x48, 0x54, 0x51, 0x52, 0x53, 0x58, 0x55, 0x56, 0x57, | |
55 | # 0x8C, 0x49, 0xCD, 0xCE, 0xCB, 0xCF, 0xCC, 0xE1, 0x70, 0xC0, 0xDE, 0xDB, 0xDC, 0x8D, 0x8E, 0xDF | |
56 | # ], | |
6ff677df KW |
57 | |
58 | 'EBCDIC 037' => | |
59 | [ | |
60 | 0x00, 0x01, 0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, 0x16, 0x05, 0x25, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, | |
61 | 0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, 0x18, 0x19, 0x3F, 0x27, 0x1C, 0x1D, 0x1E, 0x1F, | |
62 | 0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61, | |
63 | 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0x7A, 0x5E, 0x4C, 0x7E, 0x6E, 0x6F, | |
64 | 0x7C, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, | |
65 | 0xD7, 0xD8, 0xD9, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xBA, 0xE0, 0xBB, 0xB0, 0x6D, | |
66 | 0x79, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, | |
67 | 0x97, 0x98, 0x99, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xC0, 0x4F, 0xD0, 0xA1, 0x07, | |
68 | 0x20, 0x21, 0x22, 0x23, 0x24, 0x15, 0x06, 0x17, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x09, 0x0A, 0x1B, | |
69 | 0x30, 0x31, 0x1A, 0x33, 0x34, 0x35, 0x36, 0x08, 0x38, 0x39, 0x3A, 0x3B, 0x04, 0x14, 0x3E, 0xFF, | |
70 | 0x41, 0xAA, 0x4A, 0xB1, 0x9F, 0xB2, 0x6A, 0xB5, 0xBD, 0xB4, 0x9A, 0x8A, 0x5F, 0xCA, 0xAF, 0xBC, | |
71 | 0x90, 0x8F, 0xEA, 0xFA, 0xBE, 0xA0, 0xB6, 0xB3, 0x9D, 0xDA, 0x9B, 0x8B, 0xB7, 0xB8, 0xB9, 0xAB, | |
72 | 0x64, 0x65, 0x62, 0x66, 0x63, 0x67, 0x9E, 0x68, 0x74, 0x71, 0x72, 0x73, 0x78, 0x75, 0x76, 0x77, | |
73 | 0xAC, 0x69, 0xED, 0xEE, 0xEB, 0xEF, 0xEC, 0xBF, 0x80, 0xFD, 0xFE, 0xFB, 0xFC, 0xAD, 0xAE, 0x59, | |
74 | 0x44, 0x45, 0x42, 0x46, 0x43, 0x47, 0x9C, 0x48, 0x54, 0x51, 0x52, 0x53, 0x58, 0x55, 0x56, 0x57, | |
75 | 0x8C, 0x49, 0xCD, 0xCE, 0xCB, 0xCF, 0xCC, 0xE1, 0x70, 0xDD, 0xDE, 0xDB, 0xDC, 0x8D, 0x8E, 0xDF | |
76 | ], | |
77 | ); | |
78 | ||
79 | my $ascii_key = 'ASCII/Latin1'; | |
80 | ||
81 | my %I8_TO_NATIVE_UTF8; # Maps I8 UTF to final UTF-EBCDIC | |
82 | # See http://www.unicode.org/reports/tr16/ | |
83 | ||
sub get_supported_code_pages() {
    # Returns the names of the currently supported code pages as an ordered
    # list: ASCII is always element 0, the 1047 code page is element 1, and
    # the remaining code pages follow, sorted lexically by name.
    #
    # Side effect: the first call adds the ASCII entry to
    # %ebcdic_translations.

    # The ASCII "translation" is simply the identity map.
    $ebcdic_translations{$ascii_key} = [ 0 .. 255 ]
        if ! exists $ebcdic_translations{$ascii_key};

    # Comparator: ASCII outranks everything, then anything whose name
    # contains 1047, then plain string order.
    my $by_priority = sub {
        my ($left, $right) = @_;

        return -1 if $left  eq $ascii_key;
        return  1 if $right eq $ascii_key;
        return -1 if $left  =~ /1047/;
        return  1 if $right =~ /1047/;
        return $left cmp $right;
    };

    return sort { $by_priority->($a, $b) } keys %ebcdic_translations;
}
108 | ||
sub get_a2n($) {
    # Given the name of a code page, returns a reference to the array that
    # maps each ASCII ordinal to the corresponding ordinal in that code
    # page.  Dies if the name is not a key of %ebcdic_translations.

    my ($charset) = @_;

    die "Unknown character set '$charset'"
        unless exists $ebcdic_translations{$charset};

    return $ebcdic_translations{$charset};
}
121 | ||
sub get_I8_2_utf($) {
    # Returns a reference to the mapping array for I8 to code page
    # UTF-EBCDIC for the code page named by the input parameter.  This is
    # Table 2 of TR16 customized for the code page.  See utfebcdic.h for why,
    # contrary to TR16, it has to be code-page-specific.
    #
    # The table is generated on the first request for a code page and cached
    # in %I8_TO_NATIVE_UTF8.  It is assembled in a local array and stored in
    # the cache only once it is complete, so that a failure part way through
    # (caught by a caller's eval) cannot leave a truncated table behind to
    # be handed out by a later call.
    #
    # Dies if the code page is ASCII or unknown, if the code page's
    # translation table has no entry for one of the first 0xA0 code points,
    # or if two of those code points translate to the same value.

    my $charset = shift;

    die "I8 not a valid concept for ASCII" if $charset eq $ascii_key;
    die "'$charset' unknown" unless exists $ebcdic_translations{$charset};

    # Already generated: hand back the cached table.
    return $I8_TO_NATIVE_UTF8{$charset}
        if exists $I8_TO_NATIVE_UTF8{$charset};

    # Only needed by the commented-out debugging printf()s below.
    use charnames ();

    my $a2n = $ebcdic_translations{$charset};
    my @i8_to_native;

    # The code points not used for invariants.  Initialized to everything,
    # then entries are removed as we go along.
    my %unused_cps = map { $_ => 1 } 0 .. 255;

    # These are the invariants.  The output has them mapped to the original
    # EBCDIC code point.
    for my $i (0 .. 0x9F) {
        my $ebcdic_value = $a2n->[$i];
        die "'$charset' has no translation for code point $i"
            unless defined $ebcdic_value;

        #printf "$charset: using %02x which is %02x ascii, %s\n", $ebcdic_value, $i, charnames::viacode($i);
        $i8_to_native[$i] = $ebcdic_value;

        # Each EBCDIC value may be claimed only once; delete() returning
        # undef means it was already taken (or was never a valid byte).
        if (! defined delete $unused_cps{$ebcdic_value}) {
            die "Two code points map to $ebcdic_value; one is $i";
        }
    }

    # Put the unused code points in order
    my @unused_cps = sort { $a <=> $b } keys %unused_cps;

    # Fill in the rest of the map with these ordered code points, as TR16
    # specifies
    for my $i (0xA0 .. 255) {
        $i8_to_native[$i] = shift @unused_cps;
        #printf "$charset: filling in %02x which is %02x ascii, %s\n", $i8_to_native[$i], $i, charnames::viacode($i);
    }

    if (@unused_cps) {
        die "Left-over code points";
    }

    # Everything checked out; publish the finished table.
    $I8_TO_NATIVE_UTF8{$charset} = \@i8_to_native;

    return $I8_TO_NATIVE_UTF8{$charset};
}
172 | ||
{   # Closure

    # State shared by the two subs below.  Both are defined exactly while an
    # '#if' returned by get_conditional_compile_line_start() is waiting for
    # its '#endif'; this is what lets us check that the calls are matched.
    my $charset;    # Name of the code page of the currently open '#if'
    my $indent;     # Blanks placed between '#' and 'if' / 'endif'

    sub get_conditional_compile_line_start($;$) {
        # Returns the '#if' line to put into C code to compile for the code
        # page given by the first parameter.  The second parameter, if
        # present, is the indentation level, like '# if ...'
        #
        # Dies if the previous '#if' has not been closed with
        # get_conditional_compile_line_end(), if not running on an ASCII
        # platform, or if the code page is unknown.  The shared state is
        # updated only after the whole line has been built, so a call that
        # dies leaves no half-open '#if' behind to poison later calls.

        if (defined $charset || defined $indent) {
            die "Missing call to get_conditional_compile_line_end()"
        }

        my $new_charset  = shift;
        my $indent_level = shift // 0;

        # The character literals below are compared against ord() values
        # computed on this platform.
        die "This is designed to run only on an ASCII platform" unless ord("A") == 65;

        die "Unknown character set '$new_charset'" unless exists $ebcdic_translations{$new_charset};

        my $a2n = $ebcdic_translations{$new_charset};

        my $return = "";
        {
            no warnings 'qw';   # The '#' in the qw// list is deliberate
            my $count = -1;

            # We use all the typical variant characters to construct the
            # #if, so that it is unlikely that a different code page will
            # match this #if.  (The backslash entry comes out of qw// as two
            # backslashes, which is how it has to be spelled inside a C
            # character literal; ord() still sees a backslash.)
            my @variant_chars = qw/A \\\ [ ] { } ^ ~ ! # | $ @ `/;
            push @variant_chars, "\n";
            for my $char (@variant_chars) {
                my $ascii_ord  = ord $char;
                my $first_time = $return eq "";
                my $compare    = $a2n->[$ascii_ord];

                $return .= " && " unless $first_time;

                # Spell the newline as a C escape sequence
                $char = '\n' if $char eq "\n";
                die "Non-graphical character ord=" . ord($char)
                                                if $char !~ /[[:graph:]]/;
                $return .= "'$char' == $compare";
                $return .= " /* $new_charset */" if $first_time;

                # A single comparison is all that is emitted for ASCII
                last if $new_charset eq $ascii_key;

                # Break the line after the first comparison and after every
                # fifth one thereafter
                $count++;
                $return .= " \\\n " if $first_time || $count % 5 == 0;
            }
        }

        # Nothing can fail from here on, so record the now-open '#if'.
        $charset = $new_charset;
        $indent  = $indent_level == 0 ? "" : " " x $indent_level;

        return "#${indent}if $return\n";
    }

    sub get_conditional_compile_line_end () {
        # Returns the #endif for the currently open #if, and marks it
        # closed.  Dies if there is no open #if.

        die "Missing call to get_conditional_compile_line_start()"
            unless defined $charset && defined $indent;

        my $return = "#${indent}endif\t/* $charset */\n";
        undef $charset;
        undef $indent;
        return $return;
    }
}
242 | ||
sub _UTF_START_MASK($) {
    # Internal.  Given the total length in bytes of a multi-byte sequence,
    # returns the mask that cp_2_utfbytes() ANDs with the remaining code
    # point bits when forming the start byte: 0 for lengths of 7 or more,
    # otherwise 0x1F shifted right by (length - 2).

    my ($len) = @_;

    return 0x00 if $len >= 7;
    return 0x1F >> ($len - 2);
}
248 | ||
sub _UTF_START_MARK($) {
    # Internal.  Given the total length in bytes of a multi-byte sequence,
    # returns the marker bits that cp_2_utfbytes() ORs into the start byte:
    # 0xFF for lengths above 7, otherwise the low eight bits of 0xFE shifted
    # left by (7 - length).

    my ($len) = @_;

    return 0xFF if $len > 7;
    return (0xFE << (7 - $len)) & 0xFF;
}
254 | ||
sub cp_2_utfbytes($$) {
    # Returns a string consisting of the UTF-EBCDIC for the code page given
    # by the 2nd parameter, of the Unicode code point given by the first
    # parameter, using the UTF-MOD algorithm published in TR16.  (If the
    # "code page" is ASCII, straight UTF-8 is returned.)  Dies if the code
    # page is unknown.

    my ($ucp, $charset) = @_;

    # ASCII: let Perl produce the UTF-8 bytes itself.
    if ($charset eq $ascii_key) {
        my $str = chr $ucp;
        utf8::upgrade($str);
        utf8::encode($str);
        return $str;
    }

    die "Unknown character set '$charset'"
        unless exists $ebcdic_translations{$charset};

    # Code points below 0xA0 are single bytes, translated directly.
    return chr $ebcdic_translations{$charset}[$ucp] if $ucp < 0xA0;

    my $I8_2_utf = get_I8_2_utf($charset);

    # Number of bytes in the sequence, from the size of the code point.
    my $len;
    if    ($ucp < 0x400)      { $len = 2 }
    elsif ($ucp < 0x4000)     { $len = 3 }
    elsif ($ucp < 0x40000)    { $len = 4 }
    elsif ($ucp < 0x400000)   { $len = 5 }
    elsif ($ucp < 0x4000000)  { $len = 6 }
    elsif ($ucp < 0x40000000) { $len = 7 }
    else { $len = $CHARSET_TRANSLATIONS::UTF_EBCDIC_MAXBYTES }

    # Collect the I8 bytes last-to-first: each continuation byte carries
    # the next 5 low-order bits of the code point, marked with 0xA0.
    my @i8_bytes;
    for (2 .. $len) {
        push @i8_bytes, ($ucp & 0x1f) | 0xA0;
        $ucp >>= 5;
    }

    # What is left of the code point goes into the start byte.
    push @i8_bytes, ($ucp & _UTF_START_MASK($len)) | _UTF_START_MARK($len);

    # Restore the proper order and translate each I8 byte to the code
    # page's UTF-EBCDIC byte.
    return join "", map { chr $I8_2_utf->[$_] } reverse @i8_bytes;
}
300 | ||
301 | 1; |